In [None]:
import os
import gc
import numpy as np
import pandas as pd

from time import time
from time import ctime

import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm_notebook
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import lightgbm as lgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

def plotfig(ypred, yactual, strtitle, y_max):
    """Scatter predicted against actual values with an identity reference line.

    Parameters
    ----------
    ypred : array-like
        Predicted values, drawn on the x-axis.
    yactual : array-like
        Actual values, drawn on the y-axis. Flattened to 1-D before plotting.
    strtitle : str
        Title of the figure.
    y_max : float
        Upper limit applied to both axes and end point of the reference line.
    """
    # np.asarray accepts pandas objects as well as lists and ndarrays,
    # whereas `.values` only exists on pandas objects.
    actual = np.asarray(yactual).ravel()
    plt.scatter(ypred, actual)
    plt.title(strtitle)
    # Identity line y = x, drawn once. Passing lists of (x, y) tuples makes
    # matplotlib treat them as 2-D input and draw the same diagonal twice.
    plt.plot([0, y_max], [0, y_max])
    plt.xlim(0, y_max)
    plt.ylim(0, y_max)
    plt.xlabel('Predicted', fontsize=12)
    plt.ylabel('Actual', fontsize=12)
    plt.show()

In [None]:
# Alexander Lyubchenko - INGV_TSFresh_7730
# Both CSV files are ';'-separated; their 'Unnamed: 0' column becomes the row index.
train = pd.read_csv('../input/ingv-tsfresh-7730/train.csv', sep=';').set_index('Unnamed: 0')
test = pd.read_csv('../input/ingv-tsfresh-7730/test.csv', sep=';').set_index('Unnamed: 0')

In [None]:
# Print a summary of the training frame.
train.info()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Remember the test index now: `test` is rebuilt later, and this index is
# written out as the segment_id column of the submission.
test_index = test.index
# Zero-filled copy of the training frame, used only for feature ranking.
train_rf = train.copy().fillna(0)

In [None]:
# Preview the zero-filled training copy.
train_rf.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Train a simple model in order to estimate the importance of each column.
y = train_rf['time_to_eruption']
x = train_rf.drop(columns='time_to_eruption')
model = RandomForestRegressor(max_depth=7, random_state=1)
model.fit(x, y)

In [None]:
# Importance score of every column, sorted from most to least important.
importances = pd.Series(model.feature_importances_, index=x.columns)
feature_scores = importances.sort_values(ascending=False)
feature_scores

In [None]:
# Keep the 350 most important features (old method).
top_scores = feature_scores.iloc[:350]
selected_feature = top_scores.index
selected_feature

In [None]:
# Keep every feature whose importance score is at least 0.001.
is_important = feature_scores.ge(0.001)
selected_feature = feature_scores[is_important].index
selected_feature

In [None]:
# Set the target aside before `train` is replaced further down.
target = train['time_to_eruption']
# Stack the train and test sets into a single frame (train rows first).
frames = (train, test)
all_data = pd.concat(frames, ignore_index=True)
all_data.head()

In [None]:
# Redefine the dataset, keeping only the most important features plus the
# time_to_eruption column.
feature_part = all_data[selected_feature]
target_part = all_data['time_to_eruption']
all_data = pd.concat([feature_part, target_part], axis=1)
all_data.head()

In [None]:
# Function computing the share of missing values per column
def missing_values_table(df):
    """Summarise the columns of `df` that contain missing values.

    Prints the total number of columns and how many of them have missing
    values, then returns a DataFrame indexed by column name with the columns
    'Missing Values' (count) and '% of Total Values' (percentage of rows).
    Only columns whose percentage is non-zero are kept, sorted from the
    highest percentage to the lowest and rounded to one decimal.
    """
    # Count the nulls once and derive the percentage from that count.
    null_count = df.isnull().sum()
    null_percent = 100 * null_count / len(df)

    # Assemble the two series side by side, naming the columns directly.
    summary = pd.concat(
        [null_count, null_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Drop fully populated columns, then order by decreasing share of nulls.
    has_missing = summary['% of Total Values'] != 0
    summary = summary[has_missing]
    summary = summary.sort_values('% of Total Values', ascending=False)
    summary = summary.round(1)

    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {summary.shape[0]} columns that have missing values."
    )

    return summary

In [None]:
# Call missing_values_table() on the combined dataset and display the result.
missing_values = missing_values_table(all_data)
missing_values

In [None]:
# fillna() fills the missing values.
# .mode() returns a DataFrame with one row per modal value, so its first row
# is taken to get one fill value per column. Passing the DataFrame itself to
# fillna() aligns on row labels, so missing cells are not filled with the
# column's mode.
# The target column is excluded so that unlabeled rows keep a NaN target.
feature_modes = all_data.drop(columns='time_to_eruption').mode().iloc[0]
all_data = all_data.fillna(feature_modes)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the data, then write the scaled values back column by column.
scaler = StandardScaler()
header = all_data.columns
scaled_values = scaler.fit_transform(all_data)
all_data[header] = scaled_values
all_data.head()

In [None]:
# Remove the target column so only features remain, then show per-column variance.
all_data = all_data.drop(columns='time_to_eruption')
all_data.var()

In [None]:
# Pairwise correlation matrix of the feature columns.
all_data.corr()

In [None]:
# Check again for missing values now that the target column has been dropped.
missing_values = missing_values_table(all_data)
missing_values

In [None]:
# Fill any value that is still missing with the minimum of its column.
all_data = all_data.fillna(all_data.min())

In [None]:
from sklearn.decomposition import PCA

# Principal component analysis (dimensionality reduction).
# NOTE(review): PCA() is built without n_components, so presumably no
# component is discarded here - confirm whether a reduction was intended.
pca = PCA()
components = pca.fit_transform(all_data)
all_data_pca = pd.DataFrame(components)
all_data_pca.head()

In [None]:
# Split the PCA output back into train and test rows. The train rows come
# first because the two sets were concatenated in that order.
n_train = train.shape[0]
train = all_data_pca.iloc[:n_train]
test = all_data_pca.iloc[n_train:]

In [None]:
# Inputs for the (currently disabled) grid search below.
X, Y = train, target

# Candidate values per LightGBM hyper-parameter. The trailing comments keep
# other value lists, presumably from earlier search rounds.
grid_params = dict(
    num_leaves=[24, 25, 26],  # [7, 20, 30 ,50], [15, 20, 25]
    learning_rate=[0.04, 0.05, 0.06],  # [0.1, 0.03, 0.003], [0.05, 0.1, 0.15]
    max_depth=[4, 5, 6],  # [-1, 3, 5], [5, 7, 10]
    n_estimators=[1000, 1500, 2000],  # [50, 100, 200, 500],  [500, 700, 800, 1000]
)

#clf = GridSearchCV(lgb.LGBMRegressor(), grid_params, scoring='r2')
#clf.fit(X, Y)

#print("Best parameters set found on development set:")
#print(clf.best_params_)

# Parameter sets recorded by hand, presumably the best result of each search round:
## {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 500, 'num_leaves': 20}
## {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 1000, 'num_leaves': 25}
## {'learning_rate': 0.04, 'max_depth': 5, 'n_estimators': 2000, 'num_leaves': 26}

In [None]:
# Train the regressor on the full training set with hard-coded
# hyper-parameters (used in place of **clf.best_params_).
gbm = lgb.LGBMRegressor(
    learning_rate=0.04,
    max_depth=5,
    n_estimators=2000,
    num_leaves=26,
)
gbm.fit(train, target)

In [None]:
# Predict the target for the test rows, passing the iteration count that the
# fitted model reports as its best one.
best_iteration = gbm.best_iteration_
y_pred = gbm.predict(test, num_iteration=best_iteration)

In [None]:
# Build the submission table in one step and write it without the row index.
submission = pd.DataFrame({
    'segment_id': test_index,
    'time_to_eruption': y_pred,
})
submission.to_csv('submission.csv', header=True, index=False)

In [None]:
# eval
# `y_test` was never defined, and the test rows have no labels, so the
# predictions in `y_pred` cannot be scored. The RMSE is instead estimated by
# K-fold cross-validation on the training rows, refitting a model with the
# same hyper-parameters as `gbm` on each fold.
fold_rmse = []
splitter = KFold(n_splits=5, shuffle=True, random_state=1)
for fit_idx, val_idx in splitter.split(train):
    fold_model = lgb.LGBMRegressor(**gbm.get_params())
    # Positional indexing: `train` and `target` do not share index labels.
    fold_model.fit(train.iloc[fit_idx], target.iloc[fit_idx])
    fold_pred = fold_model.predict(train.iloc[val_idx])
    fold_rmse.append(mean_squared_error(target.iloc[val_idx], fold_pred) ** 0.5)
print('The rmse of prediction is:', np.mean(fold_rmse))