# import libraries and data

In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
%matplotlib inline
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVR, SVR
from sklearn.metrics import mean_absolute_error
pd.options.display.precision = 15
from sklearn.model_selection import train_test_split, KFold, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn import svm
import lightgbm as lgb
import xgboost as xgb
import time
import datetime
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import neighbors
from sklearn.metrics import mean_absolute_error
from sklearn import linear_model
import gc
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor

import scipy as sc
from scipy.signal import hilbert
from scipy.signal import hann
from scipy.signal import convolve
from scipy import stats
from sklearn.kernel_ridge import KernelRidge
from sklearn.neighbors import NearestNeighbors
import librosa, librosa.display
import builtins
from sklearn.ensemble import RandomForestRegressor
import eli5
import shap
from sklearn.feature_selection import GenericUnivariateSelect, SelectPercentile, SelectKBest, f_classif, mutual_info_classif, RFE

from IPython.display import HTML
import json
import altair as alt

import artgor_utils

import random
from joblib import Parallel, delayed

# Altair setup: wrap the text returned by artgor_utils.prepare_altair() in a
# <script> tag and render it as HTML (last expression of the cell).
workaround = artgor_utils.prepare_altair()
HTML("<script>" + workaround + "</script>")

In [None]:
import lightgbm as lgb
from tensorflow import keras
from gplearn.genetic import SymbolicRegressor
from catboost import Pool, CatBoostRegressor

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
from sklearn.feature_selection import RFECV, SelectFromModel

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import NuSVR, SVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor

In [None]:
# Training features and targets come in two parts (kernels "-0" and "-1").
# The target files are read without a header row.
train_X_0 = pd.read_csv("../input/lanl-masters-features-creating-0/train_X_features_865.csv")
train_X_1 = pd.read_csv("../input/lanl-masters-features-creating-1/train_X_features_865.csv")
y_0 = pd.read_csv("../input/lanl-masters-features-creating-0/train_y.csv", index_col=False,  header=None)
y_1 = pd.read_csv("../input/lanl-masters-features-creating-1/train_y.csv", index_col=False,  header=None)

# Test features; the segment id column is dropped so only features remain.
X_test = pd.read_csv("../input/lanl-masters-features-creating-0/test_X_features_10.csv").drop("seg_id", axis=1)

# Stack both parts row-wise and renumber the rows from zero.
X = pd.concat([train_X_0, train_X_1], axis=0).reset_index(drop=True)
y = pd.concat([y_0, y_1], axis=0).reset_index(drop=True)

# Standardise every training column; the test set reuses the statistics
# fitted on the training data.
train_columns = X.columns
scaler = StandardScaler()
X[train_columns] = scaler.fit_transform(X[train_columns])
X_test[train_columns] = scaler.transform(X_test[train_columns])

for frame in (X, y, X_test):
    print(frame.shape)

In [None]:
# Shared cross-validation splitter: 5 shuffled folds with a fixed seed.
# It is passed both to the grid searches (cv=folds) and to the training helper.
n_fold = 5
folds = KFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=2019,
)

# models

In [None]:
# LightGBM-style hyper-parameters. This dict is handed as `params` to the
# sklearn-type training calls below and is rebound to an empty dict before
# the CatBoost run.
params = dict(
    num_leaves=128,
    min_child_samples=79,
    objective='gamma',
    max_depth=-1,
    learning_rate=0.03,
    boosting_type="gbdt",
    subsample_freq=5,
    subsample=0.85,
    bagging_seed=11,
    metric='mae',
    verbosity=-1,
    reg_alpha=0.13,
    reg_lambda=0.36,
    colsample_bytree=1.0,
)

## 1. random forest

In [None]:
%%time
rfr = RandomForestRegressor()

# parameter_grid = {'n_estimators': [30, 50, 80, 100, 120, 160, 200], 'max_depth': [5, 10, 15]}
parameter_grid = {'n_estimators': [50, 100, 150], 'max_depth': [10]}

grid_search = GridSearchCV(rfr,
                           param_grid=parameter_grid,
                           cv=folds,
                           scoring='neg_mean_absolute_error',
                           n_jobs=12)
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Rebuild the random forest with the best grid-search parameters and run it
# through the shared training helper; the result is later indexed by
# 'scores', 'oof' and 'prediction'.
rfr = RandomForestRegressor(**grid_search.best_params_)
rfr_train_kwargs = dict(params=params, folds=folds, model_type='sklearn', model=rfr)
result_dict_rfr = artgor_utils.train_model_regression(X, X_test, y, **rfr_train_kwargs)

## 2. knn

In [None]:
%%time
knn = neighbors.KNeighborsRegressor()

# parameter_grid = {'n_neighbors': [10, 40, 60, 80, 100, 150],
#                 'weights':['uniform', 'distance']
# }
parameter_grid = {'n_neighbors': [30, 50, 80],
                 'weights':['distance']
                 }

grid_search = GridSearchCV(knn,
                           param_grid=parameter_grid,
                           cv=folds,
                           scoring='neg_mean_absolute_error',
                           n_jobs=12)
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Rebuild the KNN regressor with the best grid-search parameters and run it
# through the shared training helper.
knn = neighbors.KNeighborsRegressor(**grid_search.best_params_)
knn_train_kwargs = dict(params=params, folds=folds, model_type='sklearn', model=knn)
result_dict_knn = artgor_utils.train_model_regression(X, X_test, y, **knn_train_kwargs)

## 3. extra trees

In [None]:
%%time
etr = ExtraTreesRegressor()

# parameter_grid = {
#     'n_estimators': [500, 700, 1000, 1300],
#     'max_depth': [5, 10, 15]
# }
parameter_grid = {
    'n_estimators': [500, 1000],
    'max_depth': [10]
}

grid_search = GridSearchCV(rfr,
                           param_grid=parameter_grid,
                           cv=folds,
                           scoring='neg_mean_absolute_error',
                           n_jobs=12)
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Rebuild the extra-trees model with the best grid-search parameters and run
# it through the shared training helper.
etr = ExtraTreesRegressor(**grid_search.best_params_)
etr_train_kwargs = dict(params=params, folds=folds, model_type='sklearn', model=etr)
result_dict_etr = artgor_utils.train_model_regression(X, X_test, y, **etr_train_kwargs)

## 4. ada boost

In [None]:
%%time
adr = AdaBoostRegressor()

# parameter_grid = {
#     'n_estimators': [10, 50, 80, 100, 200],
#      'learning_rate':[0.01, 0.03, 0.1, 0.3]
# }

parameter_grid = {
    'n_estimators': [50, 100],
    'learning_rate':[0.03]
}

grid_search = GridSearchCV(adr,
                           param_grid=parameter_grid,
                           cv=folds,
                           scoring='neg_mean_absolute_error',
                           n_jobs=12)
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Rebuild the AdaBoost model with the best grid-search parameters and run it
# through the shared training helper.
adr = AdaBoostRegressor(**grid_search.best_params_)
adr_train_kwargs = dict(params=params, folds=folds, model_type='sklearn', model=adr)
result_dict_adr = artgor_utils.train_model_regression(X, X_test, y, **adr_train_kwargs)

## 5. NuSVR

In [None]:
%%time
nusvr = NuSVR()

# parameter_grid = {
#     'gamma': ['scale','auto'],
#     'nu': [0.5, 0.6, 0.7,0.8, 0.9],
#     'C': [1, 3, 5, 7,10],
#     'tol': [0.01, 0.003, 0.001]
# }
parameter_grid = {
    'gamma': ['auto'],
    'nu': [0.7],
    'C': [3],
    'tol': [0.001]
}

grid_search = GridSearchCV(nusvr,
                           param_grid=parameter_grid,
                           cv=folds,
                           scoring='neg_mean_absolute_error',
                           n_jobs=12)
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Rebuild the NuSVR with the best grid-search parameters and run it through
# the shared training helper.
nusvr = NuSVR(**grid_search.best_params_)
nusvr_train_kwargs = dict(params=params, folds=folds, model_type='sklearn', model=nusvr)
result_dict_nusvr = artgor_utils.train_model_regression(X, X_test, y, **nusvr_train_kwargs)

In [None]:
# Box plot comparing the 'scores' entries of the five sklearn-type models.
plt.figure(figsize=(12, 8))

scores_df = pd.DataFrame({'RandomForestRegressor': result_dict_rfr['scores']})
remaining_models = [
    ('KNN', result_dict_knn),
    ('ExtraTreesRegressor', result_dict_etr),
    ('AdaBoostRegressor', result_dict_adr),
    ('NuSVR', result_dict_nusvr),
]
for model_label, model_result in remaining_models:
    scores_df[model_label] = model_result['scores']

sns.boxplot(data=scores_df)
plt.xticks(rotation=45);

---

## 6. light gbm

In [None]:
%%time
lightgmb = lgb.LGBMRegressor()

# parameter_grid = {
#     'num_leaves': [64, 128, 256],
#     'min_child_samples': [32, 64],
#     'objective': ['gamma'],
#     'max_depth': [-1],
#     'learning_rate': [0.003, 0.01, 0.03, 0.1],
#     "boosting_type": ['gbdt'],
#     "subsample_freq": [5],
#     "subsample": [0.85],
#     "bagging_seed": [11],
#     "metric": ['mae'],
#     "verbosity": [-1],
#     'reg_alpha': [0.13],
#     'reg_lambda': [0.36],
#     'colsample_bytree': [0.2]
# }

parameter_grid = {
    'num_leaves': [128],
    'min_child_samples': [64],
    'objective': ['gamma'],
    'max_depth': [-1],
    'learning_rate': [0.03],
    "boosting_type": ['gbdt'],
    "subsample_freq": [5],
    "subsample": [0.85],
    "bagging_seed": [11],
    "metric": ['mae'],
    "verbosity": [-1],
    'reg_alpha': [0.1],
    'reg_lambda': [0.3],
    'colsample_bytree': [0.2]
}

grid_search = GridSearchCV(lightgmb,
                           param_grid=parameter_grid,
                           cv=folds,
                           scoring='neg_mean_absolute_error',
                           n_jobs=12)
grid_search.fit(X, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Train LightGBM through the shared helper using the best grid-search
# parameters directly (no estimator instance is passed for model_type='lgb').
lgb_train_kwargs = dict(
    X=X,
    X_test=X_test,
    y=y,
    params=grid_search.best_params_,
    folds=folds,
    model_type='lgb',
    eval_metric='mae',
    plot_feature_importance=False,
)
result_dict_lgb = artgor_utils.train_model_regression(**lgb_train_kwargs)

## 7. xgb

In [None]:
# XGBoost on the base features, trained through the shared helper.
# NOTE(review): 'gpu:reg:linear' and 'silent' look tied to older XGBoost
# releases, and 'gpu_hist' needs a GPU build - confirm against the installed
# version.
xgb_params = dict(
    eta=0.03,
    max_depth=16,
    subsample=0.85,
    colsample_bytree=0.3,
    objective='gpu:reg:linear',
    eval_metric='mae',
    silent=True,
    tree_method='gpu_hist',
)
result_dict_xgb = artgor_utils.train_model_regression(
    X=X,
    X_test=X_test,
    y=y,
    params=xgb_params,
    folds=folds,
    model_type='xgb',
)

## 8. cat boost

In [None]:
# CatBoost on the base features with an empty parameter dict.
# Note: this rebinds the module-level `params` used by earlier cells.
params = dict()
result_dict_cat = artgor_utils.train_model_regression(
    X=X,
    X_test=X_test,
    y=y,
    params=params,
    folds=folds,
    model_type='cat',
)

---

In [None]:
# Simple blend: unweighted mean of the eight base-model test predictions.
submission = pd.read_csv('../input/sample_submission.csv', index_col='seg_id')

blend_members = [
    result_dict_rfr, result_dict_knn, result_dict_etr, result_dict_adr,
    result_dict_nusvr, result_dict_lgb, result_dict_xgb, result_dict_cat,
]
# Accumulate left to right without in-place addition, so the stored
# prediction arrays are never mutated.
blend_total = blend_members[0]['prediction']
for blend_member in blend_members[1:]:
    blend_total = blend_total + blend_member['prediction']

submission['time_to_failure'] = blend_total / len(blend_members)
print(submission.head())
# To save the blend: submission.to_csv('average_blending.csv')

# stacking

## - create new feature sets

In [None]:
# Level-1 stacking inputs: one column per base model. Training rows use each
# model's 'oof' entry, test rows use its 'prediction' entry.
stack_columns = ['rfr', 'knn', 'etr', 'adr', 'nusvr', 'lgb', 'xgb', 'cat']
base_results = [
    result_dict_rfr, result_dict_knn, result_dict_etr, result_dict_adr,
    result_dict_nusvr, result_dict_lgb, result_dict_xgb, result_dict_cat,
]

train_stack = pd.DataFrame(
    np.vstack([base_result['oof'] for base_result in base_results]).T,
    columns=stack_columns,
)
test_stack = pd.DataFrame(
    np.vstack([base_result['prediction'] for base_result in base_results]).T,
    columns=stack_columns,
)

## - lgb stacking

In [None]:
%%time
lightgmb = lgb.LGBMRegressor()

# parameter_grid = {
#     'num_leaves': [8, 16],
#     'min_child_samples': [8, 16],
#     'objective': ['gamma'],
#     'max_depth': [-1],
#     'learning_rate': [0.01, 0.03, 0.1],
#     "boosting_type": ['gbdt'],
#     "subsample_freq": [5],
#     "subsample": [0.85],
#     "bagging_seed": [11],
#     "metric": ['mae'],
#     "verbosity": [-1],
#     'reg_alpha': [0.03, 0.1, 0.3],
# }

parameter_grid = {
    'num_leaves': [16],
    'min_child_samples': [16],
    'objective': ['gamma'],
    'max_depth': [4, 8],
    'learning_rate': [0.03],
    "boosting_type": ['gbdt'],
    "subsample_freq": [5],
    "subsample": [0.85],
    "bagging_seed": [11],
    "metric": ['mae'],
    "verbosity": [-1],
    'reg_alpha': [0.03],
}

grid_search = GridSearchCV(lightgmb,
                           param_grid=parameter_grid,
                           cv=folds,
                           scoring='neg_mean_absolute_error',
                           n_jobs=12)
grid_search.fit(train_stack, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# LightGBM meta-model on the level-1 stack, using the best grid-search
# parameters; feature importance plotting is enabled here.
lgb_stack_kwargs = dict(
    X=train_stack,
    X_test=test_stack,
    y=y,
    params=grid_search.best_params_,
    folds=folds,
    model_type='lgb',
    eval_metric='mae',
    plot_feature_importance=True,
)
result_dict_lgb_stack = artgor_utils.train_model_regression(**lgb_stack_kwargs)

## - Xgboost

In [None]:
# XGBoost meta-model on the level-1 stack (shallower trees than the base run).
# NOTE(review): 'gpu:reg:linear' and 'silent' look tied to older XGBoost
# releases, and 'gpu_hist' needs a GPU build - confirm against the installed
# version.
xgb_params = dict(
    eta=0.03,
    max_depth=4,
    subsample=0.85,
    colsample_bytree=0.3,
    objective='gpu:reg:linear',
    eval_metric='mae',
    silent=True,
    tree_method='gpu_hist',
)
result_dict_xgb_stack = artgor_utils.train_model_regression(
    X=train_stack,
    X_test=test_stack,
    y=y,
    params=xgb_params,
    folds=folds,
    model_type='xgb',
)

## - cat boost

In [None]:
# CatBoost meta-model on the level-1 stack with an empty parameter dict.
params = dict()
result_dict_cat_stack = artgor_utils.train_model_regression(
    X=train_stack,
    X_test=test_stack,
    y=y,
    params=params,
    folds=folds,
    model_type='cat',
)

# light gbm final stacking

## - create second-level feature sets

In [None]:
# Level-2 stacking inputs: one column per level-1 meta-model (lgb, xgb, cat).
# Training rows use each meta-model's 'oof' entry, test rows its 'prediction'.
#
# Fix: the DataFrames are now built from the level-2 arrays assembled here.
# The original passed `train_stack` / `test_stack` to pd.DataFrame, which
# discarded these arrays and merely reselected the 'lgb', 'xgb' and 'cat'
# columns of the level-1 frames.
second_level_columns = ['lgb', 'xgb', 'cat']

train_stack_second = np.vstack([
    result_dict_lgb_stack['oof'], result_dict_xgb_stack['oof'], result_dict_cat_stack['oof']
]).transpose()
train_stack_second = pd.DataFrame(
    train_stack_second,
    columns=second_level_columns)

test_stack_second = np.vstack([
    result_dict_lgb_stack['prediction'], result_dict_xgb_stack['prediction'],
    result_dict_cat_stack['prediction']
]).transpose()
test_stack_second = pd.DataFrame(
    test_stack_second,
    columns=second_level_columns)

## - final lgb stacking

In [None]:
%%time
lightgmb = lgb.LGBMRegressor()

# parameter_grid = {
#     'num_leaves': [4, 8,16],
#     'min_child_samples': [8, 16],
#     'objective': ['gamma'],
#     'max_depth': [4, 8],
#     'learning_rate': [0.01, 0.03],
#     "boosting_type": ['gbdt'],
#     "subsample_freq": [5],
#     "subsample": [0.85],
#     "bagging_seed": [11],
#     "metric": ['mae'],
#     "verbosity": [-1],
#     'reg_alpha': [0.03, 0.1],
# }

parameter_grid = {
    'num_leaves': [16],
    'min_child_samples': [16],
    'objective': ['gamma'],
    'max_depth': [4],
    'learning_rate': [0.03],
    "boosting_type": ['gbdt'],
    "subsample_freq": [5],
    "subsample": [0.85],
    "bagging_seed": [11],
    "metric": ['mae'],
    "verbosity": [-1],
    'reg_alpha': [0.03],
}

grid_search = GridSearchCV(lightgmb,
                           param_grid=parameter_grid,
                           cv=folds,
                           scoring='neg_mean_absolute_error',
                           n_jobs=12)
grid_search.fit(train_stack, y)
print('Best score: {}'.format(grid_search.best_score_))
print('Best parameters: {}'.format(grid_search.best_params_))

In [None]:
# Final LightGBM model on the level-2 stack, using the best grid-search
# parameters; its 'prediction' entry becomes the submission.
final_stack_kwargs = dict(
    X=train_stack_second,
    X_test=test_stack_second,
    y=y,
    params=grid_search.best_params_,
    folds=folds,
    model_type='lgb',
    eval_metric='mae',
    plot_feature_importance=True,
)
result_dict_lgb_stack_final = artgor_utils.train_model_regression(**final_stack_kwargs)

# submission

In [None]:
# Write the final stacked prediction into the submission frame and save it.
final_prediction = result_dict_lgb_stack_final['prediction']
submission['time_to_failure'] = final_prediction
print(submission.head())
submission.to_csv('good-features-second-stacking.csv')