In [5]:
import numpy as np
import pandas as pd
import os
import time
import datetime
import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, train_test_split, KFold
from sklearn.metrics import mean_absolute_error as mae

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

from catboost import CatBoostRegressor, Pool
import lightgbm as lgb
import catboost
import xgboost as xgb
print('XGB version:', xgb.__version__)

import warnings
warnings.filterwarnings("ignore")

XGB version: 0.81


In [21]:
def huber_approx_obj(preds, dtrain, h=1):
    """Pseudo-Huber objective: per-sample gradient and hessian.

    The underlying loss is ``h**2 * (sqrt(1 + (d / h)**2) - 1)`` with
    ``d = preds - labels``; it is quadratic near zero and linear for large
    residuals.

    Parameters
    ----------
    preds : array-like
        Current model predictions.
    dtrain : array-like or object exposing ``get_label()``
        Either the label array itself, or a container (such as an
        ``xgboost.DMatrix``) from which the labels are read via ``get_label()``.
    h : float, default 1
        The delta of the pseudo-Huber loss (residual scale at which the loss
        turns from quadratic to linear).

    Returns
    -------
    (grad, hess) : tuple of arrays
        First and second derivative of the loss with respect to ``preds``.

    NOTE(review): XGBoost's scikit-learn wrapper documents custom objectives
    as ``objective(y_true, y_pred)``. If this function is handed to that
    wrapper directly the two arguments arrive swapped and the gradient changes
    sign -- confirm the calling convention before using it there.
    """
    # Accept both a plain label array and a DMatrix-like wrapper.
    labels = dtrain.get_label() if hasattr(dtrain, 'get_label') else dtrain
    d = preds - labels
    scale = 1 + (d / h) ** 2
    scale_sqrt = np.sqrt(scale)
    grad = d / scale_sqrt
    hess = 1 / scale / scale_sqrt
    return grad, hess

def LMAE_XGB(preds, dtrain):
    """XGBoost-style evaluation metric: log of the mean absolute error.

    ``dtrain`` must expose ``get_label()``; the result is the
    ``(metric_name, value)`` pair.
    """
    targets = dtrain.get_label()
    mean_abs_err = mae(preds, targets)
    return 'LMAE_XGB', np.log(mean_abs_err)

def LMAE_LGB(preds, labels):
    """LightGBM-style evaluation metric: log of the mean absolute error.

    Returns the triple ``(metric_name, value, is_higher_better)``; the last
    element is ``False`` because a lower log-MAE is better.
    """
    metric_name = 'LMAE_LGB'
    log_mae = np.log(mae(preds, labels))
    higher_is_better = False
    return metric_name, log_mae, higher_is_better

def LMAE(preds, labels):
    """Return the natural log of the mean absolute error between the inputs."""
    mean_abs_err = mae(preds, labels)
    return np.log(mean_abs_err)

def permutation_importance(model, X_val, y_val, random_state=None):
    """Permutation importance of every column of a validation DataFrame.

    For each column the values are shuffled, the model is re-scored with
    ``LMAE`` and the increase over the un-shuffled score is recorded. A larger
    ``permute_score`` means the model relies more on that feature.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
    X_val : pandas.DataFrame
        Validation features. Columns are shuffled in place one at a time and
        restored afterwards (also when scoring raises).
    y_val : array-like
        Validation targets.
    random_state : int or None, default None
        Seed for the shuffles. ``None`` keeps using NumPy's global random
        state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per column with the fields ``feature`` and ``permute_score``.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    base_score = LMAE(y_val, model.predict(X_val))
    print('Base score:', base_score)

    # Collect plain records and build the frame once; appending to a
    # DataFrame row by row copies the whole frame on every iteration.
    records = []
    for col in tqdm(X_val.columns.values):
        freezed_col = X_val[col].copy()
        try:
            X_val[col] = rng.permutation(X_val[col])
            score = LMAE(y_val, model.predict(X_val)) - base_score
        finally:
            # Always hand the caller's data back un-shuffled.
            X_val[col] = freezed_col
        records.append({'feature': col, 'permute_score': score})

    return pd.DataFrame(records, columns=['feature', 'permute_score'])


def permutation_importance_XGB(model, X_val, y_val, col_names, random_state=None):
    """Permutation importance for a model evaluated on a 2-D NumPy array.

    Column ``j`` of ``X_val`` is shuffled, the model is re-scored with
    ``LMAE`` and the increase over the un-shuffled score is stored under the
    name ``col_names[j]``. A larger ``permute_score`` means the model relies
    more on that feature.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
    X_val : numpy.ndarray of shape (n_samples, n_features)
        Validation features. Columns are shuffled in place one at a time and
        restored afterwards (also when scoring raises).
    y_val : array-like
        Validation targets.
    col_names : sequence
        Feature names; position ``j`` names column ``j`` of ``X_val``.
    random_state : int or None, default None
        Seed for the shuffles. ``None`` keeps using NumPy's global random
        state, exactly as before.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``col_names`` with the fields ``feature`` and
        ``permute_score``.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    base_score = LMAE(y_val, model.predict(X_val))
    print('Base score:', base_score)

    # Collect plain records and build the frame once; appending to a
    # DataFrame row by row copies the whole frame on every iteration.
    records = []
    for j, col in enumerate(tqdm(col_names)):
        freezed_col = X_val[:, j].copy()
        try:
            X_val[:, j] = rng.permutation(X_val[:, j])
            score = LMAE(y_val, model.predict(X_val)) - base_score
        finally:
            # Always hand the caller's data back un-shuffled.
            X_val[:, j] = freezed_col
        records.append({'feature': col, 'permute_score': score})

    return pd.DataFrame(records, columns=['feature', 'permute_score'])

### FEATURE PERMUTATION

In [22]:
# Root folder holding one sub-folder per feature set. It can be overridden
# through the MOLECULE_FEATURES_DIR environment variable; the default is the
# path that was hard-coded before. os.path.join(..., '') guarantees the
# trailing separator that the plain string concatenations below rely on.
# (Windows location used on another machine: 'I:/Molecule_Kaggle/features/')
DATAPATH = os.path.join(
    os.environ.get('MOLECULE_FEATURES_DIR',
                   '/Users/voanhkha/Desktop/Molecule_Kaggle/features/'),
    '')

# Feature sets to merge and the on-disk format of each (same order).
FEATURE_SETS = ['josh', 'kyle', 'M101', 'M113', 'kyleB', 'fc_oof_seed222']
SET_EXT = ['parquet', 'csv', 'parquet', 'parquet', 'csv', 'csv']
TYPES = [2,1,3,6]
SPLIT_SEED = 1248
N_SPLITS = 5        # K of the K-fold split
N_FOLDS_USED = 3    # only the first folds are trained (presumably to limit run time)

# NOTE(review): CATEGORICAL_FEATURES is indexed with bondtype-1 below, so it
# presumably stores one list of column names per bond type -- confirm.
CATEGORICAL_FEATURES = np.load('info_data/CATEGORICAL_FEATURES.npy')
USELESS_FEATURES = list(np.load('info_data/FAIL_FEATURES.npy'))  # loaded but not used in this cell
REMOVED_FEATURES = ['id', 'y', 'molecule_id', 'atom1', 'atom2', 'bondtype']

for bondtype in TYPES:
    print('SELECTION FOR TYPE', bondtype, '................')

    # The column shuffles inside permutation_importance_XGB draw from NumPy's
    # global random state; seed it so the ranking of each type is reproducible.
    np.random.seed(SPLIT_SEED)

    # Load every feature set, then concatenate once column-wise.
    frames = []
    for fset, ext in zip(FEATURE_SETS, SET_EXT):
        print('Loading', fset, '...')
        trainfile = fset+'/'+fset+'_train_'+str(bondtype)+'.'+ext
        if ext == 'parquet':
            temp_df = pd.read_parquet(DATAPATH+trainfile)
        elif ext == 'csv':
            temp_df = pd.read_csv(DATAPATH+trainfile)
        else:
            # Previously an unknown extension silently re-used the last frame.
            raise ValueError('Unsupported extension %r for feature set %r' % (ext, fset))
        frames.append(temp_df.reset_index(drop=True))
    train_df = pd.concat(frames, axis=1)
    del frames

    ## Post process feature matrix
    # Drop duplicate column names (the first occurrence is kept)
    print("Drop Duplicates...")
    train_df = train_df.loc[:,~train_df.columns.duplicated()]
    print("Fill na...")
    train_df.fillna(0, inplace=True)

    # Detect useful and categorical features
    print("Get useful features...")
    feature_names = pd.unique(train_df.columns.values)
    useful_features = [ f for f in feature_names if f not in REMOVED_FEATURES ]
    categorical_features = [ f for f in useful_features if f in CATEGORICAL_FEATURES[bondtype-1]]

    train_df = train_df[useful_features]

    # Label-encode categorical features
    print("Label encoding...")
    for col in tqdm(categorical_features):
        LE = LabelEncoder()
        train_df[col] = LE.fit_transform(train_df[col])

    #------------------------------------
    print(train_df.shape)
    y = pd.read_csv('info_data/info_train_type_'+str(bondtype)+'.csv', usecols=['scalar_coupling_constant']).values
    # Features and targets come from different files; fail early on a mismatch
    # instead of training on misaligned rows.
    if len(y) != len(train_df):
        raise ValueError('Target has %d rows but the feature matrix has %d rows for type %s'
                         % (len(y), len(train_df), bondtype))

    #----------------------------------#
    kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SPLIT_SEED)
    feature_cols = train_df.columns.values
    # Sum over folds of each feature's rank (0 = smallest permute_score).
    avg_rank = np.zeros(train_df.shape[1])
    for i, (fold_index_tr, fold_index_va) in enumerate(kf.split(train_df)):
        if i >= N_FOLDS_USED: break

        print('\n Train fold', i, '......')
        # The below for LGB
        #X_tr, X_va = train_df.iloc[fold_index_tr, :], train_df.iloc[fold_index_va, :]
        # The below for XGB
        X_tr, X_va = train_df.iloc[fold_index_tr, :].values, train_df.iloc[fold_index_va, :].values
        y_tr, y_va = y[fold_index_tr].flatten(), y[fold_index_va].flatten()

        ########## XGB #########
        XGB_PARAMS = { "learning_rate":0.1, "n_estimators":2000, "max_depth":12,
                       "min_child_weight": 40, "subsample":0.7, "objective":'reg:linear',
                       "nthread":4, "scale_pos_weight":1, "seed":27, "base_score":np.mean(y_tr)}
        mdl = xgb.XGBRegressor(**XGB_PARAMS)
        mdl.fit(X_tr, y_tr, verbose=10, early_stopping_rounds=30,
                eval_set=[(X_va, y_va)], eval_metric=LMAE_XGB)

        ############## LGB #########
#         LGB_PARAMS = {'num_leaves': 100, 'min_data_in_leaf': 50, 'objective':'huber',
#                      'max_depth': 15, 'learning_rate': 0.1, "boosting": "gbdt",
#                      "feature_fraction": 1, "bagging_fraction": 0.7,
#                      "random_state": 240691, "num_threads": 4 }
#         mdl = lgb.LGBMRegressor(**LGB_PARAMS, n_estimators = 4000, n_jobs = -1)
#         mdl.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric=LMAE_LGB,
#                 verbose=100, early_stopping_rounds=50,
#                 feature_name=useful_features, categorical_feature=categorical_features)

        ########### CatBoost #########
#         mdl = catboost.CatBoostRegressor(loss_function='RMSE', n_estimators = 5000, eval_metric="MAE")
#         mdl.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose_eval=50, early_stopping_rounds=50)
        ################################

        results = permutation_importance_XGB(mdl, X_va, y_va, feature_cols)
        # argsort of argsort turns scores into ranks: 0 for the smallest
        # permute_score, n-1 for the feature whose shuffling hurts the most.
        ranking = np.argsort(np.argsort(results.permute_score.values))
        avg_rank += ranking

    # Build the table from the column names themselves rather than from the
    # `results` variable left over from the last fold. 'permute_score' holds
    # the summed rank; sorting descending puts the most important feature first.
    final_results = pd.DataFrame({'feature': feature_cols, 'permute_score': avg_rank},
                                 columns=['feature', 'permute_score'])
    final_results = final_results.sort_values('permute_score', ascending=False).reset_index(drop=True)
    final_results.to_csv('feature_ranking_xgb_type_'+str(bondtype)+'.csv', index=False)

SELECTION FOR TYPE 2 ................
Loading josh ...
Loading kyle ...
Loading M101 ...
Loading M113 ...
Loading kyleB ...
Loading fc_oof_seed222 ...
Drop Duplicates...
Fill na...
Get useful features...
Label encoding...


HBox(children=(IntProgress(value=0, max=170), HTML(value='')))

(43363, 603)

 Train fold 0 ......
[0]	validation_0-rmse:9.85524	validation_0-LMAE_XGB:2.17214
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:3.50834	validation_0-LMAE_XGB:1.1263
[20]	validation_0-rmse:1.31601	validation_0-LMAE_XGB:0.097851
[30]	validation_0-rmse:0.639616	validation_0-LMAE_XGB:-0.781478
[40]	validation_0-rmse:0.484014	validation_0-LMAE_XGB:-1.21801
[50]	validation_0-rmse:0.457976	validation_0-LMAE_XGB:-1.30443
[60]	validation_0-rmse:0.451496	validation_0-LMAE_XGB:-1.31497
[70]	validation_0-rmse:0.448904	validation_0-LMAE_XGB:-1.31388
[80]	validation_0-rmse:0.4462	validation_0-LMAE_XGB:-1.31537
[90]	validation_0-rmse:0.443835	validation_0-LMAE_XGB:-1.31444
[100]	validation_0-rmse:0.442094	validation_0-LMAE_XGB:-1.31347
Stopping. Best iteration:
[78]	validation_0-rmse:0.446762	validation_0-LMAE_XGB:-1.31558

Base score: -1.3155811

HBox(children=(IntProgress(value=0, max=603), HTML(value='')))


 Train fold 1 ......
[0]	validation_0-rmse:9.8754	validation_0-LMAE_XGB:2.17471
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:3.5392	validation_0-LMAE_XGB:1.13652
[20]	validation_0-rmse:1.3567	validation_0-LMAE_XGB:0.121657
[30]	validation_0-rmse:0.685853	validation_0-LMAE_XGB:-0.750557
[40]	validation_0-rmse:0.525456	validation_0-LMAE_XGB:-1.1958
[50]	validation_0-rmse:0.492305	validation_0-LMAE_XGB:-1.29787
[60]	validation_0-rmse:0.480825	validation_0-LMAE_XGB:-1.3165
[70]	validation_0-rmse:0.475962	validation_0-LMAE_XGB:-1.3211
[80]	validation_0-rmse:0.474297	validation_0-LMAE_XGB:-1.32037
[90]	validation_0-rmse:0.473731	validation_0-LMAE_XGB:-1.31862
[100]	validation_0-rmse:0.47289	validation_0-LMAE_XGB:-1.3185
Stopping. Best iteration:
[76]	validation_0-rmse:0.474741	validation_0-LMAE_XGB:-1.32129

Base score: -1.321292403116159


HBox(children=(IntProgress(value=0, max=603), HTML(value='')))


 Train fold 2 ......
[0]	validation_0-rmse:9.7981	validation_0-LMAE_XGB:2.16552
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:3.48147	validation_0-LMAE_XGB:1.12074
[20]	validation_0-rmse:1.30665	validation_0-LMAE_XGB:0.097201
[30]	validation_0-rmse:0.629225	validation_0-LMAE_XGB:-0.796167
[40]	validation_0-rmse:0.471489	validation_0-LMAE_XGB:-1.25383
[50]	validation_0-rmse:0.441758	validation_0-LMAE_XGB:-1.34097
[60]	validation_0-rmse:0.435336	validation_0-LMAE_XGB:-1.34679
[70]	validation_0-rmse:0.433053	validation_0-LMAE_XGB:-1.34508
[80]	validation_0-rmse:0.431928	validation_0-LMAE_XGB:-1.34497
Stopping. Best iteration:
[57]	validation_0-rmse:0.436084	validation_0-LMAE_XGB:-1.3474

Base score: -1.3474037696099086


HBox(children=(IntProgress(value=0, max=603), HTML(value='')))

SELECTION FOR TYPE 1 ................
Loading josh ...
Loading kyle ...
Loading M101 ...
Loading M113 ...
Loading kyleB ...
Loading fc_oof_seed222 ...
Drop Duplicates...
Fill na...
Get useful features...
Label encoding...


HBox(children=(IntProgress(value=0, max=161), HTML(value='')))

(709416, 629)

 Train fold 0 ......
[0]	validation_0-rmse:16.477	validation_0-LMAE_XGB:2.44184
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:5.80517	validation_0-LMAE_XGB:1.39714
[20]	validation_0-rmse:2.13746	validation_0-LMAE_XGB:0.398252
[30]	validation_0-rmse:0.992643	validation_0-LMAE_XGB:-0.382042
[40]	validation_0-rmse:0.736126	validation_0-LMAE_XGB:-0.732612
[50]	validation_0-rmse:0.697873	validation_0-LMAE_XGB:-0.813685
[60]	validation_0-rmse:0.692984	validation_0-LMAE_XGB:-0.825201
[70]	validation_0-rmse:0.692546	validation_0-LMAE_XGB:-0.825727
[80]	validation_0-rmse:0.692632	validation_0-LMAE_XGB:-0.82513
[90]	validation_0-rmse:0.692937	validation_0-LMAE_XGB:-0.824435
Stopping. Best iteration:
[66]	validation_0-rmse:0.692525	validation_0-LMAE_XGB:-0.825808

Base score: -0.825807591252091


HBox(children=(IntProgress(value=0, max=629), HTML(value='')))


 Train fold 1 ......
[0]	validation_0-rmse:16.4657	validation_0-LMAE_XGB:2.44565
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:5.79974	validation_0-LMAE_XGB:1.40194
[20]	validation_0-rmse:2.1336	validation_0-LMAE_XGB:0.40406
[30]	validation_0-rmse:0.989496	validation_0-LMAE_XGB:-0.379993
[40]	validation_0-rmse:0.73282	validation_0-LMAE_XGB:-0.737932
[50]	validation_0-rmse:0.69528	validation_0-LMAE_XGB:-0.819878
[60]	validation_0-rmse:0.691103	validation_0-LMAE_XGB:-0.831795
[70]	validation_0-rmse:0.691305	validation_0-LMAE_XGB:-0.832224
[80]	validation_0-rmse:0.691796	validation_0-LMAE_XGB:-0.831508
[90]	validation_0-rmse:0.692067	validation_0-LMAE_XGB:-0.831022
Stopping. Best iteration:
[68]	validation_0-rmse:0.691304	validation_0-LMAE_XGB:-0.832284

Base score: -0.8322839955627681


HBox(children=(IntProgress(value=0, max=629), HTML(value='')))


 Train fold 2 ......
[0]	validation_0-rmse:16.434	validation_0-LMAE_XGB:2.44941
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:5.79406	validation_0-LMAE_XGB:1.40626
[20]	validation_0-rmse:2.13865	validation_0-LMAE_XGB:0.410919
[30]	validation_0-rmse:0.994693	validation_0-LMAE_XGB:-0.370257
[40]	validation_0-rmse:0.735852	validation_0-LMAE_XGB:-0.727403
[50]	validation_0-rmse:0.695955	validation_0-LMAE_XGB:-0.813141
[60]	validation_0-rmse:0.690664	validation_0-LMAE_XGB:-0.827944
[70]	validation_0-rmse:0.690245	validation_0-LMAE_XGB:-0.829534
[80]	validation_0-rmse:0.690306	validation_0-LMAE_XGB:-0.829316
[90]	validation_0-rmse:0.690625	validation_0-LMAE_XGB:-0.828885
[100]	validation_0-rmse:0.691005	validation_0-LMAE_XGB:-0.828276
Stopping. Best iteration:
[73]	validation_0-rmse:0.6902	validation_0-LMAE_XGB:-0.829547

Base score: -0.829546601098

HBox(children=(IntProgress(value=0, max=629), HTML(value='')))

SELECTION FOR TYPE 3 ................
Loading josh ...
Loading kyle ...
Loading M101 ...
Loading M113 ...
Loading kyleB ...
Loading fc_oof_seed222 ...
Drop Duplicates...
Fill na...
Get useful features...
Label encoding...


HBox(children=(IntProgress(value=0, max=165), HTML(value='')))

(1140674, 706)

 Train fold 0 ......
[0]	validation_0-rmse:4.07533	validation_0-LMAE_XGB:0.895905
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:1.47192	validation_0-LMAE_XGB:-0.122728
[20]	validation_0-rmse:0.613812	validation_0-LMAE_XGB:-1.0017
[30]	validation_0-rmse:0.38872	validation_0-LMAE_XGB:-1.51598
[40]	validation_0-rmse:0.347166	validation_0-LMAE_XGB:-1.67368
[50]	validation_0-rmse:0.340335	validation_0-LMAE_XGB:-1.70207
[60]	validation_0-rmse:0.338707	validation_0-LMAE_XGB:-1.70664
[70]	validation_0-rmse:0.338312	validation_0-LMAE_XGB:-1.7066
[80]	validation_0-rmse:0.337949	validation_0-LMAE_XGB:-1.7068
[90]	validation_0-rmse:0.337865	validation_0-LMAE_XGB:-1.70642
Stopping. Best iteration:
[65]	validation_0-rmse:0.338324	validation_0-LMAE_XGB:-1.70706

Base score: -1.707059650370058


HBox(children=(IntProgress(value=0, max=706), HTML(value='')))


 Train fold 1 ......
[0]	validation_0-rmse:4.063	validation_0-LMAE_XGB:0.895782
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:1.47579	validation_0-LMAE_XGB:-0.1208
[20]	validation_0-rmse:0.625532	validation_0-LMAE_XGB:-0.998578
[30]	validation_0-rmse:0.404182	validation_0-LMAE_XGB:-1.51333
[40]	validation_0-rmse:0.363533	validation_0-LMAE_XGB:-1.67173
[50]	validation_0-rmse:0.355678	validation_0-LMAE_XGB:-1.70133
[60]	validation_0-rmse:0.353929	validation_0-LMAE_XGB:-1.7063
[70]	validation_0-rmse:0.352569	validation_0-LMAE_XGB:-1.70752
[80]	validation_0-rmse:0.351818	validation_0-LMAE_XGB:-1.70808
[90]	validation_0-rmse:0.35135	validation_0-LMAE_XGB:-1.7086
[100]	validation_0-rmse:0.351163	validation_0-LMAE_XGB:-1.70821
[110]	validation_0-rmse:0.350833	validation_0-LMAE_XGB:-1.70815
[120]	validation_0-rmse:0.350611	validation_0-LMAE_XGB:-1.708

HBox(children=(IntProgress(value=0, max=706), HTML(value='')))


 Train fold 2 ......
[0]	validation_0-rmse:4.08096	validation_0-LMAE_XGB:0.897098
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:1.47446	validation_0-LMAE_XGB:-0.12047
[20]	validation_0-rmse:0.615007	validation_0-LMAE_XGB:-0.999855
[30]	validation_0-rmse:0.390867	validation_0-LMAE_XGB:-1.51585
[40]	validation_0-rmse:0.349961	validation_0-LMAE_XGB:-1.67259
[50]	validation_0-rmse:0.343275	validation_0-LMAE_XGB:-1.69947
[60]	validation_0-rmse:0.341706	validation_0-LMAE_XGB:-1.70275
[70]	validation_0-rmse:0.341036	validation_0-LMAE_XGB:-1.70257
[80]	validation_0-rmse:0.340605	validation_0-LMAE_XGB:-1.70219
[90]	validation_0-rmse:0.340335	validation_0-LMAE_XGB:-1.70231
Stopping. Best iteration:
[67]	validation_0-rmse:0.341105	validation_0-LMAE_XGB:-1.70298

Base score: -1.7029802701676047


HBox(children=(IntProgress(value=0, max=706), HTML(value='')))

SELECTION FOR TYPE 6 ................
Loading josh ...
Loading kyle ...
Loading M101 ...
Loading M113 ...
Loading kyleB ...
Loading fc_oof_seed222 ...
Drop Duplicates...
Fill na...
Get useful features...
Label encoding...


HBox(children=(IntProgress(value=0, max=164), HTML(value='')))

(1510379, 530)

 Train fold 0 ......
[0]	validation_0-rmse:2.77594	validation_0-LMAE_XGB:0.807277
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:1.04051	validation_0-LMAE_XGB:-0.211662
[20]	validation_0-rmse:0.510898	validation_0-LMAE_XGB:-1.08765
[30]	validation_0-rmse:0.392555	validation_0-LMAE_XGB:-1.57563
[40]	validation_0-rmse:0.371593	validation_0-LMAE_XGB:-1.70083
[50]	validation_0-rmse:0.367593	validation_0-LMAE_XGB:-1.71845
[60]	validation_0-rmse:0.36604	validation_0-LMAE_XGB:-1.7201
[70]	validation_0-rmse:0.365545	validation_0-LMAE_XGB:-1.71961
[80]	validation_0-rmse:0.364363	validation_0-LMAE_XGB:-1.71939
Stopping. Best iteration:
[57]	validation_0-rmse:0.366562	validation_0-LMAE_XGB:-1.72016

Base score: -1.7201563131294413


HBox(children=(IntProgress(value=0, max=530), HTML(value='')))


 Train fold 1 ......
[0]	validation_0-rmse:2.76239	validation_0-LMAE_XGB:0.807088
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:1.02518	validation_0-LMAE_XGB:-0.211909
[20]	validation_0-rmse:0.486337	validation_0-LMAE_XGB:-1.08828
[30]	validation_0-rmse:0.365316	validation_0-LMAE_XGB:-1.57678
[40]	validation_0-rmse:0.345546	validation_0-LMAE_XGB:-1.70364
[50]	validation_0-rmse:0.342696	validation_0-LMAE_XGB:-1.72027
[60]	validation_0-rmse:0.341667	validation_0-LMAE_XGB:-1.72125
[70]	validation_0-rmse:0.340942	validation_0-LMAE_XGB:-1.72135
[80]	validation_0-rmse:0.340526	validation_0-LMAE_XGB:-1.72093
Stopping. Best iteration:
[57]	validation_0-rmse:0.341755	validation_0-LMAE_XGB:-1.72146

Base score: -1.7214606910809924


HBox(children=(IntProgress(value=0, max=530), HTML(value='')))


 Train fold 2 ......
[0]	validation_0-rmse:2.77248	validation_0-LMAE_XGB:0.809411
Multiple eval metrics have been passed: 'validation_0-LMAE_XGB' will be used for early stopping.

Will train until validation_0-LMAE_XGB hasn't improved in 30 rounds.
[10]	validation_0-rmse:1.03342	validation_0-LMAE_XGB:-0.210706
[20]	validation_0-rmse:0.501666	validation_0-LMAE_XGB:-1.08784
[30]	validation_0-rmse:0.383717	validation_0-LMAE_XGB:-1.57906
[40]	validation_0-rmse:0.363154	validation_0-LMAE_XGB:-1.70616
[50]	validation_0-rmse:0.35897	validation_0-LMAE_XGB:-1.72379
[60]	validation_0-rmse:0.357819	validation_0-LMAE_XGB:-1.72525
[70]	validation_0-rmse:0.356993	validation_0-LMAE_XGB:-1.72512
[80]	validation_0-rmse:0.356601	validation_0-LMAE_XGB:-1.72475
[90]	validation_0-rmse:0.356891	validation_0-LMAE_XGB:-1.72443
Stopping. Best iteration:
[60]	validation_0-rmse:0.357819	validation_0-LMAE_XGB:-1.72525

Base score: -1.7252501225712178


HBox(children=(IntProgress(value=0, max=530), HTML(value='')))

### Try different numbers of top-ranked features

In [25]:
# Numbers of top-ranked features to evaluate.
RANGE_NB_FEATURES = [60, 70, 80, 90, 100, 120, 140, 160, 180]

for bondtype in [2]:
    print('WORKING ON TYPE', bondtype)

    # Load every feature set, then concatenate once column-wise.
    # Only the training features are needed here; the test set is not loaded.
    frames = []
    for fset, ext in zip(FEATURE_SETS, SET_EXT):
        print('Loading', fset, '...')
        trainfile = fset+'/'+fset+'_train_'+str(bondtype)+'.'+ext
        if ext == 'parquet':
            temp_df = pd.read_parquet(DATAPATH+trainfile)
        elif ext == 'csv':
            temp_df = pd.read_csv(DATAPATH+trainfile)
        else:
            # Previously an unknown extension silently re-used the last frame.
            raise ValueError('Unsupported extension %r for feature set %r' % (ext, fset))
        frames.append(temp_df.reset_index(drop=True))
    train_df = pd.concat(frames, axis=1)
    del frames

    ## Post process feature matrix

    # Drop duplicate column names (the first occurrence is kept)
    print("Drop Duplicates...")
    train_df = train_df.loc[:,~train_df.columns.duplicated()]
    print("Fill na...")
    train_df.fillna(0, inplace=True)

    # Detect useful and categorical features
    print("Get useful features...")
    feature_names = pd.unique(train_df.columns.values)
    useful_features = [ f for f in feature_names if f not in REMOVED_FEATURES ]
    categorical_features = [ f for f in useful_features if f in CATEGORICAL_FEATURES[bondtype-1]]

    train_df = train_df[useful_features]

    # Label-encode categorical features
    print("Label encoding...")
    for col in tqdm(categorical_features):
        LE = LabelEncoder()
        train_df[col] = LE.fit_transform(train_df[col])

    y = pd.read_csv('info_data/info_train_type_'+str(bondtype)+'.csv', usecols=['scalar_coupling_constant']).values.flatten()
    # Features and targets come from different files; fail early on a mismatch
    # instead of training on misaligned rows.
    if len(y) != len(train_df):
        raise ValueError('Target has %d rows but the feature matrix has %d rows for type %s'
                         % (len(y), len(train_df), bondtype))

    LGB_PARAMS = {'num_leaves': 100,
                  'min_data_in_leaf': 50,
                  'objective': 'huber',
                  'max_depth': 15,
                  'learning_rate': 0.1,
                  "boosting": "gbdt",
                  "feature_fraction": 1,
                  "bagging_fraction": 0.7,
                  "random_state": 240691,
                  "num_threads": 4
                  }

    results = []
    # NOTE(review): this reads 'feature_ranking_lgbstack_type_*.csv', while the
    # permutation cell as currently written saves 'feature_ranking_xgb_type_*.csv'.
    # Confirm the lgbstack ranking file is the intended input.
    ranking = pd.read_csv('feature_ranking_lgbstack_type_'+str(bondtype)+'.csv')
    for g in RANGE_NB_FEATURES:
        print('\n\n')
        print('Nb features:', g)
        good_features = ranking.head(g).feature.values
        train_df_2 = train_df[good_features]
        print(train_df_2.shape)

        X_tr, X_va, y_tr, y_va = train_test_split(train_df_2, y, test_size=0.2, random_state=42)

        lgbmodel_2 = lgb.LGBMRegressor(**LGB_PARAMS, n_estimators = 10000, n_jobs = -1)
        lgbmodel_2.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric=LMAE_LGB,
                       verbose=500, early_stopping_rounds=50)

        # Record the score at the best iteration. The last entry of
        # evals_result_ is the score after the early-stopping patience has
        # run out, i.e. past the iteration the model actually keeps.
        score = lgbmodel_2.best_score_['valid_0']['LMAE_LGB']
        results.append(score)

    result_df = pd.DataFrame(data={'nb_features': RANGE_NB_FEATURES, 'score': results})
    result_df.to_csv('permute_result_lgbstack_type_'+str(bondtype)+'.csv', index=False)

WORKING ON TYPE 2
Loading josh ...
Loading kyle ...
Loading M101 ...
Loading M113 ...
Loading kyleB ...
Loading fc_oof_seed222 ...
Drop Duplicates...
Fill na...
Get useful features...
Label encoding...


HBox(children=(IntProgress(value=0, max=170), HTML(value='')))




Nb features: 60
(43363, 60)
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's huber: 0.0542751	valid_0's LMAE_LGB: -1.48885
Early stopping, best iteration is:
[743]	valid_0's huber: 0.0531988	valid_0's LMAE_LGB: -1.49364



Nb features: 70
(43363, 70)
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's huber: 0.0541235	valid_0's LMAE_LGB: -1.49004
Early stopping, best iteration is:
[713]	valid_0's huber: 0.0529959	valid_0's LMAE_LGB: -1.49715



Nb features: 80
(43363, 80)
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's huber: 0.0537437	valid_0's LMAE_LGB: -1.49573
Early stopping, best iteration is:
[894]	valid_0's huber: 0.0525891	valid_0's LMAE_LGB: -1.50123



Nb features: 90
(43363, 90)
Training until validation scores don't improve for 50 rounds.
[500]	valid_0's huber: 0.0532199	valid_0's LMAE_LGB: -1.49682
[1000]	valid_0's huber: 0.0519514	valid_0's LMAE_LGB: -1.50465
Early stopping, best itera