# Null importance for feature selection

In [None]:
### https://www.kaggle.com/ogrellier/feature-selection-with-null-importances

In [None]:
#PCA function with normalization
def doPCAwS_null(df_train, n):
    """Standardise ``df_train`` and project it onto ``n`` principal components.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Columns to compress. They are passed straight to ``StandardScaler``
        and ``PCA``, so they are expected to be numeric and already imputed
        (``PCA_Master_null`` mean-imputes before calling this function).
    n : int
        Forwarded to ``PCA(n_components=n)``.

    Returns
    -------
    pandas.DataFrame
        One column per component (labelled ``0 .. n-1``), carrying the same
        index labels as ``df_train`` and an unnamed index.

    Side effects
    ------------
    Prints the cumulative explained variance (in percent) of the components.
    """
    # Scale first so that no feature dominates the components merely because
    # of the units it is expressed in.
    scaler = StandardScaler()
    scaler.fit(df_train)
    X_train = scaler.transform(df_train)

    # Calling PCA function and fitting
    pca = PCA(n_components=n)
    pca.fit(X_train)
    # Cumulative variance explained, in percent
    cum_var = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4) * 100)
    print(cum_var)

    # Transform into PCs and put them straight back on the original row
    # labels.  Building the index from a plain list yields an unnamed index,
    # which replaces the former ``del df.index.name`` (an AttributeError on
    # current pandas) and the helper-frame join it required.
    X_train_tfm = pca.transform(X_train)
    df_train_pca = pd.DataFrame(X_train_tfm, index=df_train.index.values.tolist())
    return df_train_pca

#Impute missing values for PCA columns with mean
def impute_PCA(df, col):
    """Fill missing values in the ``col`` column(s) of ``df`` with the column mean.

    ``df`` is modified in place and the same object is returned.
    """
    selected = df[col]
    df[col] = selected.fillna(selected.mean())
    return df

#Function to replace the original columns with PCA columns 
def PCA_FeatureSet(df, feats, PCA_df):
    """Return ``df`` without the ``feats`` columns and with ``PCA_df`` appended.

    The two frames are concatenated column-wise, so rows are matched on their
    index labels.  ``df`` itself is left untouched.
    """
    remaining = df.drop(columns=feats)
    return pd.concat([remaining, PCA_df], axis=1)

def PCA_Master_null(train_x, PCA_feats, num, prefix = 'PCA_PREFIX'):
    """Swap the ``PCA_feats`` columns of ``train_x`` for ``num`` principal components.

    Parameters
    ----------
    train_x : pandas.DataFrame
        Training frame; its ``PCA_feats`` columns are mean-imputed in place.
    PCA_feats : list
        Names of the columns to compress.
    num : int
        Number of components requested from the PCA.
    prefix : str
        Prepended to the component column labels.

    Returns
    -------
    tuple
        ``(frame, feats_new)`` where ``frame`` has the original columns
        replaced by the prefixed components and ``feats_new`` lists every
        column of ``frame`` except the ID / target / index columns.
    """
    # Mean-impute the columns feeding the PCA, then compress and label them.
    train_x = impute_PCA(train_x, PCA_feats)
    components = doPCAwS_null(train_x[PCA_feats], num).add_prefix(prefix)

    # Replace the original columns with the PCA columns
    replaced = PCA_FeatureSet(train_x, PCA_feats, components)

    # Everything that is not an identifier or the target counts as a feature.
    non_features = ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']
    feats_new = [name for name in replaced.columns if name not in non_features]
    return replaced, feats_new

In [None]:
## Implement PCA on training set
# The three calls are chained: each one replaces one column group
# (PCA_feats[0], [1], [2]) with 15 / 33 / 55 prefixed components, and the
# last result is written back to df_tr.  feats_new is overwritten each time,
# so it ends up describing the final frame.
train_x_pca, feats_new = PCA_Master_null(df_tr, PCA_feats[0], 15, prefix = 'PCA_APP_DIM_')
train_x_pca, feats_new = PCA_Master_null(train_x_pca, PCA_feats[1], 33, prefix = 'PCA_PREV_APPREF_')
df_tr, feats_new = PCA_Master_null(train_x_pca, PCA_feats[2], 55, prefix = 'PCA_CC_')

In [None]:
# Append `y` column-wise to the PCA-transformed features (pd.concat aligns
# rows on index labels).  NOTE(review): `y` is presumably the TARGET series,
# since later cells read data['TARGET'] -- confirm where it is defined.
data = df_tr
data = pd.concat([data, y], axis=1)
# Drop the now-redundant name and ask the collector to reclaim memory.
del df_tr
gc.collect()

In [None]:
## Thanks to Olivier
def get_feature_importances(data, shuffle, seed=None):
    """Fit a LightGBM random forest on ``data`` and return feature importances.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``TARGET`` column; every column other than ``TARGET``
        and ``SK_ID_CURR`` is used as a feature.
    shuffle : bool
        When true the target is randomly permuted before fitting, which
        yields one draw of the "null" importance distribution.
    seed : int, optional
        When given, seeds both the target shuffle and LightGBM.  When
        ``None`` (the default) the shuffle draws from NumPy's global RNG and
        LightGBM uses seed 123, exactly as before this parameter was wired in.

    Returns
    -------
    pandas.DataFrame
        One row per feature with columns ``feature``, ``importance_gain``,
        ``importance_split`` and ``trn_score`` (training-set ROC AUC, the
        same value on every row).
    """
    # Gather real features
    train_features = [f for f in data if f not in ['TARGET', 'SK_ID_CURR']]

    # Shuffle target if required
    y = data['TARGET'].copy()
    if shuffle:
        # Here you could as well use a binomial distribution.
        # sample() permutes the rows but keeps their index labels; the label
        # vector is consumed by position below (presumably also by LightGBM).
        y = y.sample(frac=1.0, random_state=seed)

    # Fit LightGBM in RF mode, yes it's quicker than sklearn RandomForest
    dtrain = lgb.Dataset(data[train_features], y, free_raw_data=False, silent=True)
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'rf',
        'subsample': 0.623,
        'colsample_bytree': 0.7,
        'num_leaves': 127,
        'max_depth': 8,
        # Previously hard-coded, which made the `seed` argument a no-op.
        'seed': 123 if seed is None else seed,
        'bagging_freq': 1,
        'n_jobs': 4
    }

    # Fit the model
    clf = lgb.train(params=lgb_params, train_set=dtrain, num_boost_round=200)

    # Get feature importances
    imp_df = pd.DataFrame()
    imp_df["feature"] = list(train_features)
    imp_df["importance_gain"] = clf.feature_importance(importance_type='gain')
    imp_df["importance_split"] = clf.feature_importance(importance_type='split')
    imp_df['trn_score'] = roc_auc_score(y, clf.predict(data[train_features]))

    return imp_df

In [None]:
# Seed the unexpected randomness of this world
# (NumPy's global RNG is what the unseeded target shuffle in
# get_feature_importances draws from in the later null-importance runs.)
np.random.seed(123)
# Get the actual importance, i.e. without shuffling
actual_imp_df = get_feature_importances(data=data, shuffle=False)

In [None]:
import time

# Build the null-importance distribution: refit on a shuffled target
# `nb_runs` times and stack the per-run importance tables.
nb_runs = 80
start = time.time()
dsp = ''
# Per-run frames are collected in a list and concatenated once; calling
# pd.concat inside the loop re-copies every earlier run on each iteration.
null_imp_runs = []
try:
    for i in range(nb_runs):
        # Get current run importances
        imp_df = get_feature_importances(data=data, shuffle=True)
        imp_df['run'] = i + 1
        null_imp_runs.append(imp_df)
        # Erase previous message
        print('\b' * len(dsp), end='', flush=True)
        # Display current run and time used
        spent = (time.time() - start) / 60
        dsp = 'Done with %4d of %4d (Spent %5.1f min)' % (i + 1, nb_runs, spent)
        print(dsp, end='', flush=True)
finally:
    # Runs in `finally` so that interrupting the (long) loop still leaves
    # the completed runs available in null_imp_df.
    null_imp_df = pd.concat(null_imp_runs, axis=0) if null_imp_runs else pd.DataFrame()

In [None]:
# Saving the output of run
# NOTE(review): `dir` is a notebook-level path prefix that shadows the
# builtin dir(); it has to be defined before this cell is executed.
# NOTE(review): the second file name spells "ditribution"; it is left as-is
# here because any reader of that file would have to change with it.
null_imp_df.to_csv(dir+'null_importances_distribution_rf.csv')
actual_imp_df.to_csv(dir+'actual_importances_ditribution_rf.csv')

In [None]:
def display_distributions(actual_imp_df_, null_imp_df_, feature_):
    """Plot a feature's null-importance histograms against its real importance.

    Draws a 1x2 figure: split importance on the left, gain importance on the
    right.  Each panel shows the histogram of the feature's values in
    ``null_imp_df_`` and a red vertical line at the mean of its values in
    ``actual_imp_df_``.
    """
    plt.figure(figsize=(13, 6))
    grid = gridspec.GridSpec(1, 2)

    null_rows = null_imp_df_['feature'] == feature_
    actual_rows = actual_imp_df_['feature'] == feature_
    feature_label = feature_.upper()

    # (grid column, importance column, title format, x-label format)
    panels = (
        (0, 'importance_split', 'Split Importance of %s',
         'Null Importance (split) Distribution for %s '),
        (1, 'importance_gain', 'Gain Importance of %s',
         'Null Importance (gain) Distribution for %s '),
    )
    for position, column, title_fmt, xlabel_fmt in panels:
        ax = plt.subplot(grid[0, position])
        hist_out = ax.hist(null_imp_df_.loc[null_rows, column].values,
                           label='Null importances')
        # The marker line is as tall as the highest histogram bar.
        ax.vlines(x=actual_imp_df_.loc[actual_rows, column].mean(),
                  ymin=0, ymax=np.max(hist_out[0]),
                  color='r', linewidth=10, label='Real Target')
        ax.legend()
        ax.set_title(title_fmt % feature_label, fontweight='bold')
        # plt.xlabel acts on the current axes, i.e. the subplot just created.
        plt.xlabel(xlabel_fmt % feature_label)

In [None]:
# display_distributions(actual_imp_df_=actual_imp_df, null_imp_df_=null_imp_df, feature_='EXT_SOURCE_2')

In [None]:
# Score every feature as
#     log(1e-10 + actual_importance / (1 + 75th percentile of null importances))
# once for gain and once for split importance.  The "1 +" keeps the
# denominator away from zero and the 1e-10 keeps log() away from log(0).
feature_scores = []
for _f in actual_imp_df['feature'].unique():
    f_null_imps_gain = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_gain'].values
    f_act_imps_gain = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_gain'].mean()
    gain_score = np.log(1e-10 + f_act_imps_gain / (1 + np.percentile(f_null_imps_gain, 75)))  # Avoid divide by zero
    f_null_imps_split = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_split'].values
    f_act_imps_split = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_split'].mean()
    split_score = np.log(1e-10 + f_act_imps_split / (1 + np.percentile(f_null_imps_split, 75)))  # Avoid divide by zero
    # Tuple layout: (feature, split_score, gain_score)
    feature_scores.append((_f, split_score, gain_score))

scores_df = pd.DataFrame(feature_scores, columns=['feature', 'split_score', 'gain_score'])

# Bar charts of the 100 highest-scoring features, split score on the left
# and gain score on the right.
plt.figure(figsize=(24, 24))
gs = gridspec.GridSpec(1, 2)
# Plot Split importances
ax = plt.subplot(gs[0, 0])
sns.barplot(x='split_score', y='feature', data=scores_df.sort_values('split_score', ascending=False).iloc[0:100], ax=ax)
ax.set_title('Feature scores wrt split importances', fontweight='bold', fontsize=12)
# Plot Gain importances
ax = plt.subplot(gs[0, 1])
sns.barplot(x='gain_score', y='feature', data=scores_df.sort_values('gain_score', ascending=False).iloc[0:100], ax=ax)
ax.set_title('Feature scores wrt gain importances', fontweight='bold', fontsize=12)
plt.tight_layout()

In [None]:
# Second scoring scheme: for each feature, the percentage of null-importance
# runs whose importance is strictly below the actual importance (0-100),
# computed separately for gain and for split importance.
correlation_scores = []
for _f in actual_imp_df['feature'].unique():
    f_null_imps = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_gain'].values
    f_act_imps = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_gain'].values
    gain_score = 100 * (f_null_imps < f_act_imps).sum() / f_null_imps.size
    f_null_imps = null_imp_df.loc[null_imp_df['feature'] == _f, 'importance_split'].values
    f_act_imps = actual_imp_df.loc[actual_imp_df['feature'] == _f, 'importance_split'].values
    split_score = 100 * (f_null_imps < f_act_imps).sum() / f_null_imps.size
    # Tuple layout: (feature, split_score, gain_score) -- later cells unpack
    # these tuples by position.
    correlation_scores.append((_f, split_score, gain_score))

corr_scores_df = pd.DataFrame(correlation_scores, columns=['feature', 'split_score', 'gain_score'])

# Bar charts of the 100 highest-scoring features under this scheme.
fig = plt.figure(figsize=(24, 24))
gs = gridspec.GridSpec(1, 2)
# Plot Split importances
ax = plt.subplot(gs[0, 0])
sns.barplot(x='split_score', y='feature', data=corr_scores_df.sort_values('split_score', ascending=False).iloc[0:100], ax=ax)
ax.set_title('Feature scores wrt split importances', fontweight='bold', fontsize=14)
# Plot Gain importances
ax = plt.subplot(gs[0, 1])
sns.barplot(x='gain_score', y='feature', data=corr_scores_df.sort_values('gain_score', ascending=False).iloc[0:100], ax=ax)
ax.set_title('Feature scores wrt gain importances', fontweight='bold', fontsize=14)
plt.tight_layout()
plt.suptitle("Features' split and gain scores", fontweight='bold', fontsize=16)
# Leave room at the top of the figure for the suptitle.
fig.subplots_adjust(top=0.93)

In [None]:
def score_feature_selection(df=None, train_features=None, cat_feats=None, target=None):
    """Cross-validate a LightGBM GBDT on a feature subset and report its AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least the ``train_features`` columns.
    train_features : list
        Names of the columns to train on.
    cat_feats : optional
        Currently unused; kept so existing calls stay valid.
    target : array-like
        Binary labels for the rows of ``df``.

    Returns
    -------
    tuple
        ``(mean, std)``: the last entries of the ``'auc-mean'`` and
        ``'auc-stdv'`` series returned by ``lgb.cv``.
    """
    # Fit LightGBM
    dtrain = lgb.Dataset(df[train_features], target, free_raw_data=False, silent=True)
    lgb_params = {
        'objective': 'binary',
        'boosting_type': 'gbdt',
        'learning_rate': .1,
        'num_leaves': 20,
        'colsample_bytree': 0.9497036,
        # 'subsample' used to be listed twice (0.8715623, then 0.8).  In a
        # dict literal the last entry wins, so 0.8 was the value actually
        # in effect; it is kept and the dead duplicate removed.
        'subsample': 0.8,
        'bagging_freq': 1,
        'max_depth': -1,
        'reg_alpha': 0.042,
        'reg_lambda': 0.0735,
        'min_split_gain': 0.0222415,
        'min_child_weight': 65,
        'seed': 13,
        'n_jobs': 4,
        'metric': 'auc'
    }

    # Fit the model
    hist = lgb.cv(
        params=lgb_params,
        train_set=dtrain,
        num_boost_round=2000,
        nfold=5,
        stratified=True,
        shuffle=True,
        early_stopping_rounds=50,
        verbose_eval=0,
        seed=17
    )
    # Return the last mean / std values
    return hist['auc-mean'][-1], hist['auc-stdv'][-1]

# features = [f for f in data.columns if f not in ['SK_ID_CURR', 'TARGET']]
# score_feature_selection(df=data[features], train_features=features, target=data['TARGET'])
# Sweep a minimum-score threshold over the percentile scores and compare the
# CV AUC obtained with the features selected by split score vs. gain score.
# correlation_scores holds (feature, split_score, gain_score) tuples.
for threshold in [0, 20, 40, 60 , 80 , 90, 95, 99]:
# for threshold in [0, 10, 20, 30 , 40, 50 ,60 , 70, 80 , 90, 95, 99]:
    # Features whose split score reaches the threshold (2nd tuple element).
    split_feats = [_f for _f, _score, _ in correlation_scores if _score >= threshold]
    print(len(split_feats))
#     split_cat_feats = [_f for _f, _score, _ in correlation_scores if (_score >= threshold) & (_f in categorical_feats)]
    # Features whose gain score reaches the threshold (3rd tuple element).
    gain_feats = [_f for _f, _, _score in correlation_scores if _score >= threshold]
    print(len(gain_feats))
#     gain_cat_feats = [_f for _f, _, _score in correlation_scores if (_score >= threshold) & (_f in categorical_feats)]
                                                                                             
    print('Results for threshold %3d' % threshold)
    split_results = score_feature_selection(df=data, train_features=split_feats, target=data['TARGET'])
    print('\t SPLIT : %.6f +/- %.6f' % (split_results[0], split_results[1]))
    gain_results = score_feature_selection(df=data, train_features=gain_feats, target=data['TARGET'])
    print('\t GAIN  : %.6f +/- %.6f' % (gain_results[0], gain_results[1]))

In [None]:
# Union of the 700 best features by split score and the 700 best by gain
# score (log-ratio scoring), in order of first appearance.
sc_split_feats = scores_df.sort_values(by='split_score', ascending=False).head(700)
sc_gain_feats = scores_df.sort_values(by='gain_score', ascending=False).head(700)
# Series.append was removed in pandas 2.0; pd.concat stacks the two Series
# in the same order.
sc_split_gain_feats = pd.concat([sc_split_feats['feature'], sc_gain_feats['feature']])
sc_feats = sc_split_gain_feats.unique().tolist()

In [None]:
# Union of the 890 best features by split score and the 995 best by gain
# score (percentile scoring), in order of first appearance.
corr_split_feats = corr_scores_df.sort_values(by='split_score', ascending=False).head(890)
corr_gain_feats = corr_scores_df.sort_values(by='gain_score', ascending=False).head(995)
# Series.append was removed in pandas 2.0; pd.concat stacks the two Series
# in the same order.
corr_split_gain_feats = pd.concat([corr_split_feats['feature'], corr_gain_feats['feature']])
corr_feats = corr_split_gain_feats.unique().tolist()

In [None]:
all_feats = corr_scores_df.feature.unique().tolist() ### scores_df can also be used to get all features
# Bare len(...) expressions only display a value when they are the last
# statement of a notebook cell; here they have no visible effect.
len(all_feats)
# Features kept by either scoring scheme (order is arbitrary: it comes from a set).
null_feats = list(set([*corr_feats,*sc_feats]))
len(null_feats)
# Features selected by neither scheme, sorted alphabetically.
useless = list(set(all_feats) - set(null_feats))
useless.sort()
# useless

# Using SHAP (SHapley Additive exPlanations)

In [None]:
### https://www.kaggle.com/alijs1/explaining-model-s-predictions
### https://www.kaggle.com/slundberg/interpreting-a-lightgbm-model
### https://www.kaggle.com/hmendonca/lightgbm-predictions-explained-with-shap-0-796

In [None]:
def train_lightgbm(df, num_boost_round=100, debug= False):
    """Train one LightGBM binary classifier on all labelled rows of ``df``.

    The rows with a non-null ``TARGET`` are copied, the three ``PCA_feats``
    column groups (a notebook global) are replaced by principal components,
    and a booster is trained on the resulting feature set.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set including a ``TARGET`` column.
    num_boost_round : int
        Number of boosting rounds passed to ``lgb.train``.
    debug : bool
        Currently unused; kept so existing calls stay valid.

    Returns
    -------
    tuple
        ``(clf, features, target)``: the trained ``lgb.Booster``, the feature
        frame it was trained on and the matching ``TARGET`` series.
    """
    # Keep only the labelled rows
    train_df = df[df['TARGET'].notnull()].copy()
    print("Starting LightGBM. Train shape: {}".format(train_df.shape))
    # Only releases this function's reference; the caller's frame is untouched.
    del df
    gc.collect()

    # Chain the three PCA replacements; feats_new from the last call lists
    # the final feature columns (IDs and TARGET excluded).
    train_x_pca, feats_new = PCA_Master_null(train_df, PCA_feats[0], 15, prefix = 'PCA_APP_DIM_')
    train_x_pca, feats_new = PCA_Master_null(train_x_pca, PCA_feats[1], 33, prefix = 'PCA_PREV_APPREF_')
    train_df, feats_new = PCA_Master_null(train_x_pca, PCA_feats[2], 55, prefix = 'PCA_CC_')

    params = {
        'objective':'binary',
        'metric':'auc',
        'nthread':16,
        'learning_rate':0.01,
        'num_leaves':36,
        'colsample_bytree':0.10442488,
        'subsample':0.9290019,
        'bagging_freq':1,
        'max_depth':8,
        'reg_alpha':4.99842044,
        'reg_lambda':1.60494325,
        'min_split_gain':0.0753679496,
        'min_child_weight':47.4521998,
        'scale_pos_weight': 2.398597,
        'verbose':500
    }

    train_x = lgb.Dataset(train_df[feats_new], train_df['TARGET'], silent=True)
    clf = lgb.train(params, train_x, num_boost_round)
    return clf, train_df[feats_new], train_df['TARGET']

In [None]:
# Train the model to be explained: clf is the Booster, train_df the feature
# frame it saw and train_y the matching TARGET series.
clf, train_df, train_y = train_lightgbm(df, num_boost_round = 7000)

In [None]:
import shap
# NOTE(review): presumably needed so the interactive force plots below can
# render inside the notebook -- confirm against the shap documentation.
shap.initjs()

In [None]:
### High-speed algorithm to compute SHAP values for LightGBM (and XGBoost and CatBoost)
# train_lightgbm() builds `clf` with lgb.train(), so it is already a Booster
# and has no `.booster_` attribute (only the scikit-learn wrappers such as
# LGBMClassifier expose one).  Fall back to the object itself in that case.
explainer = shap.TreeExplainer(getattr(clf, 'booster_', clf))
shap_values = explainer.shap_values(train_df)

In [None]:
# Positional lookup, so this is the same row as shap_values[254551, :] and
# train_df.iloc[254551, :] in the force plot; a label-based train_y[254551]
# only agrees with those when the index happens to be 0..n-1.
print('Truth:', train_y.iloc[254551])

In [None]:
# Visualize the explanation of one prediction: the row at position 254551
# (not the first row).  link='logit' is passed through to shap.force_plot.
shap.force_plot(explainer.expected_value, shap_values[254551,:], train_df.iloc[254551,:], link='logit')

In [None]:
# SHAP dependence plot for the NEW_EXT_SOURCES_MEAN feature, using the SHAP
# values and feature frame computed above.
shap.dependence_plot("NEW_EXT_SOURCES_MEAN", shap_values, train_df)

# Bayesian Optimization - parameter tuning

In [None]:
# https://www.kaggle.com/sz8416/simple-bayesian-optimization-for-lightgbm
# !pip3 install bayesian-optimization
from bayes_opt import BayesianOptimization

In [1]:
def bayes_parameter_opt_lgb(df, init_round=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=10000, learning_rate=0.1, output_process=False):
    """Tune LightGBM hyper-parameters by Bayesian optimisation of the CV AUC.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set; only rows with a non-null ``TARGET`` are used.
    init_round : int
        Number of random exploration points (``init_points``).
    opt_round : int
        Number of optimisation iterations (``n_iter``).
    n_folds : int
        Number of folds for ``lgb.cv``.
    random_seed : int
        Seed passed to ``lgb.cv``.
    n_estimators : int
        Upper bound on boosting iterations (``num_iterations``).
    learning_rate : float
        LightGBM learning rate.
    output_process : bool
        When truthy, the explored points are written to
        ``dir + "bayes_opt_result.csv"``.

    Returns
    -------
    dict
        The best parameter set found by the optimiser.
    """
    # prepare data
    df_train = df[df['TARGET'].notnull()]
    # Target
    y = df_train['TARGET'].copy()
    feats = [f for f in df_train.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index']]
    df_train = df_train[feats]

    # Replace the three PCA_feats groups (notebook global) with components,
    # printing the feature count after each step.
    train_x_pca, feats_new = PCA_Master_null(df_train, PCA_feats[0], 15, prefix = 'PCA_APP_DIM_')
    print(len(feats_new))
    train_x_pca, feats_new = PCA_Master_null(train_x_pca, PCA_feats[1], 33, prefix = 'PCA_PREV_APPREF_')
    print(len(feats_new))
    df_train, feats_new = PCA_Master_null(train_x_pca, PCA_feats[2], 55, prefix = 'PCA_CC_')
    print(len(feats_new))

    # NOTE: restricting to the null-importance selection would go here:
    #     df_train = df_train[null_feats]

    train_data = lgb.Dataset(data=df_train, label = y, free_raw_data=False)

    # Objective: best mean CV AUC for one candidate parameter set.
    def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2,
                 min_split_gain, min_child_weight,scale_pos_weight,min_data_in_leaf):
        params = {'application':'binary','num_iterations': n_estimators,'learning_rate':learning_rate,
                  'early_stopping_round':150, 'metric':'auc','n_jobs':16}

        # The optimiser proposes floats: round the integer-valued parameters
        # and clamp fractions / penalties into their valid ranges.
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        params['scale_pos_weight'] = scale_pos_weight
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))

        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True,
                           verbose_eval = 500, metrics=['auc'])
        return max(cv_result['auc-mean'])

    # range
    lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (20, 35),
                                            'feature_fraction': (0.1, 0.3),
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (4, 9),
                                            'lambda_l1': (0, 5),
                                            'lambda_l2': (0, 3),
                                            'min_split_gain': (0.01, 0.1),
                                            'scale_pos_weight': (2, 4),
                                            'min_data_in_leaf': (20, 500),
                                            'min_child_weight': (20, 50)}, random_state=0)
    # optimize
    lgbBO.maximize(init_points=init_round, n_iter=opt_round)

    # output optimization process
    if output_process:
        lgbBO.points_to_csv(dir+"bayes_opt_result.csv")

    # return best parameters
    # Newer bayes_opt releases expose the best point as
    # ``optimizer.max == {'target': ..., 'params': ...}``; older ones only
    # have the ``res['max']['max_params']`` layout used originally.
    best = getattr(lgbBO, 'max', None)
    if isinstance(best, dict) and 'params' in best:
        return best['params']
    return lgbBO.res['max']['max_params']

In [None]:
# Run the search with 10 random starting points followed by 15 optimisation
# steps, at a learning rate of 0.02, and show the best parameters found.
opt_params = bayes_parameter_opt_lgb(df, init_round=10, opt_round=15, n_folds=5, random_seed=123456, n_estimators=10000, learning_rate=0.02)
print(opt_params)


# Ensemble Voting on OOF predictions

In [3]:
import glob
import pandas as pd

import warnings
# Silence the "numpy.dtype / numpy.ufunc size changed" warnings; the filters
# are registered before numpy is imported.
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")
import numpy as np

# Base directory used as a string prefix for input and output files.
# NOTE(review): the name shadows the builtin dir(); other cells depend on
# this name, so it is left unchanged here.
dir = '../Documents/JK/Home_Credit_Default_Risk/'

In [None]:
# USING Voting METHOD
# Load every out-of-fold prediction file into a dict keyed by model name.
data = {}

# NOTE(review): the pattern contains no "**", so recursive=True should have
# no effect -- confirm against the glob documentation.
for path in glob.glob("../Documents/JK/Home_Credit_Default_Risk/Oof_files/*.csv", recursive=True):
    # Key = path without its first 65 characters and without the 4-character
    # ".csv" suffix.  NOTE(review): the offset is longer than the directory
    # part, so it presumably also strips a common file-name prefix; it is
    # tied to this exact directory and naming scheme -- confirm.
    data[path[65:-4]] = pd.read_csv(path, header=None)

# Empty frame with one column per model, joined onto the id/target table so
# the model columns exist before they are filled in below.
oof_preds = pd.DataFrame(columns=data.keys())

id_tgt = pd.read_csv('../Documents/JK/Home_Credit_Default_Risk/Oof_files/ID_TGT/id_target.csv', index_col=0).reset_index(drop=True)
df = id_tgt.join(oof_preds)

# NOTE(review): each data[key] is a DataFrame read with header=None;
# assigning it to a single column assumes the file holds exactly one column.
for key in data.keys():
    df[key] = data[key]

df.head()

In [None]:
# Treat exact zeros in the 'GP' column as missing values.
df['GP'] = df['GP'].replace(0.0, np.nan)
# Row-wise mean over the columns from position 2 onward.
# NOTE(review): assumes the first two columns are the ID and TARGET
# columns and the rest are model predictions -- confirm.
m = df.iloc[:, 2:].mean(axis=1)
# Fill the remaining NaNs of every column (all columns are visited, not only
# the model columns) with that row's mean.
for i, col in enumerate(df):
    df.iloc[:, i] = df.iloc[:, i].fillna(m)
    
##df.loc[df.SK_ID_CURR.isin(['141289','144669','196708','319880'])]

In [None]:
# Load every test-set prediction file into a dict keyed by model name.
data_sub = {}

for path in glob.glob("../Documents/JK/Home_Credit_Default_Risk/Test/*.csv", recursive=True):
    # Same fixed-offset key extraction as for the OOF files.
    # NOTE(review): the keys must come out identical to (and in the same
    # order as) the OOF keys for the fitted ensemble to see matching
    # features -- confirm, since glob order is not guaranteed.
    data_sub[path[65:-4]] = pd.read_csv(path, header=0)

sub_preds = pd.DataFrame(columns=data_sub.keys())

# The sample submission only supplies the SK_ID_CURR column here.
id_test = pd.read_csv('../Documents/JK/Home_Credit_Default_Risk/Test/sample/sample_submission.csv')
id_test.drop('TARGET',axis=1,inplace=True)
df_test = id_test.join(sub_preds)

# One column per model, taken from that submission's TARGET column.
for key in data_sub.keys():
    df_test[key] = data_sub[key].TARGET
    
df_test.head()

In [None]:
# Pairwise correlation between the columns of df_test (the ID column and
# each model's test predictions).
df_test.corr()

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Candidate second-level classifiers trained on the models' predictions.
clf1 = LogisticRegression(random_state=1)
# NOTE(review): clf2 is created but not referenced by the voting ensemble
# or the `models` list in the cells below.
clf2 = RandomForestClassifier(n_estimators = 500, random_state=1, warm_start = True)
clf3 = GaussianNB()
clf4 = KNeighborsClassifier(n_neighbors = 1000)
clf5 = AdaBoostClassifier()

In [None]:
# Second-level training data: every column except the ID and the label
# (i.e. the per-model prediction columns) as features, TARGET as label.
X = df.drop(['TARGET','SK_ID_CURR'], axis=1)
y = df['TARGET']

In [None]:
# Soft-voting ensemble of the second-level classifiers (default, i.e. equal,
# weights).
eclf = VotingClassifier(estimators=[('lr', clf1), ('gnb', clf3), ('knn', clf4),
                                    ('ada', clf5)],
                        voting='soft')
eclf = eclf.fit(X, y)
# Probability of the positive class.  (The former np.zeros pre-allocation
# was overwritten immediately and has been dropped.)
eclf_preds = eclf.predict_proba(X)[:,1]

# NOTE: this is scored on the very rows the ensemble was fitted on, so it is
# an in-sample figure.
print('Full AUC score %.6f' % roc_auc_score(y, eclf_preds))

In [None]:
# 5-fold cross-validated ROC AUC of each second-level classifier on its own.
models = [clf1, clf3, clf4, clf5]

from sklearn.model_selection import cross_val_score

for model in models:
    scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc')
    # The first five characters of the estimator's repr serve as a short label.
    print(str(model)[:5], scores.mean(), scores.std())

In [None]:
# Test-set features: the per-model prediction columns.
# NOTE(review): the columns must match X (same models, same order) for the
# fitted ensemble to be applied correctly -- confirm.
X_test = df_test.drop(['SK_ID_CURR'], axis=1)
# Probability of the positive class.  (The former np.zeros pre-allocation
# was overwritten immediately and has been dropped.)
eclf_sub_preds = eclf.predict_proba(X_test)[:,1]

In [None]:
# Write the ensemble's test predictions into the sample-submission layout.
# NOTE(review): predictions are assigned by position, so this assumes the
# file has the same row order as the sample submission used to build
# df_test -- confirm.
Submission=pd.read_csv(dir+"sample_submission.csv")
Submission['TARGET']=eclf_sub_preds.copy()
Submission.to_csv(dir+"submission_Voting.csv", index= False)
Submission.head()

# Blending final Submissions

In [None]:
# USING WEIGHTED AVERAGE RANK METHOD
import os

data = {}

for path in glob.glob(dir+"Blend/*.csv", recursive=True):
    # Key each submission by its file name without directory or extension.
    # This gives the same keys as the former fixed offset path[47:-4], but
    # keeps working if `dir` changes length.
    # NOTE: the final blend is later written into this same folder, so a
    # re-run of this cell will pick that file up as well.
    data[os.path.splitext(os.path.basename(path))[0]] = pd.read_csv(path)

ranks = pd.DataFrame(columns=data.keys())

In [None]:
# Replace each submission's predictions by their rank (ties get the lowest
# rank of the group).
for key in data.keys():
    ranks[key] = data[key].TARGET.rank(method='min')
# Mean rank across submissions, then min-max scaled to [0, 1].
ranks['Average'] = ranks.mean(axis=1)
ranks['Scaled Rank'] = (ranks['Average'] - ranks['Average'].min()) / (ranks['Average'].max() - ranks['Average'].min())
# First row of the correlation matrix only.
ranks.corr()[:1]

In [None]:
# Blend weights, applied in the column order used when computing 'Score'.
weights = [0.7,0.3]

In [None]:
# Weighted sum of the two submissions' ranks, divided by the number of rows
# to bring the result back to a (0, 1] scale.
ranks['Score'] = ranks[['kaggle_rankavg_LB_top_805','submission_rankavg_blend_voting']].mul(weights).sum(1) / ranks.shape[0]


In [None]:
# Write the blended score into the sample-submission layout.
# NOTE(review): the assignment aligns on the default integer index, so it
# assumes the blended files share the sample submission's row order -- confirm.
submission_lb = pd.read_csv(dir+"sample_submission.csv")
submission_lb['TARGET'] = ranks['Score']
submission_lb.to_csv(dir+"Blend/Blend_of_final.csv", index=None)
submission_lb.head()

# RANK AVERAGING - ENSEMBLE GUIDE

In [None]:
## https://github.com/MLWave/Kaggle-Ensemble-Guide

## EXT_SOURCE imputation - continuous values

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# LightGBM GBDT with KFold or Stratified KFold
def kfold_lightgbm(df, Ext_col, PCA_feats, null_feats, stratified = False, debug= False):
    """Predict the missing values of ``Ext_col`` with a LightGBM regressor.

    Rows where ``Ext_col`` is present are split 80/20 into a training and a
    validation set (a single hold-out split: despite the name, no K-fold
    loop is run).  The model is trained with early stopping on the
    validation set, the validation RMSE is printed, and predictions for the
    rows where ``Ext_col`` is missing are saved to ``"<Ext_col>.csv"`` in
    the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Full data set containing ``Ext_col``.
    Ext_col : str
        Name of the column to impute; used as the regression target.
    PCA_feats : sequence
        Three column groups, each replaced by principal components via
        ``PCA_Master``.
    null_feats : list
        Currently unused (the feature restriction is disabled).
    stratified, debug : bool
        Currently unused; kept so existing calls stay valid.

    Returns
    -------
    pandas.DataFrame
        Always an empty frame: feature importances are not collected.
    """
    # Divide in training/validation and test data
    train_df = df[df[Ext_col].notnull()]
    test_df = df[df[Ext_col].isnull()].copy()
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(train_df.shape, test_df.shape))
    # Only releases this function's reference; the caller's frame is untouched.
    del df
    gc.collect()

    # Create arrays and dataframes to store results
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    feats = [f for f in train_df.columns if f not in ['TARGET','SK_ID_CURR','SK_ID_BUREAU','SK_ID_PREV','index',Ext_col]]

    train_x, valid_x, train_y, valid_y = train_test_split(train_df[feats], train_df[Ext_col], test_size=0.2, random_state=420)

    print(train_x.shape, valid_x.shape, train_y.shape,valid_y.shape)
    ############# PCA operations ###################################
    # Each call replaces one column group in the train / validation / test
    # frames; the feature count is printed after every step.
    print (len(feats))
    train_x_pca, valid_x_pca, test_x_pca, feats_new = PCA_Master(train_x, valid_x, test_df, PCA_feats[0], 15, prefix = 'PCA_APP_DIM_')
    print (len(feats_new))
    train_x_pca, valid_x_pca, test_x_pca, feats_new = PCA_Master(train_x_pca,valid_x_pca,test_x_pca,PCA_feats[1], 35, prefix = 'PCA_PREV_APPREF_')
    print (len(feats_new))
    train_x_pca, valid_x_pca, test_x_pca, feats_new = PCA_Master(train_x_pca,valid_x_pca,test_x_pca,PCA_feats[2], 55, prefix = 'PCA_CC_')
    print (len(feats_new))
    #################################################################
    # NOTE: restricting all three frames to `null_feats` would go here; it
    # is currently disabled, so every feature in feats_new is used.
    print(train_x_pca.shape)

    # create dataset for lightgbm
    lgb_train = lgb.Dataset(train_x_pca, train_y)
    lgb_eval = lgb.Dataset(valid_x_pca, valid_y, reference=lgb_train)

    # specify your configurations as a dict
    params = {'task': 'train',
                  'boosting_type': 'gbdt',
                  'objective': 'regression',
                  'metric': {'rmse','l2','l1'},
                  'num_leaves': 25,
                  'learning_rate': 0.01,
                  'feature_fraction': 0.11384,
                  'bagging_fraction': 0.893746,
                  'bagging_freq': 1,
                  'reg_alpha': 4.596,
                  'reg_lambda': 2.836,
                  'silent': -1,
                  'verbose': -1
              }

    print('Start training...')
    # train
    gbm = lgb.train(params,
                lgb_train,
                num_boost_round=15000,
                valid_sets=lgb_eval,
                early_stopping_rounds=300,
                verbose_eval=1000)

    print('Start predicting...')
    # predict (validation rows, then the rows whose Ext_col is missing)
    oof_preds = gbm.predict(valid_x_pca, num_iteration=gbm.best_iteration)
    sub_preds += gbm.predict(test_x_pca[feats_new], num_iteration=gbm.best_iteration)

    # eval
    print('The rmse of prediction is: ', mean_squared_error(valid_y, oof_preds) ** 0.5)

    del gbm, train_x, train_y, valid_x, valid_y
    gc.collect()

    # One prediction per line, in the row order of test_df.
    np.savetxt(Ext_col+".csv", sub_preds, delimiter=",")

    return feature_importance_df

In [None]:
#Imputation
# Writes the predictions for the missing EXT_SOURCE_3 values to
# "EXT_SOURCE_3.csv"; the returned frame is always empty.
feat_importance3 = kfold_lightgbm(df,'EXT_SOURCE_3', PCA_feats, null_feats)

### Holdout -- First attempt -- not properly tested

In [None]:
# train = df[df['TARGET'].notnull()]
# # train = train[:10000]
# test = df[df['TARGET'].isnull()].copy()
# test.drop('TARGET', axis=1, inplace=True)

# from sklearn.cross_validation import KFold

# # Some useful parameters which will come in handy later on
# ntrain = train.shape[0]
# ntest = test.shape[0]
# SEED = 0 # for reproducibility
# NFOLDS = 5 # set folds for out-of-fold prediction
# kf = KFold(ntrain, n_folds = NFOLDS, random_state=SEED)

# # Class to extend the Sklearn classifier
# class SklearnHelper(object):
#     def __init__(self, clf, seed=0, params=None):
#         params['random_state'] = seed
#         self.clf = clf(**params)

#     def train(self, x_train, y_train):
#         self.clf.fit(x_train, y_train)

#     def predict(self, x):
#         return self.clf.predict(x)
    
#     def fit(self,x,y):
#         return self.clf.fit(x,y)
    
#     def feature_importances(self,x,y):
#         print(self.clf.fit(x,y).feature_importances_)
    
# # Class to extend LightGBM classifier

In [None]:
# def get_oof(clf, x_train, y_train, x_test):
#     oof_train = np.zeros((ntrain,))
#     oof_test = np.zeros((ntest,))
#     oof_test_skf = np.empty((NFOLDS, ntest))
    
#     print (len(oof_train), len(oof_test), np.shape(oof_test_skf))
#     print (x_train.shape, y_train.shape, x_test.shape)

#     for i, (train_index, test_index) in enumerate(kf):
#         x_tr = x_train[train_index]
#         y_tr = y_train[train_index]
#         x_te = x_train[test_index]

#         print (x_tr.shape, y_tr.shape, x_te.shape) 
        
#         clf.train(x_tr, y_tr)
        
#         oof_train[test_index] = clf.predict(x_te)
        
#         print (oof_train.shape) 
        
#         oof_test_skf[i, :] = clf.predict(x_test)

#     oof_test[:] = oof_test_skf.mean(axis=0)
#     return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [None]:
# # Put in our parameters for said classifiers
# # Random Forest parameters
# lgb_params = {'objective' : 'binary',
#           'boosting_type': 'gbdt',
#           'metric' : 'auc',
#           'nthread' : 4,
#           'shrinkage_rate':0.1,
#           'max_depth':8,
#           'min_child_weight':65,
#           'bagging_fraction':0.9497036,
#           'feature_fraction':0.8715623,
#           'bagging_freq' : 1,
# #           'max_bin':50,
#           'lambda_l1':0.041545473,
#           'lambda_l2':0.0735294,
#           'num_leaves':20,
# #           'min_data_in_leaf':50,
#           'min_gain_to_split':0.0222415}

In [None]:
# lgb = SklearnHelper(clf=LGBMClassifier, seed=SEED, params=lgb_params)

In [None]:
# # # Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
# y_train = train['TARGET'].ravel()
# train = train.drop(['TARGET'], axis=1)
# x_train = train.values # Creates an array of the train data
# x_test = test.values # Creates an array of the test data

# print (train.shape, y_train.shape)
# print (len(x_train), len(x_test))

In [None]:
# lgb_oof_train, lgb_oof_test = get_oof(lgb,x_train, y_train, x_test) # LightGBM

In [None]:
# base_predictions_train = pd.DataFrame( {'LightGBM': lgb_oof_train.ravel()})
# base_predictions_train.head()

In [None]:
# x_train = np.concatenate(( lgb_oof_train), axis=1)
# x_test = np.concatenate(( lgb_oof_test), axis=1)

### Tune and compare XGB, LightGBM, RF with Hyperopt

In [None]:
# https://www.kaggle.com/eikedehling/tune-and-compare-xgb-lightgbm-rf-with-hyperopt