# feature selection 
reference : https://www.kaggle.com/willkoehrsen/introduction-to-feature-selection

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier

import time
import gc
from tqdm import tqdm

In [2]:
# Start a wall-clock timer; `st` is read again near the end of the notebook
# to report the total elapsed time.
st = time.time()
# Load the full feature matrix (the recorded run of this cell took ~125 s).
df = pd.read_csv('./new_feature_by_kageyama_feature_matrix.csv')
# Seconds spent loading; displayed as the cell output.
time.time() - st

124.8741147518158

# drop IDs


In [3]:
# Every column whose name mentions an SK_ID key; each one is echoed so the
# list can be inspected in the cell output.
sk_id_matches = [name for name in df.columns if 'SK_ID' in name]
for name in sk_id_matches:
    print(name)

# SK_ID_CURR itself is kept; every other column derived from an ID is dropped.
id_columns = [name for name in sk_id_matches if name != 'SK_ID_CURR']
df = df.drop(id_columns, axis=1)

SK_ID_CURR
STD(installments.SK_ID_CURR)
MAX(installments.SK_ID_CURR)
SKEW(installments.SK_ID_CURR)
MIN(installments.SK_ID_CURR)
MEAN(installments.SK_ID_CURR)
STD(cash.SK_ID_CURR)
MAX(cash.SK_ID_CURR)
SKEW(cash.SK_ID_CURR)
MIN(cash.SK_ID_CURR)
MEAN(cash.SK_ID_CURR)
STD(credit.SK_ID_CURR)
MAX(credit.SK_ID_CURR)
SKEW(credit.SK_ID_CURR)
MIN(credit.SK_ID_CURR)
MEAN(credit.SK_ID_CURR)
STD(previous.MAX(installments.SK_ID_CURR))
STD(previous.SKEW(installments.SK_ID_CURR))
STD(previous.MIN(installments.SK_ID_CURR))
STD(previous.MEAN(installments.SK_ID_CURR))
STD(previous.MAX(cash.SK_ID_CURR))
STD(previous.SKEW(cash.SK_ID_CURR))
STD(previous.MIN(cash.SK_ID_CURR))
STD(previous.MEAN(cash.SK_ID_CURR))
STD(previous.MAX(credit.SK_ID_CURR))
STD(previous.SKEW(credit.SK_ID_CURR))
STD(previous.MIN(credit.SK_ID_CURR))
STD(previous.MEAN(credit.SK_ID_CURR))
MAX(previous.STD(installments.SK_ID_CURR))
MAX(previous.SKEW(installments.SK_ID_CURR))
MAX(previous.MIN(installments.SK_ID_CURR))
MAX(previous.MEAN(inst

In [4]:
# Checkpoint the matrix with the derived ID columns removed (no index column is written).
df.to_csv('new_feature_by_kageyama_feature_matrix_noIDs.csv', index = False)

# Remove missing values

In [5]:
# Columns whose fraction of missing values exceeds this (in either split) are dropped.
MISSING_THRESHOLD = 0.75

train_df = df.loc[df['set'] == 'train'].drop('set', axis=1)
test_df = df.loc[df['set'] == 'test'].drop(['set', 'TARGET'], axis=1)

print('Training set full shape: ', train_df.shape)
print('Testing set full shape: ' , test_df.shape)

# Fraction (0-1) of missing values per column, computed separately per split
train_missing = (train_df.isnull().sum() / len(train_df)).sort_values(ascending=False)
test_missing = (test_df.isnull().sum() / len(test_df)).sort_values(ascending=False)

# Column names above the threshold in each split
train_missing = train_missing.index[train_missing > MISSING_THRESHOLD]
test_missing = test_missing.index[test_missing > MISSING_THRESHOLD]

# Union of both splits; sorted so the list is deterministic between runs
all_missing = sorted(set(train_missing) | set(test_missing))
print('There are %d columns with more than %d%% missing values'
      % (len(all_missing), round(MISSING_THRESHOLD * 100)))

# Keep a separate handle on the training labels
train_labels = train_df["TARGET"]

train_df = train_df.drop(columns=all_missing)
test_df = test_df.drop(columns=all_missing)

print('Training set full shape: ', train_df.shape)
print('Testing set full shape: ' , test_df.shape)

Training set full shape:  (307507, 1352)
Testing set full shape:  (48744, 1351)
There are 266 columns with more than 75% missing values
Training set full shape:  (307507, 1086)
Testing set full shape:  (48744, 1085)


In [6]:
# Apply the same high-missing-value column removal to the combined frame and
# checkpoint it; this file is read back in the last cell of the notebook.
# Note: rebinding `df` makes this cell non-idempotent (a second run would try
# to drop columns that are already gone).
df = df.drop(all_missing, axis = 1)
df.to_csv('new_feature_by_kageyama_feature_matrix_noIDs_delmissing.csv', index = False)

In [7]:
# Remove SK_ID_CURR from the in-memory frame only; the CSV written in the
# previous cell still contains it.
# NOTE(review): after this drop nothing ties rows back to applicants, so later
# steps presumably rely on row order being preserved — confirm.
df = df.drop('SK_ID_CURR', axis = 1)

# Label Encoding

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encode on the combined frame so train and test rows share one code table per
# column. (No train/test split is built before this point: it would be
# overwritten by the split below and only cost time and memory.)
le = LabelEncoder()
categorical_feats = [f for f in df.columns if df[f].dtype == 'object']
for col in tqdm(categorical_feats):
    if col == 'set':
        # 'set' is the train/test marker; it must stay readable for the split below
        continue
    # Cast to str first so the column has a single type; missing values become
    # the literal string 'nan' and therefore get their own code.
    df[col] = le.fit_transform(df[col].astype('str'))

# Split once, after encoding. Train keeps TARGET; test drops it.
train_df = df.loc[df['set'] == 'train'].drop('set', axis=1)
test_df = df.loc[df['set'] == 'test'].drop(['set', 'TARGET'], axis=1)

# The combined frame is no longer needed; release it before training
del df
gc.collect()

100%|██████████| 39/39 [00:42<00:00,  1.08s/it]


36

In [9]:
# Sanity check after encoding: train still carries TARGET, so it has one more
# column than test.
print('Training shape: ', train_df.shape)
print('Testing shape: ', test_df.shape)

Training shape:  (307507, 1085)
Testing shape:  (48744, 1084)


# make prediction

In [None]:
# Number of cross-validation folds; test predictions are averaged over this many models.
N_FOLDS = 25

# Out-of-fold predictions for the training rows and averaged predictions for the test rows
oof_preds = np.zeros(train_df.shape[0])
sub_preds = np.zeros(test_df.shape[0])

y = train_df['TARGET'].copy()
X = train_df.drop('TARGET', axis=1)

feats = list(X.columns)

# One importance frame per fold, concatenated once after the loop
# (instead of growing a DataFrame with pd.concat on every iteration).
fold_importances = []

folds = KFold(n_splits=N_FOLDS, shuffle=True, random_state=2018)
for n_fold, (trn_idx, val_idx) in enumerate(folds.split(X)):
    train_X, train_y = X.iloc[trn_idx], y.iloc[trn_idx]
    valid_X, valid_y = X.iloc[val_idx], y.iloc[val_idx]

    clf = LGBMClassifier(
        n_estimators=4000,
        learning_rate=0.02,
        num_leaves=32,
        colsample_bytree=.8,
        subsample=.87,
        max_depth=8,
        reg_alpha=.0415,
        reg_lambda=.0735,
        min_split_gain=.02,
        min_child_weight=40,
        silent=-1,
        verbose=-1,
    )

    # n_estimators is only an upper bound: training stops once validation AUC
    # has not improved for 100 rounds.
    clf.fit(train_X, train_y,
            eval_set=[(train_X, train_y), (valid_X, valid_y)],
            eval_metric='auc', verbose=100, early_stopping_rounds=100  #30
           )

    # Predict with the best iteration found by early stopping
    oof_preds[val_idx] = clf.predict_proba(valid_X, num_iteration=clf.best_iteration_)[:, 1]
    sub_preds += clf.predict_proba(test_df, num_iteration=clf.best_iteration_)[:, 1] / folds.n_splits

    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": clf.feature_importances_,
        "fold": n_fold + 1,
    }))

    print('Fold %2d AUC : %.6f' % (n_fold + 1, roc_auc_score(valid_y, oof_preds[val_idx])))
    # Free the per-fold objects before the next model is trained
    del clf, train_X, train_y, valid_X, valid_y
    gc.collect()

# Long-format table: one row per (feature, fold)
feature_importance_df = pd.concat(fold_importances, axis=0)

print('Full AUC score %.6f' % roc_auc_score(y, oof_preds))

# NOTE(review): predictions are assigned positionally; this presumably relies on
# test_df rows being in the same order as sample_submission.csv — confirm.
sub = pd.read_csv('../input/sample_submission.csv')
sub['TARGET'] = sub_preds



In [None]:
# Write the submission without the index column (explicit boolean, matching
# the other to_csv calls in this notebook).
sub.to_csv('./kageyama_feature_matrix_25cv_tuned_by_Ivan_noIDs_delmissing_sub.csv', index=False)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

def display_importances(feature_importance_df_, top_n=50):
    """Plot the features with the highest mean importance across folds.

    Parameters
    ----------
    feature_importance_df_ : DataFrame
        Long-format table with at least the columns "feature" and
        "importance" (one row per feature per fold).
    top_n : int, default 50
        Number of features to show.

    Bars are drawn in descending order of mean importance. The order is passed
    to seaborn explicitly; without it the bar order would follow the first
    appearance of each feature in the data rather than its mean.
    """
    mean_importance = (
        feature_importance_df_[["feature", "importance"]]
        .groupby("feature")
        .mean()
        .sort_values(by="importance", ascending=False)
    )
    cols = mean_importance[:top_n].index

    # Keep every per-fold row of the selected features so seaborn can draw the
    # mean together with its error bar.
    best_features = feature_importance_df_.loc[feature_importance_df_.feature.isin(cols)]

    plt.figure(figsize=(8, 10))
    sns.barplot(x="importance", y="feature", data=best_features, order=list(cols))
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()


In [None]:
# Plot the per-fold importances collected during cross-validation.
display_importances(feature_importance_df)

In [None]:
# Seconds elapsed since `st` was set in the data-loading cell.
time.time() - st

In [None]:
# Average each feature's importance over the folds; the result is indexed by feature name.
feature_importances = feature_importance_df[["feature", "importance"]].groupby("feature").mean()

In [None]:
def norm_feature_importances(df, threshold = 0.95):
    """Normalise feature importances and report how many reach `threshold`.

    Parameters
    ----------
    df : DataFrame
        Must contain an 'importance' column. The index is moved into a regular
        column by `reset_index()`.
    threshold : float, default 0.95
        Cumulative share of total importance to report on.

    Returns
    -------
    DataFrame
        A copy sorted by descending importance with two added columns:
        'importance_normalized' (share of the total) and
        'cumulative_importance' (running sum of that share).

    Prints the number of top features needed for the cumulative importance to
    exceed `threshold`. If it is never exceeded, all features are reported as
    required instead of raising.
    """
    # Sort features according to importance
    df = df.sort_values('importance', ascending = False).reset_index()

    # Normalize the feature importances to add up to one
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    df['cumulative_importance'] = df['importance_normalized'].cumsum()

    # Positions where the running total is past the threshold. This is empty
    # when the threshold is never exceeded (e.g. threshold >= 1 or all-zero
    # importances), which previously made np.min fail on an empty array.
    exceeds = np.where((df['cumulative_importance'] > threshold).values)[0]
    n_required = int(exceeds[0]) + 1 if exceeds.size else len(df)
    print('%d features required for %0.2f of cumulative importance' % (n_required, threshold))

    return df

In [None]:
norm_feature_importances = plot_feature_importances(feature_importances)

In [None]:
# Threshold for cumulative importance
threshold = 0.95

# Extract the features to keep
unimportant_features = list(norm_feature_importances[norm_feature_importances['cumulative_importance'] >= threshold]['feature'])

In [None]:
# Reload the checkpoint written after the missing-value filter (the in-memory
# combined frame was deleted before training), remove the low-importance
# features and save the reduced matrix without an index column.
df = pd.read_csv('./new_feature_by_kageyama_feature_matrix_noIDs_delmissing.csv')
df = df.drop(unimportant_features, axis=1)
df.to_csv('./new_feature_by_kageyama_feature_matrix_noIDs_delmissing_delunimportant.csv', index=False)