In [1]:
import pandas as pd
import numpy as np
import sklearn as skl
import os 
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.decomposition import PCA
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
import plotly.graph_objs as go
import plotly.offline as py
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
import category_encoders as ce
from bayes_opt import BayesianOptimization
import featuretools as ft

# importing feature matrix from deep feature synthesis analysis

The feature matrix file comes from https://www.kaggle.com/willkoehrsen/automated-feature-engineering-basics/notebook
calculations to compute such a file using featuretools is very time consuming so I have directly downloaded it from the kernel


feature_matrix = pd.read_csv('feature_matrix_spec.csv')
test = feature_matrix[feature_matrix['set']== 'test']
train = feature_matrix[feature_matrix['set']=='train']

In [2]:
train_set = pd.read_csv('csv_files/feature_matrix_spec_train.csv', nrows=200000)

In [3]:
test_set = pd.read_csv('csv_files/feature_matrix_spec_test.csv')

train_set, test_set = skl.model_selection.train_test_split(train, test_size = 0.33)

# Feature selection

# Shaping the data
Useful functions

In [4]:
def missing_data(data) :
    """compute the number and the percentage of missing values per columns"""
    nb_missing_values = data.isnull().sum().sort_values(ascending = False) #count the number of missing values
    percent = (data.isnull().sum()/data.isnull().count()*100).sort_values(ascending = False) #percentage of missing values
    return pd.concat([nb_missing_values, percent], axis=1, keys=['total' , 'Percent'])

In [5]:
def fill_nan_mean(data,stat_data) :
    """Replace missing values by the mean of the column
    input : dataset to fill and informations about the missing values of the dataset from the missing_data funciton""" 
    
    mod_col = 0

    for row in range(stat_data.shape[0]):
        if stat_data.iloc[row,0] != 0:
            t = data[stat_data.index[row]].dtype#get the type of data
            mean = data[stat_data.index[row]].mean() #the mean value of the column
            mean.astype(t)
            data[stat_data.index[row]].fillna(mean, inplace=True)
            mod_col += 1
    print(str(mod_col)+' columns have been modified with a mean value instead of missing ones')

## Split features and targets set

In [6]:
train_target = train_set[['TARGET','SK_ID_CURR']]
train_set = train_set.drop(columns = ['set', 'TARGET', 'SK_ID_CURR'],axis=1)

In [7]:
test_set_features = test_set.drop(['SK_ID_CURR','set'],axis = 1)

test_target = test_set[['TARGET','SK_ID_CURR']]
test_set = test_set.drop(columns = ['set','SK_ID_CURR','TARGET'],axis=1)

## Shaping training and test set

Here we transform every categorical feature into dummy variables 
Then we fill missing values with the mean value of the column

In [None]:
train_set = pd.get_dummies(train_set)
stat_missing_values = missing_data(train_set)
fill_nan_mean(train_set, stat_missing_values)

In [9]:
test_set_features = pd.get_dummies(test_set_features)
stat_missing_values = missing_data(test_set_features)
fill_nan_mean(test_set_features, stat_missing_values)

789 columns have been modified with a mean value instead of missing ones


## Adjusting features in train and test set

We need to have the same features in the test and train set for the gradient boosting method to work properly
Here we look for features which are in a set and not in the other to remove them.

In [10]:
def missing_features(data1,data2):
    """spot features (columns) present in data1 and not in data2"""
    diff_features = []
    
    for col in data1:
        present=0
        for col2 in data2:
            if col == col2:
                present=1
        if present == 0 :
            diff_features.append(col)
    return(diff_features)

In [11]:
#spot columns present in training set and not in test set

diff_features_train = missing_features(train_set,test_set_features)
print(diff_features_train)

#remove missing columns
train_set = train_set.drop(diff_features_train, axis=1)

['NAME_INCOME_TYPE_Maternity leave', 'CODE_GENDER_XNA', 'NAME_FAMILY_STATUS_Unknown', 'MODE(bureau.CREDIT_CURRENCY)_currency 3', 'MODE(previous_app.NAME_CONTRACT_TYPE)_XNA', 'MODE(bureau.CREDIT_TYPE)_Loan for the purchase of equipment', 'MODE(bureau.CREDIT_TYPE)_Real estate loan', 'MODE(bureau_balance.STATUS)_3', 'MODE(previous_app.NAME_CASH_LOAN_PURPOSE)_Buying a garage', 'MODE(previous_app.NAME_CASH_LOAN_PURPOSE)_Money for a third person', 'MODE(previous_app.NAME_CASH_LOAN_PURPOSE)_Refusal to name the goal', 'MODE(previous_app.MODE(cash.NAME_CONTRACT_STATUS))_Approved', 'MODE(bureau.MODE(bureau_balance.STATUS))_2', 'MODE(bureau.MODE(bureau_balance.STATUS))_3']


In [12]:
#spot columns present in test set and not in training set

diff_features_test = missing_features(test_set_features,train_set)
print(diff_features_test)

#remove missing columns
test_set_features = test_set_features.drop(diff_features_test, axis = 1)

['TARGET', 'MODE(bureau.CREDIT_ACTIVE)_Bad debt', 'MODE(bureau.CREDIT_TYPE)_Cash loan (non-earmarked)']


# LGBM

First we look for optimal paramters for our lgb method using bayesian optimisation
Then we train two models : one with all features and a second with only half of the features which are the most important according to the first model

In [13]:
#from https://www.kaggle.com/nikitabu/bayes-optimization-of-lightgbm-with-deep-features
#train a lgb and return optimal parameters from cross validation

def lgb_evaluate(
                 learning_rate,
                 num_leaves,
                 min_split_gain,
                 max_depth,
                 subsample,
                 subsample_freq,
                 lambda_l1,
                 lambda_l2,
                 feature_fraction,
                ):

    clf = lgb.LGBMClassifier(num_leaves              = int(num_leaves),
                             max_depth               = int(max_depth),
                             learning_rate           = 10**learning_rate,
                             n_estimators            = 500,
                             min_split_gain          = min_split_gain,
                             subsample               = subsample,
                             colsample_bytree        = feature_fraction,
                             reg_alpha               = 10**lambda_l1,
                             reg_lambda              = 10**lambda_l2,
                             subsample_freq          = int(subsample_freq),
                             verbose                 = -1
                            )
    
    scores = cross_val_score(clf, train_set, train_target.drop(["SK_ID_CURR"],axis=1), cv=5, scoring='roc_auc')

    return np.mean(scores)

In [None]:
lgbBO = BayesianOptimization(lgb_evaluate, {
                                            'learning_rate':           (-2, 0),
                                            'num_leaves':              (5, 50),
                                            'min_split_gain':          (0, 1),
                                            'max_depth':               (5, 30),
                                            'subsample':               (0.1, 1),
                                            'subsample_freq':          (0, 100),
                                            'lambda_l1':               (-2, 2),
                                            'lambda_l2':               (-2, 2),
                                            'feature_fraction':        (0.1, 1)
                                            })

In [None]:
#search optimal paramters using bayesian optimization method

lgbBO.maximize(init_points=5, n_iter=5)

print(lgbBO.res['max'])

[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |   feature_fraction |   lambda_l1 |   lambda_l2 |   learning_rate |   max_depth |   min_split_gain |   num_leaves |   subsample |   subsample_freq | 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

    1 | 37m36s | [35m   0.75981[0m | [32m            0.8599[0m | [32m     0.3375[0m | [32m    -0.0928[0m | [32m        -1.8689[0m | [32m    11.8869[0m | [32m          0.6624[0m | [32m      8.2111[0m | [32m     0.1152[0m | [32m         75.8431[0m | 



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().


A column-vector y was passed when a 1d array was expec

In [None]:
#build a classifier with the previous parameters
clf = lgb.LGBMClassifier(num_leaves              = 44,
                         max_depth               = -1,
                         learning_rate           = 10**(-1.64),
                         n_estimators            = 2000,
                         min_split_gain          = 0.05,
                         subsample               = 0.85,
                         colsample_bytree        = 0.71,
                         reg_alpha               = 10**1.15,
                         reg_lambda              = 10**1.68,
                         subsample_freq            = 97
                        )

## First model with all paramters

In [None]:
#train the classifier with the training set
clf.fit(train_set, train_target.drop(['SK_ID_CURR'],axis=1), eval_metric='auc',verbose=1)

In [None]:
#get predictions with test set and store it in a dataframe
pred_proba = clf.predict_proba(test_set_features)
result3 = pd.DataFrame({'SK_ID_CURR' : test_set['SK_ID_CURR'],'TARGET' : [row[1] for row in pred_proba]})

In [None]:
#get most important features
feat_imp = pd.Series(clf.feature_importances_, index = train_set.columns)
feat_imp.nlargest(20).plot(kind ='barh',figsize=(8,10))

In [None]:
#result3.to_csv('predictions_without_selection.csv',index=False)

## Second model with the most important features

In [None]:
#spot less important features
not_important_features = feat_imp.nsmallest(600)

In [None]:
#compute new train and test sets
new_features_train = train_set.drop(not_important_features.index, axis=1)
new_features_test = test_set_features.drop(not_important_features.index, axis = 1)

In [None]:
#train the classifier with the training set of important features
clf.fit(new_features_train, train_target.drop(['SK_ID_CURR'],axis=1), eval_metric='auc',verbose=1)

In [None]:
#get predictions with test set and store it in a dataframe
pred_proba2 = clf.predict_proba(new_features_test)
result4 = pd.DataFrame({'SK_ID_CURR' : test_set['SK_ID_CURR'],'TARGET' : [row[1] for row in pred_proba2]})

In [None]:
#result4.to_csv('predictions_with_selection.csv',index=False)

the first model give us an accuarcy of 0.775 on the test set. 
the second one is slightly better with an accuracy of 0.777. He is also quicker to train.