In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from lightgbm import LGBMClassifier
import xgboost as xgb
from sklearn.metrics import auc, roc_curve
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.ensemble import RandomForestClassifier

sns.set()
%matplotlib inline

In [2]:
# Read the two CSV files and drop 'Unnamed: 0' from both frames; the test frame
# additionally loses its 'SeriousDlqin2yrs' column.
train = pd.read_csv('cs-training.csv').drop(columns=['Unnamed: 0'])
test = pd.read_csv('cs-test.csv').drop(columns=['Unnamed: 0', 'SeriousDlqin2yrs'])

In [3]:
train.head()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.65818,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.23381,30,0,0.03605,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0


In [4]:
train.isnull().sum()

SeriousDlqin2yrs                            0
RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           29731
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       3924
dtype: int64

In [5]:
test.isnull().sum()

RevolvingUtilizationOfUnsecuredLines        0
age                                         0
NumberOfTime30-59DaysPastDueNotWorse        0
DebtRatio                                   0
MonthlyIncome                           20103
NumberOfOpenCreditLinesAndLoans             0
NumberOfTimes90DaysLate                     0
NumberRealEstateLoansOrLines                0
NumberOfTime60-89DaysPastDueNotWorse        0
NumberOfDependents                       2626
dtype: int64

In [6]:
train['SeriousDlqin2yrs'].value_counts()

0    139974
1     10026
Name: SeriousDlqin2yrs, dtype: int64

# Fill missing values

In [7]:
def fillMedian(data, data2):
    """Fill missing values in two DataFrames in place with a pooled median.

    For every column of ``data`` that contains a null in ``data`` or in
    ``data2``, the median is computed over the non-null values of that column
    from both frames together, and that single value replaces the nulls in
    both frames. A column that is absent from ``data2`` is filled from
    ``data`` alone.

    Parameters
    ----------
    data : pandas.DataFrame
        First frame; its columns drive the iteration. Modified in place.
    data2 : pandas.DataFrame
        Second frame; modified in place for the columns it shares with ``data``.

    Returns
    -------
    None
    """
    for col in data.columns:
        in_second = col in data2.columns
        has_nulls = data[col].isnull().any() or (
            in_second and data2[col].isnull().any()
        )
        if not has_nulls:
            continue

        if in_second:
            pooled = pd.concat([data[col], data2[col]], axis=0)
        else:
            pooled = data[col]
        median = np.nanmedian(pooled)

        # Assign the filled column back rather than calling
        # `data[col].fillna(..., inplace=True)`: the in-place form operates on
        # a temporary Series, which pandas warns about and which does not
        # update the parent frame when copy-on-write is active.
        data[col] = data[col].fillna(median)
        if in_second:
            data2[col] = data2[col].fillna(median)

In [8]:
fillMedian(train, test)

# Train models

In [9]:
# Separate the label column from the predictor columns.
y = train['SeriousDlqin2yrs']
X = train.drop(columns=['SeriousDlqin2yrs'])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.27, random_state=42)

In [11]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((109500, 10), (109500,), (40500, 10), (40500,))

# LightGBM

In [12]:
clfLGBM1 = LGBMClassifier(n_estimators=200, nthread=-1, seed=42)
clfLGBM2 = LGBMClassifier(n_estimators=180, nthread=-1, seed=42)

In [13]:
def scoring(clf, X, y):
    """Return the area under the ROC curve of ``clf`` on ``(X, y)``.

    The score is computed from the second column of ``clf.predict_proba(X)``.
    The ``(clf, X, y)`` signature lets the function be passed as the
    ``scoring`` argument of ``cross_val_score``.
    """
    positive_proba = clf.predict_proba(X)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_proba)
    return auc(false_pos_rate, true_pos_rate)

In [15]:
scores = cross_val_score(n_jobs=-1, cv=3, estimator=clfLGBM1, X=X, y=y, scoring=scoring)

In [16]:
scores

array([ 0.85944412,  0.86013434,  0.86433361])

In [17]:
clfLGBM1.fit(X, y)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=200, nthread=-1,
        num_leaves=31, objective='binary', reg_alpha=0, reg_lambda=0,
        seed=42, silent=True, subsample=1, subsample_for_bin=50000,
        subsample_freq=1)

In [18]:
# In-sample AUC: clfLGBM1 was fitted on this same (X, y), so this value is not
# a held-out estimate (compare with the cross-validated `scores`).
scoring(clf=clfLGBM1, X=X, y=y)

0.89613813955548904

In [19]:
clfLGBM2.fit(X, y)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0, n_estimators=180, nthread=-1,
        num_leaves=31, objective='binary', reg_alpha=0, reg_lambda=0,
        seed=42, silent=True, subsample=1, subsample_for_bin=50000,
        subsample_freq=1)

In [22]:
# In-sample AUC: clfLGBM2 was fitted on this same (X, y), so this value is not
# a held-out estimate.
scoring(clf=clfLGBM2, X=X, y=y)

0.89384140983681759

# XGB

In [23]:
# First XGBoost classifier of the ensemble (n_estimators=250).
# NOTE(review): the repr printed after clfXGB.fit(X, y) lists learning_rate=0.1
# next to eta=0.025 and n_estimators=250 next to num_boost_round=391, i.e. `eta`
# and `num_boost_round` are carried as extra keyword arguments and do not
# replace those two settings. Which learning rate / round count the booster
# actually uses is not visible here - TODO confirm against the XGBoost docs.
# NOTE(review): the same repr shows n_jobs=1 and random_state=0 next to
# nthread=-1 and seed=42 - presumably older aliases; verify they take effect.
clfXGB = xgb.XGBClassifier(min_child_weight=10.0, n_estimators=250, nthread=-1,
objective='binary:logistic',
max_depth=5,
eval_metric='auc',
max_delta_step=1.8,
colsample_bytree=0.4,
subsample=0.8,
eta=0.025,
gamma=0.65,
num_boost_round=391, seed=42)

In [24]:
# Second XGBoost classifier of the ensemble; identical arguments to clfXGB
# except n_estimators=330.
# NOTE(review): the repr printed after clfXGB2.fit(X, y) lists learning_rate=0.1
# next to eta=0.025 and n_estimators=330 next to num_boost_round=391, i.e. `eta`
# and `num_boost_round` are carried as extra keyword arguments and do not
# replace those two settings. Which learning rate / round count the booster
# actually uses is not visible here - TODO confirm against the XGBoost docs.
clfXGB2 = xgb.XGBClassifier(min_child_weight=10.0, n_estimators=330, nthread=-1,
objective='binary:logistic',
max_depth=5,
eval_metric='auc',
max_delta_step=1.8,
colsample_bytree=0.4,
subsample=0.8,
eta=0.025,
gamma=0.65,
num_boost_round=391, seed=42)

In [25]:
clfXGB.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, eta=0.025, eval_metric='auc', gamma=0.65,
       learning_rate=0.1, max_delta_step=1.8, max_depth=5,
       min_child_weight=10.0, missing=None, n_estimators=250, n_jobs=1,
       nthread=-1, num_boost_round=391, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=42, silent=True, subsample=0.8)

In [26]:
# In-sample AUC: clfXGB was fitted on this same (X, y), so this value is not
# a held-out estimate.
scoring(clf=clfXGB, X=X, y=y)

0.88427627889150806

In [27]:
clfXGB2.fit(X, y)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.4, eta=0.025, eval_metric='auc', gamma=0.65,
       learning_rate=0.1, max_delta_step=1.8, max_depth=5,
       min_child_weight=10.0, missing=None, n_estimators=330, n_jobs=1,
       nthread=-1, num_boost_round=391, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=42, silent=True, subsample=0.8)

In [28]:
# In-sample AUC: clfXGB2 was fitted on this same (X, y), so this value is not
# a held-out estimate.
scoring(clf=clfXGB2, X=X, y=y)

0.88709048986958006

In [29]:
clfRand = RandomForestClassifier(max_depth=7, max_features=0.5, criterion='entropy',
                                 n_estimators=160, n_jobs=-1, random_state=42)

In [30]:
clfRand2 = RandomForestClassifier(max_depth=7, max_features=0.5, criterion='entropy',
                                 n_estimators=200, n_jobs=-1, random_state=42)

In [31]:
b = cross_val_score(cv=5, estimator=clfRand, n_jobs=-1, scoring=scoring, X=X, y=y)

In [32]:
b

array([ 0.8580095 ,  0.85783322,  0.86162164,  0.86083172,  0.86750882])

In [33]:
clfRand.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=7, max_features=0.5, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=160, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

In [34]:
clfRand2.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=7, max_features=0.5, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=200, n_jobs=-1, oob_score=False, random_state=42,
            verbose=0, warm_start=False)

# Submission

In [35]:
predClfLGBM1 = clfLGBM1.predict_proba(test)[:, 1]
predClfLGBM2 = clfLGBM2.predict_proba(test)[:, 1]
predClfXGB = clfXGB.predict_proba(test)[:, 1]
predClfXGB2 = clfXGB2.predict_proba(test)[:, 1]
predClfRand = clfRand.predict_proba(test)[:, 1]
predClfRand2 = clfRand2.predict_proba(test)[:, 1]

In [36]:
# Weighted blend of the six models: clfLGBM2 counts twice and every other model
# once, so the weights sum to the divisor 7.
pred = (
    predClfLGBM1
    + 2 * predClfLGBM2
    + predClfXGB
    + predClfXGB2
    + predClfRand
    + predClfRand2
) / 7

In [37]:
s = pd.read_csv('sampleEntry.csv')

In [38]:
s['Probability'] = pred

In [39]:
# s.to_csv('ans.csv', index=False)