## Model Test(Logistic regression, random forest, LightGBM)

In [24]:
# import the relevant computational modules

# data manipulation
import pandas as pd #data processing
import numpy as np #linear algebra

# Models Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb
from sklearn.cross_validation import KFold

# grid search
from sklearn.model_selection import GridSearchCV

In [25]:
# import data

transaction_training = pd.read_csv('../raw_data/transactions_train.csv')
payment_training = pd.read_csv('../raw_data/paiements_train.csv')
billing_training = pd.read_csv('../raw_data/facturation_train.csv')
performance_training = pd.read_csv('../raw_data/performance_train.csv')


transaction_test = pd.read_csv('../raw_data/transactions_test.csv')
payment_test = pd.read_csv('../raw_data/paiements_test.csv')
billing_test = pd.read_csv('../raw_data/facturation_test.csv')
performance_test = pd.read_csv('../raw_data/performance_test.csv')

In [26]:
# Create basic scikit-learn wrapper model class
class SklearnWrapper:
    def __init__(self, clf, seed=0, params=None, seed_bool=True):
        if (seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict(x)

In [27]:
# create basic xgboost wrapper model class
class XgbWrapper:
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [28]:
# create basic lightGBM wrapper model class
class LightGbmWrapper:
    def __init(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1550)
        self.verbose_eval = params.pop('verbose_eval', 100)
        
    def train(self, x_train, y_train):
        lgtrain = lgb.Dataset(x_train, y_train)
        self.lgbm = lgb.train(self.param, lgtrain, num_boost_round=self.nrounds, verbose_eval=self.verbose_eval)
    
    def predict(self, x):
        return self.lgbm.predict(lgb.Dataset(x))

In [29]:
# create out-of-fold predictions 
# make good use of k-fold CV's result 
# serving for the staking alogrithm 
# create a new column generated from model's score

def get_oof(clf, x_train, y, x_test):
    '''
    clf: the classifer, which can be logistic regression, SVM regression, Bayes classifier, etc.
    x_train: the training x in training dataset
    y: the training y in training dataset
    x_test: the testing x in training dataset 
    '''
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        
        clf.fit(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    
    m = stats.mode(oof_test_skf, axis=1)
    oof_test[:] = m[0][0]
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)       

In [30]:
# data preprocess

class DataPreprocess:
    def __init__(self, label_encoder):
        self.lbl = label_encoder
        
    def preprocess_transcation(self, transaction_df):
        categorical_columns = ['MERCHANT_CATEGORY_XCD', 'MERCHANT_CITY_NAME', 'MERCHANT_COUNTRY_XCD', 
                               'DECISION_XCD', 'TRANSACTION_CATEGORY_XCD', 'TRANSACTION_TYPE_XCD', 'SICGROUP']
        
        for col in categorical_columns:
            transaction_df[col].fillna('unknown')
            transaction_df[col] = self.lbl.fit_transform(transaction_df[col].astype(str))
        
        transaction_df = transaction_df.groupby(['ID_CPTE', 'MERCHANT_CATEGORY_XCD'])['TRANSACTION_AMT'].sum()
        transaction_df = transaction_df.reset_index()
        transaction_df = transaction_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'MERCHANT_CATEGORY_XCD')
        transaction_df.columns = ['MERCHANT_CATEGORY_' + str(i) for i in transaction_df.columns]
        transaction_df = transaction_df.fillna(0)
        
        return transaction_df
    
    def preprocess_payment(self, payment_df):
        payment_df = payment_df.dropna()
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: str(x).split(' ')[0][:-3])
        payment_df = payment_df.sort_values(['ID_CPTE', 'TRANSACTION_DTTM'])
        payment_df['PAYMENT_N_COUNT'] = payment_df['PAYMENT_REVERSAL_XFLG'] == 'N'
        
        payment_df = payment_df.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])[['TRANSACTION_AMT', 'PAYMENT_N_COUNT']].sum().reset_index()
        payment_df = payment_df.groupby('ID_CPTE').tail(12)
        
        tmp = payment_df.groupby(['ID_CPTE'])['PAYMENT_N_COUNT'].sum().reset_index()
        
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: x.split('-')[1])
        payment_df = payment_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'TRANSACTION_DTTM')
        payment_df.columns = ['transaction_' + str(i) for i in payment_df.columns + '_month']
        payment_df = payment_df.reset_index()
        payment_df = payment_df.fillna(0)
        
        payment_df = payment_df.merge(tmp, on='ID_CPTE')
        
        return payment_df
    
    def preprocess_billing(self, billing_df):
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[:-3])
        billing_df = billing_df.sort_values(['ID_CPTE', 'PERIODID_MY'])
        billing_df = billing_df.reset_index(drop=True)
        billing_df = billing_df.groupby('ID_CPTE').tail(12)
        billing_df = billing_df.reset_index(drop=True)
        billing_df['CreditLeft'] = billing_df['CreditLimit'] - billing_df['CurrentTotalBalance']
        
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[-2:])
        credit_left = billing_df.pivot_table('CreditLeft', ['ID_CPTE'], 'PERIODID_MY')
        credit_left.columns = ['credit_left_' + str(i) for i in credit_left.columns + '_month']
        cash_balance = billing_df.pivot_table('CashBalance', ['ID_CPTE'], 'PERIODID_MY')
        cash_balance.columns = ['cash_balance_' + str(i) for i in cash_balance.columns + '_month']
        
        delq_cycle = billing_df.groupby(['ID_CPTE'])['DelqCycle'].max().reset_index()
        delq_cycle = delq_cycle.rename(columns={'DelqCycle': 'MaxDelqCycle'})
        
        credit_left = credit_left.reset_index()
        cash_balance = cash_balance.reset_index()
        
        tmp = credit_left.merge(cash_balance, on='ID_CPTE')
        tmp = tmp.merge(credit_left, on='ID_CPTE')
        tmp = tmp.merge(delq_cycle, on='ID_CPTE')
        
        return tmp
    
    def merge(self, payment, billing):
        merge_df = payment.merge(billing, on='ID_CPTE', how='right')
        return merge_df.set_index(['ID_CPTE']) 

In [31]:
billing_test.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,71424379,2013-11-01,2013-11-04,1444.3,0.0,3200.0,0
1,71424379,2014-05-01,2014-04-30,785.89,0.0,3200.0,0
2,71424379,2014-08-01,2014-08-02,1095.48,0.0,3200.0,0
3,71424379,2014-04-01,2014-04-02,845.3,0.0,3200.0,0
4,71424379,2013-12-01,2013-11-30,1623.28,0.0,3200.0,0


In [32]:
label_encoder = preprocessing.LabelEncoder()

In [33]:
preprocess = DataPreprocess(label_encoder)

In [34]:
processed_payment = preprocess.preprocess_payment(payment_training)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [35]:
processed_billing = preprocess.preprocess_billing(billing_training)

In [36]:
processed_data = preprocess.merge(processed_payment, processed_billing)

In [37]:
transaction_col = processed_data.iloc[:, :12].columns

In [38]:
# deal with missing value in payment
for col in transaction_col:
    replace_value = processed_data[processed_data[col].notna()][col].mean()
    processed_data[col] = processed_data[col].fillna(replace_value)

In [39]:
processed_data = processed_data.fillna(0)

In [40]:
processed_data = processed_data.reset_index()

In [41]:
processed_data = processed_data.merge(performance_training[['ID_CPTE', 'Default']], on='ID_CPTE')

In [42]:
processed_data = processed_data.set_index('ID_CPTE')

In [43]:
X = np.array(processed_data.iloc[:, :-1])
y = np.array(processed_data.iloc[:, -1])

In [44]:
from imblearn.over_sampling import SMOTE

In [45]:
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_sample(X, y)

In [46]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_res, y_res, test_size=0.3, random_state=42)

In [47]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
    
clf = LogisticRegression()
rf = RandomForestClassifier(min_samples_split=20, max_depth=10, random_state=0)

In [48]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [49]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=20,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [50]:
predict_clf = clf.predict(X_test)

In [51]:
roc_auc_score(predict_clf, y_test)

0.7977130905782517

In [52]:
predict_rf = rf.predict(X_test)

In [53]:
roc_auc_score(predict_rf, y_test)

0.8664037282849164

In [54]:
importance = rf.feature_importances_
importance = pd.DataFrame(importance, index=processed_data.iloc[:, :-1].columns, columns=["Importance"])
importance.sort_values('Importance', ascending=False).head()

Unnamed: 0,Importance
MaxDelqCycle,0.142303
credit_left_11_month_y,0.060618
cash_balance_11_month,0.055523
credit_left_11_month_x,0.050324
cash_balance_12_month,0.048794


In [58]:
# LightGBM

In [55]:
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    #'max_depth': 15,
    'num_leaves': 270,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'learning_rate': 0.0175,
    'verbose': 0
}

In [56]:
lgtrain = lgb.Dataset(X_train, y_train)
lgtest = lgb.Dataset(X_test, y_test)

In [57]:
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    valid_sets=lgtrain,
    num_boost_round=2000,
)

[1]	training's binary_logloss: 0.682893
[2]	training's binary_logloss: 0.671216
[3]	training's binary_logloss: 0.661594
[4]	training's binary_logloss: 0.65065
[5]	training's binary_logloss: 0.640104
[6]	training's binary_logloss: 0.631517
[7]	training's binary_logloss: 0.622748
[8]	training's binary_logloss: 0.612933
[9]	training's binary_logloss: 0.60487
[10]	training's binary_logloss: 0.595346
[11]	training's binary_logloss: 0.587692
[12]	training's binary_logloss: 0.578727
[13]	training's binary_logloss: 0.571286
[14]	training's binary_logloss: 0.562753
[15]	training's binary_logloss: 0.555649
[16]	training's binary_logloss: 0.548742
[17]	training's binary_logloss: 0.540807
[18]	training's binary_logloss: 0.534487
[19]	training's binary_logloss: 0.527015
[20]	training's binary_logloss: 0.520935
[21]	training's binary_logloss: 0.513696
[22]	training's binary_logloss: 0.508035
[23]	training's binary_logloss: 0.502154
[24]	training's binary_logloss: 0.496548
[25]	training's binary_logl

[206]	training's binary_logloss: 0.111073
[207]	training's binary_logloss: 0.110285
[208]	training's binary_logloss: 0.109518
[209]	training's binary_logloss: 0.108761
[210]	training's binary_logloss: 0.108012
[211]	training's binary_logloss: 0.107351
[212]	training's binary_logloss: 0.106617
[213]	training's binary_logloss: 0.105888
[214]	training's binary_logloss: 0.10522
[215]	training's binary_logloss: 0.104504
[216]	training's binary_logloss: 0.103755
[217]	training's binary_logloss: 0.10307
[218]	training's binary_logloss: 0.10236
[219]	training's binary_logloss: 0.101676
[220]	training's binary_logloss: 0.10099
[221]	training's binary_logloss: 0.100294
[222]	training's binary_logloss: 0.0995811
[223]	training's binary_logloss: 0.0989187
[224]	training's binary_logloss: 0.0982798
[225]	training's binary_logloss: 0.0975806
[226]	training's binary_logloss: 0.0968974
[227]	training's binary_logloss: 0.0962409
[228]	training's binary_logloss: 0.0956048
[229]	training's binary_logloss

[406]	training's binary_logloss: 0.0308847
[407]	training's binary_logloss: 0.0306938
[408]	training's binary_logloss: 0.0305053
[409]	training's binary_logloss: 0.0303055
[410]	training's binary_logloss: 0.0301172
[411]	training's binary_logloss: 0.0299388
[412]	training's binary_logloss: 0.0297558
[413]	training's binary_logloss: 0.0295753
[414]	training's binary_logloss: 0.0293928
[415]	training's binary_logloss: 0.0292121
[416]	training's binary_logloss: 0.0290312
[417]	training's binary_logloss: 0.0288522
[418]	training's binary_logloss: 0.0286612
[419]	training's binary_logloss: 0.0284884
[420]	training's binary_logloss: 0.0283022
[421]	training's binary_logloss: 0.0281125
[422]	training's binary_logloss: 0.0279465
[423]	training's binary_logloss: 0.0277705
[424]	training's binary_logloss: 0.0276035
[425]	training's binary_logloss: 0.0274301
[426]	training's binary_logloss: 0.027266
[427]	training's binary_logloss: 0.0270958
[428]	training's binary_logloss: 0.0269181
[429]	traini

[597]	training's binary_logloss: 0.00965396
[598]	training's binary_logloss: 0.00959464
[599]	training's binary_logloss: 0.00953911
[600]	training's binary_logloss: 0.00948329
[601]	training's binary_logloss: 0.0094242
[602]	training's binary_logloss: 0.00937017
[603]	training's binary_logloss: 0.00931513
[604]	training's binary_logloss: 0.00926041
[605]	training's binary_logloss: 0.00920629
[606]	training's binary_logloss: 0.00915045
[607]	training's binary_logloss: 0.00909852
[608]	training's binary_logloss: 0.00904617
[609]	training's binary_logloss: 0.0089944
[610]	training's binary_logloss: 0.00893868
[611]	training's binary_logloss: 0.00888038
[612]	training's binary_logloss: 0.00882683
[613]	training's binary_logloss: 0.00877559
[614]	training's binary_logloss: 0.00872133
[615]	training's binary_logloss: 0.00867479
[616]	training's binary_logloss: 0.00862227
[617]	training's binary_logloss: 0.00857257
[618]	training's binary_logloss: 0.00852366
[619]	training's binary_logloss: 0

[790]	training's binary_logloss: 0.00301668
[791]	training's binary_logloss: 0.00299856
[792]	training's binary_logloss: 0.00298044
[793]	training's binary_logloss: 0.00296392
[794]	training's binary_logloss: 0.0029467
[795]	training's binary_logloss: 0.0029286
[796]	training's binary_logloss: 0.00291153
[797]	training's binary_logloss: 0.00289394
[798]	training's binary_logloss: 0.00287694
[799]	training's binary_logloss: 0.00286053
[800]	training's binary_logloss: 0.00284318
[801]	training's binary_logloss: 0.00282625
[802]	training's binary_logloss: 0.00280827
[803]	training's binary_logloss: 0.00279207
[804]	training's binary_logloss: 0.00277385
[805]	training's binary_logloss: 0.00275724
[806]	training's binary_logloss: 0.00274081
[807]	training's binary_logloss: 0.00272429
[808]	training's binary_logloss: 0.00270773
[809]	training's binary_logloss: 0.00269165
[810]	training's binary_logloss: 0.00267568
[811]	training's binary_logloss: 0.00265972
[812]	training's binary_logloss: 0

[981]	training's binary_logloss: 0.000968612
[982]	training's binary_logloss: 0.000963571
[983]	training's binary_logloss: 0.000957694
[984]	training's binary_logloss: 0.000952101
[985]	training's binary_logloss: 0.000946043
[986]	training's binary_logloss: 0.000940504
[987]	training's binary_logloss: 0.000935171
[988]	training's binary_logloss: 0.000929582
[989]	training's binary_logloss: 0.00092404
[990]	training's binary_logloss: 0.000918705
[991]	training's binary_logloss: 0.000913345
[992]	training's binary_logloss: 0.000907959
[993]	training's binary_logloss: 0.000902839
[994]	training's binary_logloss: 0.000897784
[995]	training's binary_logloss: 0.000892203
[996]	training's binary_logloss: 0.000886707
[997]	training's binary_logloss: 0.000881407
[998]	training's binary_logloss: 0.000875916
[999]	training's binary_logloss: 0.000871013
[1000]	training's binary_logloss: 0.000865796
[1001]	training's binary_logloss: 0.000860901
[1002]	training's binary_logloss: 0.000855719
[1003]	t

[1160]	training's binary_logloss: 0.000336882
[1161]	training's binary_logloss: 0.000334944
[1162]	training's binary_logloss: 0.00033291
[1163]	training's binary_logloss: 0.00033089
[1164]	training's binary_logloss: 0.000328949
[1165]	training's binary_logloss: 0.000326969
[1166]	training's binary_logloss: 0.000325075
[1167]	training's binary_logloss: 0.000323149
[1168]	training's binary_logloss: 0.000321161
[1169]	training's binary_logloss: 0.000319285
[1170]	training's binary_logloss: 0.000317332
[1171]	training's binary_logloss: 0.000315502
[1172]	training's binary_logloss: 0.000313693
[1173]	training's binary_logloss: 0.000311778
[1174]	training's binary_logloss: 0.000309919
[1175]	training's binary_logloss: 0.000308044
[1176]	training's binary_logloss: 0.000306236
[1177]	training's binary_logloss: 0.000304475
[1178]	training's binary_logloss: 0.00030258
[1179]	training's binary_logloss: 0.000300815
[1180]	training's binary_logloss: 0.000299014
[1181]	training's binary_logloss: 0.0

[1345]	training's binary_logloss: 0.000113561
[1346]	training's binary_logloss: 0.000112881
[1347]	training's binary_logloss: 0.000112244
[1348]	training's binary_logloss: 0.000111685
[1349]	training's binary_logloss: 0.000111089
[1350]	training's binary_logloss: 0.000110424
[1351]	training's binary_logloss: 0.000109774
[1352]	training's binary_logloss: 0.000109139
[1353]	training's binary_logloss: 0.0001085
[1354]	training's binary_logloss: 0.00010787
[1355]	training's binary_logloss: 0.000107235
[1356]	training's binary_logloss: 0.000106618
[1357]	training's binary_logloss: 0.000106035
[1358]	training's binary_logloss: 0.000105406
[1359]	training's binary_logloss: 0.000104786
[1360]	training's binary_logloss: 0.000104179
[1361]	training's binary_logloss: 0.000103567
[1362]	training's binary_logloss: 0.00010298
[1363]	training's binary_logloss: 0.000102415
[1364]	training's binary_logloss: 0.000101827
[1365]	training's binary_logloss: 0.000101267
[1366]	training's binary_logloss: 0.00

[1532]	training's binary_logloss: 4.07711e-05
[1533]	training's binary_logloss: 4.05661e-05
[1534]	training's binary_logloss: 4.03876e-05
[1535]	training's binary_logloss: 4.01735e-05
[1536]	training's binary_logloss: 3.99711e-05
[1537]	training's binary_logloss: 3.97678e-05
[1538]	training's binary_logloss: 3.95804e-05
[1539]	training's binary_logloss: 3.93737e-05
[1540]	training's binary_logloss: 3.91832e-05
[1541]	training's binary_logloss: 3.89889e-05
[1542]	training's binary_logloss: 3.88124e-05
[1543]	training's binary_logloss: 3.86338e-05
[1544]	training's binary_logloss: 3.84508e-05
[1545]	training's binary_logloss: 3.82699e-05
[1546]	training's binary_logloss: 3.80965e-05
[1547]	training's binary_logloss: 3.78848e-05
[1548]	training's binary_logloss: 3.7701e-05
[1549]	training's binary_logloss: 3.75123e-05
[1550]	training's binary_logloss: 3.73408e-05
[1551]	training's binary_logloss: 3.71417e-05
[1552]	training's binary_logloss: 3.6962e-05
[1553]	training's binary_logloss: 3.

[1711]	training's binary_logloss: 1.89808e-05
[1712]	training's binary_logloss: 1.8916e-05
[1713]	training's binary_logloss: 1.88541e-05
[1714]	training's binary_logloss: 1.87905e-05
[1715]	training's binary_logloss: 1.87271e-05
[1716]	training's binary_logloss: 1.86615e-05
[1717]	training's binary_logloss: 1.86059e-05
[1718]	training's binary_logloss: 1.8548e-05
[1719]	training's binary_logloss: 1.8489e-05
[1720]	training's binary_logloss: 1.84315e-05
[1721]	training's binary_logloss: 1.8373e-05
[1722]	training's binary_logloss: 1.83114e-05
[1723]	training's binary_logloss: 1.82486e-05
[1724]	training's binary_logloss: 1.81931e-05
[1725]	training's binary_logloss: 1.8136e-05
[1726]	training's binary_logloss: 1.80776e-05
[1727]	training's binary_logloss: 1.8024e-05
[1728]	training's binary_logloss: 1.79666e-05
[1729]	training's binary_logloss: 1.791e-05
[1730]	training's binary_logloss: 1.78482e-05
[1731]	training's binary_logloss: 1.77925e-05
[1732]	training's binary_logloss: 1.77359e

[1890]	training's binary_logloss: 1.1909e-05
[1891]	training's binary_logloss: 1.1887e-05
[1892]	training's binary_logloss: 1.18633e-05
[1893]	training's binary_logloss: 1.18337e-05
[1894]	training's binary_logloss: 1.18061e-05
[1895]	training's binary_logloss: 1.17805e-05
[1896]	training's binary_logloss: 1.1755e-05
[1897]	training's binary_logloss: 1.17286e-05
[1898]	training's binary_logloss: 1.17061e-05
[1899]	training's binary_logloss: 1.16836e-05
[1900]	training's binary_logloss: 1.16599e-05
[1901]	training's binary_logloss: 1.16363e-05
[1902]	training's binary_logloss: 1.16131e-05
[1903]	training's binary_logloss: 1.15896e-05
[1904]	training's binary_logloss: 1.15671e-05
[1905]	training's binary_logloss: 1.1545e-05
[1906]	training's binary_logloss: 1.15216e-05
[1907]	training's binary_logloss: 1.14999e-05
[1908]	training's binary_logloss: 1.14752e-05
[1909]	training's binary_logloss: 1.145e-05
[1910]	training's binary_logloss: 1.14267e-05
[1911]	training's binary_logloss: 1.1403

In [58]:
predict_lgb = lgb_clf.predict(X_test)

In [59]:
predict_lgb = np.array([0 if i < 0.6 else 1 for i in predict_lgb])

In [60]:
roc_auc_score(predict_lgb, y_test)

0.9226083661995378

### Grid Search, tune parameters

In [55]:
# Create parameters to search
gridParams = {
    'learning_rate': [0.0125, 0.0175, 0.0225],
    'n_estimators': [40],
    'num_leaves': [170, 220, 270, 320],
    #'max_depth': [15, 25, 35],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'feature_fraction': [0.4, 0.5, 0.6]
    }

In [56]:
# Create classifier to use. Note that parameters have to be input manually
# not as a dict!
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt', objective = 'binary')

In [57]:
grid = GridSearchCV(mdl, gridParams, verbose=0, cv=5, n_jobs=2)

In [58]:
grid.fit(X_res, y_res)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


GridSearchCV(cv=5, error_score='raise',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'learning_rate': [0.0125, 0.0175, 0.0225], 'n_estimators': [40], 'num_leaves': [170, 220, 270, 320], 'boosting_type': ['gbdt'], 'objective': ['binary'], 'feature_fraction': [0.4, 0.5, 0.6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [59]:
# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

{'boosting_type': 'gbdt', 'feature_fraction': 0.5, 'learning_rate': 0.0225, 'n_estimators': 40, 'num_leaves': 320, 'objective': 'binary'}
0.8844150432336702


In [91]:
# Using parameters already set above, replace in the best from the grid search

# params['max_bin'] = grid.best_params_['max_bin']
lgbm_params['feature_fraction'] = grid.best_params_['feature_fraction']
lgbm_params['learning_rate'] = grid.best_params_['learning_rate']
lgbm_params['num_leaves'] = grid.best_params_['num_leaves']
#lgbm_params['max_depth'] = grid.best_params_['max_depth']
#lgbm_params['reg_alpha'] = grid.best_params_['reg_alpha']
#lgbm_params['reg_lambda'] = grid.best_params_['reg_lambda']

In [70]:
print('Fitting with params: ')
print(lgbm_params)

Fitting with params: 
{'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 320, 'feature_fraction': 0.5, 'bagging_fraction': 0.75, 'bagging_freq': 2, 'learning_rate': 0.0225, 'verbose': 0}


In [92]:
lgtrain = lgb.Dataset(X_train, y_train)
lgtest = lgb.Dataset(X_test, y_test)

In [93]:
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    valid_sets=lgtrain,
    num_boost_round=2000
)

[1]	training's binary_logloss: 0.680222
[2]	training's binary_logloss: 0.665389
[3]	training's binary_logloss: 0.653191
[4]	training's binary_logloss: 0.639386
[5]	training's binary_logloss: 0.62627
[6]	training's binary_logloss: 0.615549
[7]	training's binary_logloss: 0.604853
[8]	training's binary_logloss: 0.59271
[9]	training's binary_logloss: 0.583045
[10]	training's binary_logloss: 0.571847
[11]	training's binary_logloss: 0.562659
[12]	training's binary_logloss: 0.552023
[13]	training's binary_logloss: 0.543252
[14]	training's binary_logloss: 0.5332
[15]	training's binary_logloss: 0.525082
[16]	training's binary_logloss: 0.516985
[17]	training's binary_logloss: 0.507895
[18]	training's binary_logloss: 0.500583
[19]	training's binary_logloss: 0.492051
[20]	training's binary_logloss: 0.485031
[21]	training's binary_logloss: 0.476843
[22]	training's binary_logloss: 0.470439
[23]	training's binary_logloss: 0.463842
[24]	training's binary_logloss: 0.457568
[25]	training's binary_loglos

[210]	training's binary_logloss: 0.0718775
[211]	training's binary_logloss: 0.0712573
[212]	training's binary_logloss: 0.0706457
[213]	training's binary_logloss: 0.0700746
[214]	training's binary_logloss: 0.0694923
[215]	training's binary_logloss: 0.0689093
[216]	training's binary_logloss: 0.0683074
[217]	training's binary_logloss: 0.0677445
[218]	training's binary_logloss: 0.0671402
[219]	training's binary_logloss: 0.0665853
[220]	training's binary_logloss: 0.0660362
[221]	training's binary_logloss: 0.0654867
[222]	training's binary_logloss: 0.0649203
[223]	training's binary_logloss: 0.0643821
[224]	training's binary_logloss: 0.0638353
[225]	training's binary_logloss: 0.0633037
[226]	training's binary_logloss: 0.0627686
[227]	training's binary_logloss: 0.062247
[228]	training's binary_logloss: 0.0616928
[229]	training's binary_logloss: 0.0611645
[230]	training's binary_logloss: 0.060674
[231]	training's binary_logloss: 0.060146
[232]	training's binary_logloss: 0.0596419
[233]	training

[407]	training's binary_logloss: 0.0142639
[408]	training's binary_logloss: 0.0141573
[409]	training's binary_logloss: 0.0140331
[410]	training's binary_logloss: 0.0139167
[411]	training's binary_logloss: 0.013798
[412]	training's binary_logloss: 0.013688
[413]	training's binary_logloss: 0.0135787
[414]	training's binary_logloss: 0.0134703
[415]	training's binary_logloss: 0.0133609
[416]	training's binary_logloss: 0.0132512
[417]	training's binary_logloss: 0.013148
[418]	training's binary_logloss: 0.0130366
[419]	training's binary_logloss: 0.0129306
[420]	training's binary_logloss: 0.0128261
[421]	training's binary_logloss: 0.0127246
[422]	training's binary_logloss: 0.0126299
[423]	training's binary_logloss: 0.0125329
[424]	training's binary_logloss: 0.0124376
[425]	training's binary_logloss: 0.0123354
[426]	training's binary_logloss: 0.0122491
[427]	training's binary_logloss: 0.0121527
[428]	training's binary_logloss: 0.0120527
[429]	training's binary_logloss: 0.0119477
[430]	training

[604]	training's binary_logloss: 0.00299644
[605]	training's binary_logloss: 0.00297256
[606]	training's binary_logloss: 0.00294822
[607]	training's binary_logloss: 0.00292443
[608]	training's binary_logloss: 0.0029017
[609]	training's binary_logloss: 0.00288082
[610]	training's binary_logloss: 0.0028574
[611]	training's binary_logloss: 0.00283461
[612]	training's binary_logloss: 0.00281261
[613]	training's binary_logloss: 0.00279116
[614]	training's binary_logloss: 0.00276892
[615]	training's binary_logloss: 0.00274794
[616]	training's binary_logloss: 0.00272591
[617]	training's binary_logloss: 0.00270472
[618]	training's binary_logloss: 0.00268229
[619]	training's binary_logloss: 0.00266158
[620]	training's binary_logloss: 0.00264038
[621]	training's binary_logloss: 0.00261928
[622]	training's binary_logloss: 0.00259988
[623]	training's binary_logloss: 0.00257971
[624]	training's binary_logloss: 0.00255949
[625]	training's binary_logloss: 0.00254063
[626]	training's binary_logloss: 0

[796]	training's binary_logloss: 0.000665436
[797]	training's binary_logloss: 0.000660075
[798]	training's binary_logloss: 0.00065479
[799]	training's binary_logloss: 0.000649751
[800]	training's binary_logloss: 0.000644324
[801]	training's binary_logloss: 0.00063918
[802]	training's binary_logloss: 0.000633717
[803]	training's binary_logloss: 0.000628787
[804]	training's binary_logloss: 0.00062438
[805]	training's binary_logloss: 0.000619466
[806]	training's binary_logloss: 0.000614507
[807]	training's binary_logloss: 0.000609592
[808]	training's binary_logloss: 0.000605237
[809]	training's binary_logloss: 0.000600439
[810]	training's binary_logloss: 0.000595942
[811]	training's binary_logloss: 0.000591371
[812]	training's binary_logloss: 0.000587394
[813]	training's binary_logloss: 0.000582484
[814]	training's binary_logloss: 0.000578136
[815]	training's binary_logloss: 0.000573411
[816]	training's binary_logloss: 0.000568867
[817]	training's binary_logloss: 0.000564542
[818]	trainin

[979]	training's binary_logloss: 0.000163644
[980]	training's binary_logloss: 0.00016237
[981]	training's binary_logloss: 0.000161089
[982]	training's binary_logloss: 0.000160084
[983]	training's binary_logloss: 0.00015887
[984]	training's binary_logloss: 0.000157668
[985]	training's binary_logloss: 0.000156381
[986]	training's binary_logloss: 0.000155366
[987]	training's binary_logloss: 0.000154374
[988]	training's binary_logloss: 0.000153286
[989]	training's binary_logloss: 0.000152041
[990]	training's binary_logloss: 0.000151092
[991]	training's binary_logloss: 0.000150107
[992]	training's binary_logloss: 0.000148887
[993]	training's binary_logloss: 0.000147745
[994]	training's binary_logloss: 0.000146639
[995]	training's binary_logloss: 0.000145609
[996]	training's binary_logloss: 0.000144471
[997]	training's binary_logloss: 0.000143481
[998]	training's binary_logloss: 0.000142394
[999]	training's binary_logloss: 0.000141364
[1000]	training's binary_logloss: 0.000140278
[1001]	trai

[1164]	training's binary_logloss: 4.16505e-05
[1165]	training's binary_logloss: 4.13743e-05
[1166]	training's binary_logloss: 4.10963e-05
[1167]	training's binary_logloss: 4.08226e-05
[1168]	training's binary_logloss: 4.05317e-05
[1169]	training's binary_logloss: 4.02618e-05
[1170]	training's binary_logloss: 3.99749e-05
[1171]	training's binary_logloss: 3.97036e-05
[1172]	training's binary_logloss: 3.94376e-05
[1173]	training's binary_logloss: 3.91676e-05
[1174]	training's binary_logloss: 3.89108e-05
[1175]	training's binary_logloss: 3.86341e-05
[1176]	training's binary_logloss: 3.83648e-05
[1177]	training's binary_logloss: 3.81288e-05
[1178]	training's binary_logloss: 3.78801e-05
[1179]	training's binary_logloss: 3.76234e-05
[1180]	training's binary_logloss: 3.73545e-05
[1181]	training's binary_logloss: 3.71047e-05
[1182]	training's binary_logloss: 3.68688e-05
[1183]	training's binary_logloss: 3.66272e-05
[1184]	training's binary_logloss: 3.63637e-05
[1185]	training's binary_logloss: 

[1344]	training's binary_logloss: 1.62183e-05
[1345]	training's binary_logloss: 1.6161e-05
[1346]	training's binary_logloss: 1.61064e-05
[1347]	training's binary_logloss: 1.605e-05
[1348]	training's binary_logloss: 1.59934e-05
[1349]	training's binary_logloss: 1.59328e-05
[1350]	training's binary_logloss: 1.58742e-05
[1351]	training's binary_logloss: 1.58222e-05
[1352]	training's binary_logloss: 1.57637e-05
[1353]	training's binary_logloss: 1.57048e-05
[1354]	training's binary_logloss: 1.56482e-05
[1355]	training's binary_logloss: 1.55935e-05
[1356]	training's binary_logloss: 1.55417e-05
[1357]	training's binary_logloss: 1.54888e-05
[1358]	training's binary_logloss: 1.54332e-05
[1359]	training's binary_logloss: 1.53762e-05
[1360]	training's binary_logloss: 1.53159e-05
[1361]	training's binary_logloss: 1.52665e-05
[1362]	training's binary_logloss: 1.52178e-05
[1363]	training's binary_logloss: 1.51688e-05
[1364]	training's binary_logloss: 1.51197e-05
[1365]	training's binary_logloss: 1.5

[1553]	training's binary_logloss: 9.25781e-06
[1554]	training's binary_logloss: 9.23756e-06
[1555]	training's binary_logloss: 9.22115e-06
[1556]	training's binary_logloss: 9.20403e-06
[1557]	training's binary_logloss: 9.18673e-06
[1558]	training's binary_logloss: 9.16882e-06
[1559]	training's binary_logloss: 9.15068e-06
[1560]	training's binary_logloss: 9.13266e-06
[1561]	training's binary_logloss: 9.11568e-06
[1562]	training's binary_logloss: 9.0989e-06
[1563]	training's binary_logloss: 9.07901e-06
[1564]	training's binary_logloss: 9.06201e-06
[1565]	training's binary_logloss: 9.0431e-06
[1566]	training's binary_logloss: 9.02718e-06
[1567]	training's binary_logloss: 9.01013e-06
[1568]	training's binary_logloss: 8.99377e-06
[1569]	training's binary_logloss: 8.97788e-06
[1570]	training's binary_logloss: 8.96065e-06
[1571]	training's binary_logloss: 8.94516e-06
[1572]	training's binary_logloss: 8.92878e-06
[1573]	training's binary_logloss: 8.91231e-06
[1574]	training's binary_logloss: 8.

[1772]	training's binary_logloss: 6.53492e-06
[1773]	training's binary_logloss: 6.52657e-06
[1774]	training's binary_logloss: 6.51756e-06
[1775]	training's binary_logloss: 6.50965e-06
[1776]	training's binary_logloss: 6.50126e-06
[1777]	training's binary_logloss: 6.49219e-06
[1778]	training's binary_logloss: 6.48339e-06
[1779]	training's binary_logloss: 6.47467e-06
[1780]	training's binary_logloss: 6.46675e-06
[1781]	training's binary_logloss: 6.45843e-06
[1782]	training's binary_logloss: 6.44942e-06
[1783]	training's binary_logloss: 6.44213e-06
[1784]	training's binary_logloss: 6.43393e-06
[1785]	training's binary_logloss: 6.42529e-06
[1786]	training's binary_logloss: 6.41752e-06
[1787]	training's binary_logloss: 6.40996e-06
[1788]	training's binary_logloss: 6.40191e-06
[1789]	training's binary_logloss: 6.39398e-06
[1790]	training's binary_logloss: 6.38634e-06
[1791]	training's binary_logloss: 6.37877e-06
[1792]	training's binary_logloss: 6.37166e-06
[1793]	training's binary_logloss: 

[1972]	training's binary_logloss: 5.24024e-06
[1973]	training's binary_logloss: 5.23445e-06
[1974]	training's binary_logloss: 5.2301e-06
[1975]	training's binary_logloss: 5.22441e-06
[1976]	training's binary_logloss: 5.21882e-06
[1977]	training's binary_logloss: 5.2137e-06
[1978]	training's binary_logloss: 5.2088e-06
[1979]	training's binary_logloss: 5.20424e-06
[1980]	training's binary_logloss: 5.19881e-06
[1981]	training's binary_logloss: 5.19479e-06
[1982]	training's binary_logloss: 5.18965e-06
[1983]	training's binary_logloss: 5.18484e-06
[1984]	training's binary_logloss: 5.18018e-06
[1985]	training's binary_logloss: 5.17597e-06
[1986]	training's binary_logloss: 5.17166e-06
[1987]	training's binary_logloss: 5.16702e-06
[1988]	training's binary_logloss: 5.16228e-06
[1989]	training's binary_logloss: 5.15773e-06
[1990]	training's binary_logloss: 5.15253e-06
[1991]	training's binary_logloss: 5.1477e-06
[1992]	training's binary_logloss: 5.14276e-06
[1993]	training's binary_logloss: 5.13

In [95]:
predict_lgb = lgb_clf.predict(X_test)
predict_lgb = np.array([0 if i < 0.6 else 1 for i in predict_lgb])
roc_auc_score(predict_lgb, y_test)

0.9227782792854905

In [32]:
from sklearn.externals import joblib
joblib.dump(clf, 'random_forest.pkl')

['random_forest.pkl']

In [96]:
# deploy model
lgtrain = lgb.Dataset(X_res, y_res)

final_lgb = lgb.train(
    lgbm_params,
    lgtrain,
    valid_sets=lgtrain,
    num_boost_round=2000
)

[1]	training's binary_logloss: 0.679963
[2]	training's binary_logloss: 0.66497
[3]	training's binary_logloss: 0.652721
[4]	training's binary_logloss: 0.638789
[5]	training's binary_logloss: 0.625434
[6]	training's binary_logloss: 0.61472
[7]	training's binary_logloss: 0.603808
[8]	training's binary_logloss: 0.591721
[9]	training's binary_logloss: 0.581847
[10]	training's binary_logloss: 0.57042
[11]	training's binary_logloss: 0.561189
[12]	training's binary_logloss: 0.550441
[13]	training's binary_logloss: 0.541623
[14]	training's binary_logloss: 0.53153
[15]	training's binary_logloss: 0.52316
[16]	training's binary_logloss: 0.515254
[17]	training's binary_logloss: 0.505888
[18]	training's binary_logloss: 0.498422
[19]	training's binary_logloss: 0.489725
[20]	training's binary_logloss: 0.482526
[21]	training's binary_logloss: 0.47419
[22]	training's binary_logloss: 0.467509
[23]	training's binary_logloss: 0.460875
[24]	training's binary_logloss: 0.454472
[25]	training's binary_logloss:

[199]	training's binary_logloss: 0.0772638
[200]	training's binary_logloss: 0.0766379
[201]	training's binary_logloss: 0.0759774
[202]	training's binary_logloss: 0.0753103
[203]	training's binary_logloss: 0.0747256
[204]	training's binary_logloss: 0.0741488
[205]	training's binary_logloss: 0.0734999
[206]	training's binary_logloss: 0.0729097
[207]	training's binary_logloss: 0.07227
[208]	training's binary_logloss: 0.0715998
[209]	training's binary_logloss: 0.0709609
[210]	training's binary_logloss: 0.0703132
[211]	training's binary_logloss: 0.0697548
[212]	training's binary_logloss: 0.0691784
[213]	training's binary_logloss: 0.0685933
[214]	training's binary_logloss: 0.0680614
[215]	training's binary_logloss: 0.0674941
[216]	training's binary_logloss: 0.0669178
[217]	training's binary_logloss: 0.066328
[218]	training's binary_logloss: 0.0658082
[219]	training's binary_logloss: 0.0653198
[220]	training's binary_logloss: 0.0647656
[221]	training's binary_logloss: 0.0642212
[222]	training

[396]	training's binary_logloss: 0.0163086
[397]	training's binary_logloss: 0.0161884
[398]	training's binary_logloss: 0.0160625
[399]	training's binary_logloss: 0.0159455
[400]	training's binary_logloss: 0.0158206
[401]	training's binary_logloss: 0.0157
[402]	training's binary_logloss: 0.0155789
[403]	training's binary_logloss: 0.0154607
[404]	training's binary_logloss: 0.0153426
[405]	training's binary_logloss: 0.0152259
[406]	training's binary_logloss: 0.0151091
[407]	training's binary_logloss: 0.0149929
[408]	training's binary_logloss: 0.0148796
[409]	training's binary_logloss: 0.0147623
[410]	training's binary_logloss: 0.0146502
[411]	training's binary_logloss: 0.0145413
[412]	training's binary_logloss: 0.0144362
[413]	training's binary_logloss: 0.0143286
[414]	training's binary_logloss: 0.0142198
[415]	training's binary_logloss: 0.0141239
[416]	training's binary_logloss: 0.0140186
[417]	training's binary_logloss: 0.0139128
[418]	training's binary_logloss: 0.0138017
[419]	training

[586]	training's binary_logloss: 0.00391371
[587]	training's binary_logloss: 0.00388607
[588]	training's binary_logloss: 0.00385704
[589]	training's binary_logloss: 0.00382918
[590]	training's binary_logloss: 0.0038032
[591]	training's binary_logloss: 0.00377537
[592]	training's binary_logloss: 0.00374882
[593]	training's binary_logloss: 0.00372338
[594]	training's binary_logloss: 0.00369763
[595]	training's binary_logloss: 0.00367219
[596]	training's binary_logloss: 0.00364397
[597]	training's binary_logloss: 0.00361821
[598]	training's binary_logloss: 0.00359339
[599]	training's binary_logloss: 0.0035683
[600]	training's binary_logloss: 0.00354372
[601]	training's binary_logloss: 0.00351886
[602]	training's binary_logloss: 0.00349407
[603]	training's binary_logloss: 0.00346782
[604]	training's binary_logloss: 0.00344381
[605]	training's binary_logloss: 0.0034181
[606]	training's binary_logloss: 0.0033936
[607]	training's binary_logloss: 0.00337161
[608]	training's binary_logloss: 0.0

[778]	training's binary_logloss: 0.000917744
[779]	training's binary_logloss: 0.000911321
[780]	training's binary_logloss: 0.000904474
[781]	training's binary_logloss: 0.000897863
[782]	training's binary_logloss: 0.000891265
[783]	training's binary_logloss: 0.000884333
[784]	training's binary_logloss: 0.000877636
[785]	training's binary_logloss: 0.000871198
[786]	training's binary_logloss: 0.000864699
[787]	training's binary_logloss: 0.000857901
[788]	training's binary_logloss: 0.000851647
[789]	training's binary_logloss: 0.000845216
[790]	training's binary_logloss: 0.000838978
[791]	training's binary_logloss: 0.000832331
[792]	training's binary_logloss: 0.000826494
[793]	training's binary_logloss: 0.000820733
[794]	training's binary_logloss: 0.000814784
[795]	training's binary_logloss: 0.000808437
[796]	training's binary_logloss: 0.000802418
[797]	training's binary_logloss: 0.000796565
[798]	training's binary_logloss: 0.000790911
[799]	training's binary_logloss: 0.000784638
[800]	trai

[966]	training's binary_logloss: 0.000237133
[967]	training's binary_logloss: 0.000235747
[968]	training's binary_logloss: 0.000234227
[969]	training's binary_logloss: 0.000232402
[970]	training's binary_logloss: 0.000231023
[971]	training's binary_logloss: 0.000229196
[972]	training's binary_logloss: 0.000227612
[973]	training's binary_logloss: 0.00022639
[974]	training's binary_logloss: 0.000225063
[975]	training's binary_logloss: 0.000223813
[976]	training's binary_logloss: 0.000222086
[977]	training's binary_logloss: 0.000220834
[978]	training's binary_logloss: 0.000219167
[979]	training's binary_logloss: 0.000217519
[980]	training's binary_logloss: 0.000216224
[981]	training's binary_logloss: 0.000214986
[982]	training's binary_logloss: 0.000213927
[983]	training's binary_logloss: 0.000212762
[984]	training's binary_logloss: 0.000211161
[985]	training's binary_logloss: 0.000209501
[986]	training's binary_logloss: 0.000207771
[987]	training's binary_logloss: 0.000206628
[988]	train

[1151]	training's binary_logloss: 7.28559e-05
[1152]	training's binary_logloss: 7.2054e-05
[1153]	training's binary_logloss: 7.12271e-05
[1154]	training's binary_logloss: 7.04427e-05
[1155]	training's binary_logloss: 6.96573e-05
[1156]	training's binary_logloss: 6.8886e-05
[1157]	training's binary_logloss: 6.81261e-05
[1158]	training's binary_logloss: 6.74114e-05
[1159]	training's binary_logloss: 6.67075e-05
[1160]	training's binary_logloss: 6.60037e-05
[1161]	training's binary_logloss: 6.53071e-05
[1162]	training's binary_logloss: 6.46151e-05
[1163]	training's binary_logloss: 6.39462e-05
[1164]	training's binary_logloss: 6.32633e-05
[1165]	training's binary_logloss: 6.25969e-05
[1166]	training's binary_logloss: 6.19462e-05
[1167]	training's binary_logloss: 6.13293e-05
[1168]	training's binary_logloss: 6.06975e-05
[1169]	training's binary_logloss: 6.00733e-05
[1170]	training's binary_logloss: 5.94693e-05
[1171]	training's binary_logloss: 5.88608e-05
[1172]	training's binary_logloss: 5.

[1338]	training's binary_logloss: 1.86435e-05
[1339]	training's binary_logloss: 1.85559e-05
[1340]	training's binary_logloss: 1.84692e-05
[1341]	training's binary_logloss: 1.83826e-05
[1342]	training's binary_logloss: 1.83039e-05
[1343]	training's binary_logloss: 1.82201e-05
[1344]	training's binary_logloss: 1.81407e-05
[1345]	training's binary_logloss: 1.80676e-05
[1346]	training's binary_logloss: 1.79917e-05
[1347]	training's binary_logloss: 1.79126e-05
[1348]	training's binary_logloss: 1.7833e-05
[1349]	training's binary_logloss: 1.77569e-05
[1350]	training's binary_logloss: 1.7679e-05
[1351]	training's binary_logloss: 1.75983e-05
[1352]	training's binary_logloss: 1.75279e-05
[1353]	training's binary_logloss: 1.74535e-05
[1354]	training's binary_logloss: 1.73776e-05
[1355]	training's binary_logloss: 1.7304e-05
[1356]	training's binary_logloss: 1.7228e-05
[1357]	training's binary_logloss: 1.71516e-05
[1358]	training's binary_logloss: 1.70838e-05
[1359]	training's binary_logloss: 1.70

[1523]	training's binary_logloss: 1.01089e-05
[1524]	training's binary_logloss: 1.00826e-05
[1525]	training's binary_logloss: 1.00551e-05
[1526]	training's binary_logloss: 1.0032e-05
[1527]	training's binary_logloss: 1.00089e-05
[1528]	training's binary_logloss: 9.98494e-06
[1529]	training's binary_logloss: 9.961e-06
[1530]	training's binary_logloss: 9.93624e-06
[1531]	training's binary_logloss: 9.91411e-06
[1532]	training's binary_logloss: 9.89048e-06
[1533]	training's binary_logloss: 9.86731e-06
[1534]	training's binary_logloss: 9.84421e-06
[1535]	training's binary_logloss: 9.82192e-06
[1536]	training's binary_logloss: 9.79646e-06
[1537]	training's binary_logloss: 9.77435e-06
[1538]	training's binary_logloss: 9.75123e-06
[1539]	training's binary_logloss: 9.72864e-06
[1540]	training's binary_logloss: 9.70609e-06
[1541]	training's binary_logloss: 9.68156e-06
[1542]	training's binary_logloss: 9.65961e-06
[1543]	training's binary_logloss: 9.63833e-06
[1544]	training's binary_logloss: 9.6

[1703]	training's binary_logloss: 7.0865e-06
[1704]	training's binary_logloss: 7.07601e-06
[1705]	training's binary_logloss: 7.06373e-06
[1706]	training's binary_logloss: 7.05289e-06
[1707]	training's binary_logloss: 7.04251e-06
[1708]	training's binary_logloss: 7.03189e-06
[1709]	training's binary_logloss: 7.02024e-06
[1710]	training's binary_logloss: 7.00908e-06
[1711]	training's binary_logloss: 6.99807e-06
[1712]	training's binary_logloss: 6.98759e-06
[1713]	training's binary_logloss: 6.97678e-06
[1714]	training's binary_logloss: 6.96561e-06
[1715]	training's binary_logloss: 6.95475e-06
[1716]	training's binary_logloss: 6.94468e-06
[1717]	training's binary_logloss: 6.93301e-06
[1718]	training's binary_logloss: 6.92261e-06
[1719]	training's binary_logloss: 6.91172e-06
[1720]	training's binary_logloss: 6.90023e-06
[1721]	training's binary_logloss: 6.8906e-06
[1722]	training's binary_logloss: 6.88054e-06
[1723]	training's binary_logloss: 6.8698e-06
[1724]	training's binary_logloss: 6.8

[1885]	training's binary_logloss: 5.52485e-06
[1886]	training's binary_logloss: 5.51804e-06
[1887]	training's binary_logloss: 5.51168e-06
[1888]	training's binary_logloss: 5.50587e-06
[1889]	training's binary_logloss: 5.49901e-06
[1890]	training's binary_logloss: 5.49267e-06
[1891]	training's binary_logloss: 5.48657e-06
[1892]	training's binary_logloss: 5.47892e-06
[1893]	training's binary_logloss: 5.47216e-06
[1894]	training's binary_logloss: 5.466e-06
[1895]	training's binary_logloss: 5.4598e-06
[1896]	training's binary_logloss: 5.45356e-06
[1897]	training's binary_logloss: 5.44636e-06
[1898]	training's binary_logloss: 5.43952e-06
[1899]	training's binary_logloss: 5.43297e-06
[1900]	training's binary_logloss: 5.42741e-06
[1901]	training's binary_logloss: 5.42125e-06
[1902]	training's binary_logloss: 5.4146e-06
[1903]	training's binary_logloss: 5.40882e-06
[1904]	training's binary_logloss: 5.40225e-06
[1905]	training's binary_logloss: 5.39626e-06
[1906]	training's binary_logloss: 5.39

### Fit the final model to the testing data

In [61]:
processed_payment_test = preprocess.preprocess_payment(payment_test)
processed_billing_test = preprocess.preprocess_billing(billing_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [62]:
processed_test = preprocess.merge(processed_payment_test, processed_billing_test)

In [63]:
for col in transaction_col:
    replace_value = processed_test[processed_test[col].notna()][col].mean()
    processed_test[col] = processed_test[col].fillna(replace_value)

In [64]:
processed_test = processed_test.fillna(0)

In [65]:
processed_test.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,credit_left_04_month_y,credit_left_05_month_y,credit_left_06_month_y,credit_left_07_month_y,credit_left_08_month_y,credit_left_09_month_y,credit_left_10_month_y,credit_left_11_month_y,credit_left_12_month_y,MaxDelqCycle
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10074849,411.0,340.26,993.92,0.0,906.38,363.0,915.54,609.5,626.85,396.93,...,1082.89,1689.6,1355.5,1514.89,1691.76,1303.07,1254.59,1586.58,1237.53,0
10086539,0.0,556.92,832.0,642.0,661.26,1880.0,950.86,1591.0,1048.6,500.0,...,932.32,611.21,192.4,1309.15,845.2,601.42,394.9,866.74,865.76,0
10140908,214.1,88.36,200.0,206.0,239.99,160.5,418.0,428.72,202.0,163.2,...,66.75,388.0,305.1,616.1,726.48,942.6,917.25,543.06,8.44,2
10147994,38.11,39.52,218.4,227.9,224.7,229.69,226.0,504.29,4.24,256.2,...,1289.95,861.59,741.1,104.62,-52.0,-230.21,-374.16,158.0,-11.0,0
10152808,420.0,1030.0,510.0,423.3,877.5,1157.44,709.0,995.0,1015.0,515.0,...,800.69,571.52,775.0,957.0,1266.0,910.8,1132.0,674.48,818.62,0


In [66]:
X_test = np.array(processed_test)

In [67]:
X_test.shape

(5100, 50)

In [68]:
predict_lgb = lgb_clf.predict(X_test)
predict_lgb = np.array([0 if i < 0.6 else 1 for i in predict_lgb])

In [69]:
processed_test['Default'] = predict_lgb

In [70]:
results = processed_test.reset_index()[['ID_CPTE', 'Default']]

In [71]:
submission = pd.read_csv('../raw_data/performance_test.csv')

In [72]:
submission.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,71424379,2014-12-01,
1,64887111,2015-12-01,
2,69431075,2014-12-01,
3,31823308,2016-12-01,
4,39407834,2012-12-01,


In [73]:
results.head()

Unnamed: 0,ID_CPTE,Default
0,10074849,0
1,10086539,0
2,10140908,1
3,10147994,1
4,10152808,0


In [74]:
submission = submission[['ID_CPTE', 'Default']].merge(results, on='ID_CPTE')

In [75]:
submission = submission[['ID_CPTE', 'Default_y']]

In [76]:
submission = submission.rename(columns={'Default_y': 'Default'})

In [77]:
submission.head()

Unnamed: 0,ID_CPTE,Default
0,71424379,0
1,64887111,0
2,69431075,0
3,31823308,0
4,39407834,0


In [78]:
submission.to_csv('lgb_submission.csv', index_label=False)

In [79]:
tmp = pd.read_csv('submission.csv')

In [80]:
tmp.head()

Unnamed: 0,ID_CPTE,Default
0,71424379,0
1,64887111,0
2,69431075,0
3,31823308,0
4,39407834,0


In [168]:
len(results)

5100

In [169]:
performance_test.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,71424379,2014-12-01,
1,64887111,2015-12-01,
2,69431075,2014-12-01,
3,31823308,2016-12-01,
4,39407834,2012-12-01,
