## Model Test(Logistic regression, random forest, LightGBM)

In [1]:
# import the relevant computational modules

# data manipulation
import pandas as pd #data processing
import numpy as np #linear algebra

# Models Packages
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn import feature_selection
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Gradient Boosting
import lightgbm as lgb

# Basic Model
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Oversampling
from imblearn.over_sampling import SMOTE

# split dataset
from sklearn.model_selection import train_test_split
from sklearn.cross_validation import KFold

# grid search
from sklearn.model_selection import GridSearchCV


# visualization 
import matplotlib.pyplot as plt
%matplotlib inline



In [349]:
# import data

transaction_training = pd.read_csv('../raw_data/transactions_train.csv')
payment_training = pd.read_csv('../raw_data/paiements_train.csv')
billing_training = pd.read_csv('../raw_data/facturation_train.csv')
performance_training = pd.read_csv('../raw_data/performance_train.csv')


transaction_test = pd.read_csv('../raw_data/transactions_test.csv')
payment_test = pd.read_csv('../raw_data/paiements_test.csv')
billing_test = pd.read_csv('../raw_data/facturation_test.csv')
performance_test = pd.read_csv('../raw_data/performance_test.csv')

In [3]:
payment_training.head()

Unnamed: 0,ID_CPTE,TRANSACTION_AMT,TRANSACTION_DTTM,PAYMENT_REVERSAL_XFLG
0,99690111,208.0,2015-04-26 00:00:00,Q
1,99690111,176.8,2015-05-28 00:00:00,Q
2,99690111,200.0,2015-03-27 04:00:00,Q
3,99690111,80.8,2015-04-02 00:00:00,Q
4,99690111,250.0,2015-11-24 00:00:00,Q


In [4]:
billing_training.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle
0,99690111,2015-05-01,2015-05-03,8497.84,4293.12,16200.0,0
1,99690111,2014-11-01,2014-11-03,866.0,0.0,12000.0,0
2,99690111,2015-06-01,2015-05-31,10790.95,5224.44,16200.0,0
3,99690111,2015-10-01,2015-10-04,12388.46,4786.08,16200.0,0
4,99690111,2015-11-01,2015-11-02,12746.5,4818.48,16200.0,0


In [5]:
performance_training.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,99690111,2015-12-01,0
1,57427180,2012-12-01,0
2,29617912,2015-12-01,0
3,61632809,2015-12-01,0
4,14117855,2013-12-01,0


In [6]:
# Create basic scikit-learn wrapper model class
class SklearnWrapper:
    def __init__(self, clf, seed=0, params=None, seed_bool=True):
        if (seed_bool == True):
            params['random_state'] = seed
        self.clf = clf(**params)
    
    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)
        
    def predict(self, x):
        return self.clf.predict(x)

In [7]:
# create basic xgboost wrapper model class
class XgbWrapper:
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 250)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [8]:
# create basic lightGBM wrapper model class
class LightGbmWrapper:
    def __init(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1550)
        self.verbose_eval = params.pop('verbose_eval', 100)
        
    def train(self, x_train, y_train):
        lgtrain = lgb.Dataset(x_train, y_train)
        self.lgbm = lgb.train(self.param, lgtrain, num_boost_round=self.nrounds, verbose_eval=self.verbose_eval)
    
    def predict(self, x):
        return self.lgbm.predict(lgb.Dataset(x))

In [9]:
# create out-of-fold predictions 
# make good use of k-fold CV's result 
# serving for the staking alogrithm 
# create a new column generated from model's score

def get_oof(clf, x_train, y, x_test):
    '''
    clf: the classifer, which can be logistic regression, SVM regression, Bayes classifier, etc.
    x_train: the training x in training dataset
    y: the training y in training dataset
    x_test: the testing x in training dataset 
    '''
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    for i, (train_index, test_index) in enumerate(kf):
        print('\nFold {}'.format(i))
        x_tr = x_train[train_index]
        y_tr = y[train_index]
        x_te = x_train[test_index]
        
        clf.fit(x_tr, y_tr)
        
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
    
    m = stats.mode(oof_test_skf, axis=1)
    oof_test[:] = m[0][0]
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)       

In [367]:
# data preprocess

class DataPreprocess:
    def __init__(self, label_encoder):
        self.lbl = label_encoder
    
    def convert_date(self, statement_date, period_date):
        statement_day = statement_date.split('-')[-1]
        period_day = period_date.split('-')[-1]
        statement_month = statement_date.split('-')[-2]
        period_month = period_date.split('-')[-2]
        if int(statement_month) < int(period_month):
            tmp = 0
        else:
            if int(statement_day) > 20:
                tmp = 1
            else:
                tmp = 0
        return tmp
    
    
    def initialize_billing(self, billing_df):
        tmp = []
        for index, row in billing_df.iterrows():
            tmp.append(self.convert_date(row['StatementDate'], row['PERIODID_MY']))

        billing_df['statement_time'] = tmp
        
        return billing_df
    
    def preprocess_transcation(self, transaction_df):
        categorical_columns = ['MERCHANT_CATEGORY_XCD', 'MERCHANT_CITY_NAME', 'MERCHANT_COUNTRY_XCD', 
                               'DECISION_XCD', 'TRANSACTION_CATEGORY_XCD', 'TRANSACTION_TYPE_XCD', 'SICGROUP']
        
        for col in categorical_columns:
            transaction_df[col].fillna('unknown')
            transaction_df[col] = self.lbl.fit_transform(transaction_df[col].astype(str))
        
        transaction_df = transaction_df.groupby(['ID_CPTE', 'MERCHANT_CATEGORY_XCD'])['TRANSACTION_AMT'].sum()
        transaction_df = transaction_df.reset_index()
        transaction_df = transaction_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'MERCHANT_CATEGORY_XCD')
        transaction_df.columns = ['MERCHANT_CATEGORY_' + str(i) for i in transaction_df.columns]
        transaction_df = transaction_df.fillna(0)
        
        return transaction_df
    
    def preprocess_payment(self, payment_df):
        payment_df = payment_df.dropna()
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: str(x).split(' ')[0][:-3])
        payment_df = payment_df.sort_values(['ID_CPTE', 'TRANSACTION_DTTM'])
        payment_df['PAYMENT_N_COUNT'] = payment_df['PAYMENT_REVERSAL_XFLG'] == 'N'
        
        payment_df = payment_df.groupby(['ID_CPTE', 'TRANSACTION_DTTM'])[['TRANSACTION_AMT', 'PAYMENT_N_COUNT']].sum().reset_index()
        payment_df = payment_df.groupby('ID_CPTE').tail(12)
        
        tmp = payment_df.groupby(['ID_CPTE'])['PAYMENT_N_COUNT'].sum().reset_index()
        
        payment_df['TRANSACTION_DTTM'] = payment_df['TRANSACTION_DTTM'].apply(lambda x: x.split('-')[1])
        payment_df = payment_df.pivot_table('TRANSACTION_AMT', ['ID_CPTE'], 'TRANSACTION_DTTM')
        payment_df.columns = ['transaction_' + str(i) for i in payment_df.columns + '_month']
        payment_df = payment_df.reset_index()
        payment_df = payment_df.fillna(0)
        
        payment_df = payment_df.merge(tmp, on='ID_CPTE')
        
        return payment_df
    
    def preprocess_billing(self, billing_df):
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[:-3])
        billing_df = billing_df.sort_values(['ID_CPTE', 'PERIODID_MY'])
        billing_df = billing_df.reset_index(drop=True)
        billing_df = billing_df.groupby('ID_CPTE').tail(12)
        billing_df = billing_df.reset_index(drop=True)
        billing_df['CreditLeft'] = billing_df['CreditLimit'] - billing_df['CurrentTotalBalance']
        
        billing_df['PERIODID_MY'] = billing_df['PERIODID_MY'].apply(lambda x: x[-2:])
        credit_left = billing_df.pivot_table('CreditLeft', ['ID_CPTE'], 'PERIODID_MY')
        credit_left.columns = ['credit_left_' + str(i) for i in credit_left.columns + '_month']
        cash_balance = billing_df.pivot_table('CashBalance', ['ID_CPTE'], 'PERIODID_MY')
        cash_balance.columns = ['cash_balance_' + str(i) for i in cash_balance.columns + '_month']
        
        delq_cycle_avg = billing_df.groupby(['ID_CPTE'])['DelqCycle'].mean().reset_index()
        delq_cycle_avg = delq_cycle_avg.rename(columns={'DelqCycle': 'AvgDelqCycle'})
        
        delq_cycle = billing_df.groupby(['ID_CPTE'])['DelqCycle'].max().reset_index()
        delq_cycle = delq_cycle.rename(columns={'DelqCycle': 'MaxDelqCycle'})
        
        late_count = billing_df.groupby(['ID_CPTE'])['statement_time'].sum().reset_index()
        late_count = late_count.rename(columns={'statement_time': 'LateCount'})
        
        credit_limit_avg = billing_df.groupby(['ID_CPTE'])['CreditLimit'].mean().reset_index()
        credit_limit_avg = credit_limit_avg.rename(columns={'CreditLimit': 'CreditLimitAvg'})
        
        credit_limit_min = billing_df.groupby(['ID_CPTE'])['CreditLimit'].min().reset_index()
        credit_limit_min = credit_limit_min.rename(columns={'CreditLimit': 'CreditLimitMin'})
        
        credit_limit_max = billing_df.groupby(['ID_CPTE'])['CreditLimit'].max().reset_index()
        credit_limit_max = credit_limit_max.rename(columns={'CreditLimit': 'CreditLimitMax'})
        
        tmp1 = billing_df.groupby(['ID_CPTE'])[['ID_CPTE', 'CreditLimit']].head(1).set_index('ID_CPTE')
        tmp2 = billing_df.groupby(['ID_CPTE'])[['ID_CPTE', 'CreditLimit']].tail(1).set_index('ID_CPTE')
        credit_change = tmp2 - tmp1
        credit_change = credit_change.reset_index()
        credit_change = credit_change.rename(columns={'CreditLimit': 'CreditChange'})
        
        credit_left = credit_left.reset_index()
        cash_balance = cash_balance.reset_index()
        
        tmp = credit_left.merge(cash_balance, on='ID_CPTE')
        tmp = tmp.merge(credit_limit_avg, on='ID_CPTE')
        tmp = tmp.merge(credit_limit_min, on='ID_CPTE')
        tmp = tmp.merge(credit_limit_max, on='ID_CPTE')
        tmp = tmp.merge(credit_change, on='ID_CPTE')
        
        tmp = tmp.merge(delq_cycle, on='ID_CPTE')
        tmp = tmp.merge(delq_cycle_avg, on='ID_CPTE')
        tmp = tmp.merge(late_count, on='ID_CPTE')
        
        return tmp
    
    def merge(self, payment, billing):
        merge_df = payment.merge(billing, on='ID_CPTE', how='right')
        return merge_df.set_index(['ID_CPTE']) 

In [354]:
billing_training.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,StatementDate,CurrentTotalBalance,CashBalance,CreditLimit,DelqCycle,statement_time
0,99690111,2015-05,2015-05-03,8497.84,4293.12,16200.0,0,0
1,99690111,2014-11,2014-11-03,866.0,0.0,12000.0,0,0
2,99690111,2015-06,2015-05-31,10790.95,5224.44,16200.0,0,0
3,99690111,2015-10,2015-10-04,12388.46,4786.08,16200.0,0,0
4,99690111,2015-11,2015-11-02,12746.5,4818.48,16200.0,0,0


In [340]:
tmp1 = billing_test.groupby(['ID_CPTE'])[['ID_CPTE', 'CreditLimit']].head(1).set_index('ID_CPTE')
tmp2 = billing_test.groupby(['ID_CPTE'])[['ID_CPTE', 'CreditLimit']].tail(1).set_index('ID_CPTE')

In [364]:
tmp_ = tmp2 - tmp1

In [366]:
tmp_.reset_index().rename(columns={'CreditLimit': 'CreditChange'})

Unnamed: 0,ID_CPTE,CreditChange
0,71424379,0.0
1,64887111,0.0
2,69431075,0.0
3,31823308,0.0
4,39407834,0.0
5,94576690,0.0
6,35602796,0.0
7,94837853,0.0
8,84623445,0.0
9,62980143,0.0


In [334]:
label_encoder = preprocessing.LabelEncoder()

In [360]:
preprocess = DataPreprocess(label_encoder)

In [350]:
billing_training = preprocess.initialize_billing(billing_training)

In [351]:
processed_payment = preprocess.preprocess_payment(payment_training)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [361]:
processed_billing = preprocess.preprocess_billing(billing_training)

In [362]:
processed_billing.head()

Unnamed: 0,credit_left_2_month,cash_balance_2_month,CreditLimitAvg,CreditLimitMin,CreditLimitMax,CreditChange,MaxDelqCycle,AvgDelqCycle,ID_CPTE,LateCount


In [323]:
processed_data = preprocess.merge(processed_payment, processed_billing)

In [324]:
transaction_col = processed_data.iloc[:, :12].columns

In [325]:
# deal with missing value in payment
for col in transaction_col:
    replace_value = processed_data[processed_data[col].notna()][col].mean()
    processed_data[col] = processed_data[col].fillna(replace_value)

In [326]:
processed_data = processed_data.fillna(0)

In [327]:
processed_data = processed_data.reset_index()

In [328]:
processed_data = processed_data.merge(performance_training[['ID_CPTE', 'Default']], on='ID_CPTE')

In [329]:
processed_data = processed_data.set_index('ID_CPTE')

In [330]:
processed_data.head(20)

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,cash_balance_11_month,cash_balance_12_month,CreditLimitAvg,CreditLimitMin,CreditLimitMax,CreditChange,MaxDelqCycle,AvgDelqCycle,LateCount,Default
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


### Features correlation and distribution analysis

In [221]:
processed_data.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,cash_balance_10_month,cash_balance_11_month,cash_balance_12_month,CreditLimitAvg,CreditLimitMin,CreditLimitMax,MaxDelqCycle,AvgDelqCycle,LateCount,Default
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10001822,318.0,522.5,374.5,4200.0,262.5,265.0,267.5,300.0,250.0,231.75,...,0.0,101.0,0.0,11500.0,11500.0,11500.0,0,0.0,0,0
10007972,784.34,168.28,1050.0,559.5,664.0,313.5,843.71,191.9,945.9,701.47,...,0.0,0.0,0.0,700.0,700.0,700.0,0,0.0,1,0
10012520,0.0,86.1,458.0,1177.31,315.0,525.0,505.0,50.5,1115.0,612.0,...,849.66,1224.0,777.65,2533.333333,2200.0,3200.0,0,0.0,0,0
10025534,131.3,0.0,260.0,0.0,6264.0,0.0,2080.0,0.0,318.0,0.0,...,0.0,0.0,0.0,6100.0,6100.0,6100.0,1,0.416667,12,1
10033579,574.55,391.3,412.23,470.53,546.8,419.61,283.92,106.0,84.0,219.6,...,0.0,0.0,0.0,500.0,500.0,500.0,1,0.083333,0,0


In [222]:
processed_data.shape

(11900, 44)

In [223]:
X = np.array(processed_data.iloc[:, :-1])
y = np.array(processed_data.iloc[:, -1])

In [224]:
from sklearn.preprocessing import normalize

In [225]:
X_tmp = normalize(X[:, :-3])

In [226]:
X = np.hstack((X_tmp, X[:, -3:]))

### Model test

In [227]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=103)

In [280]:
sm = SMOTE(random_state=42, ratio=1)
X_res, y_res = sm.fit_sample(X_train, y_train)



In [228]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
 
clf = LogisticRegression()
rf = RandomForestClassifier(min_samples_split=200, max_depth=20, random_state=0)

In [229]:
clf.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [230]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=200,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [231]:
predict_clf = clf.predict(X_test)

In [232]:
roc_auc_score(predict_clf, y_test)

0.7747653837193327

In [233]:
predict_rf = rf.predict(X_test)

In [234]:
roc_auc_score(predict_rf, y_test)

0.7740196078431373

In [239]:
importance = rf.feature_importances_
importance = pd.DataFrame(importance, index=processed_data.iloc[:, :-1].columns, columns=["Importance"])
importance.sort_values('Importance', ascending=False)

Unnamed: 0,Importance
credit_left_11_month,0.168163
credit_left_12_month,0.118133
credit_left_10_month,0.092951
credit_left_08_month,0.076884
AvgDelqCycle,0.076745
credit_left_09_month,0.075158
credit_left_04_month,0.066963
MaxDelqCycle,0.047234
credit_left_06_month,0.04325
transaction_12_month,0.030504


In [286]:
features_test = importance.reset_index().sort_values(['Importance'], ascending=False)['index'][:14].values

In [287]:
X = np.array(processed_data[features_test])
y = np.array(processed_data['Default'])

In [288]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=103)

In [289]:
clf = LogisticRegression()
rf = RandomForestClassifier(min_samples_split=200, max_depth=20, random_state=0)

In [290]:
clf.fit(X_train, y_train)
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=200,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [291]:
predict_clf = clf.predict(X_test)
roc_auc_score(predict_clf, y_test)

0.7941357704862216

In [292]:
predict_rf = rf.predict(X_test)
roc_auc_score(predict_rf, y_test)

0.7950622543686302

In [75]:
# LightGBM

In [76]:
lgbm_params =  {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    #'max_depth': 15,
    'num_leaves': 270,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.75,
    'bagging_freq': 2,
    'learning_rate': 0.0175,
    'verbose': 0
}

In [77]:
lgtrain = lgb.Dataset(X_train, y_train)
lgtest = lgb.Dataset(X_test, y_test)

In [119]:
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    valid_sets=lgtrain,
    num_boost_round=2000,
)

[1]	training's binary_logloss: 0.483921
[2]	training's binary_logloss: 0.47423
[3]	training's binary_logloss: 0.465425
[4]	training's binary_logloss: 0.45699
[5]	training's binary_logloss: 0.449553
[6]	training's binary_logloss: 0.441479
[7]	training's binary_logloss: 0.433863
[8]	training's binary_logloss: 0.426356
[9]	training's binary_logloss: 0.419756
[10]	training's binary_logloss: 0.413428
[11]	training's binary_logloss: 0.407148
[12]	training's binary_logloss: 0.401119
[13]	training's binary_logloss: 0.395703
[14]	training's binary_logloss: 0.390192
[15]	training's binary_logloss: 0.385123
[16]	training's binary_logloss: 0.379749
[17]	training's binary_logloss: 0.374884
[18]	training's binary_logloss: 0.370104
[19]	training's binary_logloss: 0.365283
[20]	training's binary_logloss: 0.360606
[21]	training's binary_logloss: 0.356278
[22]	training's binary_logloss: 0.35178
[23]	training's binary_logloss: 0.34769
[24]	training's binary_logloss: 0.343617
[25]	training's binary_loglos

[217]	training's binary_logloss: 0.0798726
[218]	training's binary_logloss: 0.0793293
[219]	training's binary_logloss: 0.0788381
[220]	training's binary_logloss: 0.0783138
[221]	training's binary_logloss: 0.0777764
[222]	training's binary_logloss: 0.077262
[223]	training's binary_logloss: 0.0767359
[224]	training's binary_logloss: 0.0762566
[225]	training's binary_logloss: 0.0757493
[226]	training's binary_logloss: 0.0752602
[227]	training's binary_logloss: 0.0747924
[228]	training's binary_logloss: 0.0743064
[229]	training's binary_logloss: 0.0738308
[230]	training's binary_logloss: 0.0733236
[231]	training's binary_logloss: 0.0728383
[232]	training's binary_logloss: 0.0723516
[233]	training's binary_logloss: 0.0718944
[234]	training's binary_logloss: 0.071447
[235]	training's binary_logloss: 0.0709896
[236]	training's binary_logloss: 0.0705045
[237]	training's binary_logloss: 0.0700671
[238]	training's binary_logloss: 0.0696074
[239]	training's binary_logloss: 0.0691581
[240]	trainin

[422]	training's binary_logloss: 0.0218415
[423]	training's binary_logloss: 0.0217076
[424]	training's binary_logloss: 0.0215794
[425]	training's binary_logloss: 0.0214538
[426]	training's binary_logloss: 0.0213401
[427]	training's binary_logloss: 0.0212038
[428]	training's binary_logloss: 0.0210724
[429]	training's binary_logloss: 0.0209343
[430]	training's binary_logloss: 0.0207948
[431]	training's binary_logloss: 0.0206777
[432]	training's binary_logloss: 0.0205529
[433]	training's binary_logloss: 0.0204317
[434]	training's binary_logloss: 0.0202993
[435]	training's binary_logloss: 0.0201688
[436]	training's binary_logloss: 0.0200414
[437]	training's binary_logloss: 0.0199326
[438]	training's binary_logloss: 0.0198185
[439]	training's binary_logloss: 0.0196926
[440]	training's binary_logloss: 0.0195745
[441]	training's binary_logloss: 0.0194529
[442]	training's binary_logloss: 0.0193428
[443]	training's binary_logloss: 0.0192264
[444]	training's binary_logloss: 0.0191097
[445]	train

[614]	training's binary_logloss: 0.0070722
[615]	training's binary_logloss: 0.00702785
[616]	training's binary_logloss: 0.00698927
[617]	training's binary_logloss: 0.00695216
[618]	training's binary_logloss: 0.00691558
[619]	training's binary_logloss: 0.00687379
[620]	training's binary_logloss: 0.00682668
[621]	training's binary_logloss: 0.00678841
[622]	training's binary_logloss: 0.00674601
[623]	training's binary_logloss: 0.00670685
[624]	training's binary_logloss: 0.00666821
[625]	training's binary_logloss: 0.00662948
[626]	training's binary_logloss: 0.00658736
[627]	training's binary_logloss: 0.00654766
[628]	training's binary_logloss: 0.00651091
[629]	training's binary_logloss: 0.00646586
[630]	training's binary_logloss: 0.00643026
[631]	training's binary_logloss: 0.00639167
[632]	training's binary_logloss: 0.00635977
[633]	training's binary_logloss: 0.00632338
[634]	training's binary_logloss: 0.006289
[635]	training's binary_logloss: 0.00624829
[636]	training's binary_logloss: 0.

[811]	training's binary_logloss: 0.00230331
[812]	training's binary_logloss: 0.00229155
[813]	training's binary_logloss: 0.00228093
[814]	training's binary_logloss: 0.00226763
[815]	training's binary_logloss: 0.00225562
[816]	training's binary_logloss: 0.00224411
[817]	training's binary_logloss: 0.00223347
[818]	training's binary_logloss: 0.00222154
[819]	training's binary_logloss: 0.00220878
[820]	training's binary_logloss: 0.00220002
[821]	training's binary_logloss: 0.00218629
[822]	training's binary_logloss: 0.00217443
[823]	training's binary_logloss: 0.00216013
[824]	training's binary_logloss: 0.0021465
[825]	training's binary_logloss: 0.00213531
[826]	training's binary_logloss: 0.00212401
[827]	training's binary_logloss: 0.00211153
[828]	training's binary_logloss: 0.00209901
[829]	training's binary_logloss: 0.00208591
[830]	training's binary_logloss: 0.00207619
[831]	training's binary_logloss: 0.00206427
[832]	training's binary_logloss: 0.0020509
[833]	training's binary_logloss: 0

[1021]	training's binary_logloss: 0.000704837
[1022]	training's binary_logloss: 0.000700978
[1023]	training's binary_logloss: 0.000697095
[1024]	training's binary_logloss: 0.000693944
[1025]	training's binary_logloss: 0.000689583
[1026]	training's binary_logloss: 0.000686081
[1027]	training's binary_logloss: 0.000682515
[1028]	training's binary_logloss: 0.000679164
[1029]	training's binary_logloss: 0.000674972
[1030]	training's binary_logloss: 0.000671144
[1031]	training's binary_logloss: 0.000668002
[1032]	training's binary_logloss: 0.000664017
[1033]	training's binary_logloss: 0.000661339
[1034]	training's binary_logloss: 0.000657808
[1035]	training's binary_logloss: 0.000654455
[1036]	training's binary_logloss: 0.000651213
[1037]	training's binary_logloss: 0.000646696
[1038]	training's binary_logloss: 0.000642739
[1039]	training's binary_logloss: 0.000639192
[1040]	training's binary_logloss: 0.000635799
[1041]	training's binary_logloss: 0.000632057
[1042]	training's binary_logloss: 

[1217]	training's binary_logloss: 0.000226349
[1218]	training's binary_logloss: 0.000224876
[1219]	training's binary_logloss: 0.000223841
[1220]	training's binary_logloss: 0.000222698
[1221]	training's binary_logloss: 0.000221445
[1222]	training's binary_logloss: 0.000220322
[1223]	training's binary_logloss: 0.000219257
[1224]	training's binary_logloss: 0.000217975
[1225]	training's binary_logloss: 0.000216707
[1226]	training's binary_logloss: 0.000215453
[1227]	training's binary_logloss: 0.00021417
[1228]	training's binary_logloss: 0.000213147
[1229]	training's binary_logloss: 0.00021215
[1230]	training's binary_logloss: 0.000211007
[1231]	training's binary_logloss: 0.000209548
[1232]	training's binary_logloss: 0.000208191
[1233]	training's binary_logloss: 0.000206903
[1234]	training's binary_logloss: 0.000205691
[1235]	training's binary_logloss: 0.000204331
[1236]	training's binary_logloss: 0.000203209
[1237]	training's binary_logloss: 0.000202052
[1238]	training's binary_logloss: 0.

[1413]	training's binary_logloss: 7.53527e-05
[1414]	training's binary_logloss: 7.49939e-05
[1415]	training's binary_logloss: 7.46025e-05
[1416]	training's binary_logloss: 7.42137e-05
[1417]	training's binary_logloss: 7.38108e-05
[1418]	training's binary_logloss: 7.33782e-05
[1419]	training's binary_logloss: 7.30081e-05
[1420]	training's binary_logloss: 7.26269e-05
[1421]	training's binary_logloss: 7.22448e-05
[1422]	training's binary_logloss: 7.18732e-05
[1423]	training's binary_logloss: 7.14708e-05
[1424]	training's binary_logloss: 7.11167e-05
[1425]	training's binary_logloss: 7.07089e-05
[1426]	training's binary_logloss: 7.03736e-05
[1427]	training's binary_logloss: 6.9979e-05
[1428]	training's binary_logloss: 6.96185e-05
[1429]	training's binary_logloss: 6.92715e-05
[1430]	training's binary_logloss: 6.89348e-05
[1431]	training's binary_logloss: 6.86006e-05
[1432]	training's binary_logloss: 6.82339e-05
[1433]	training's binary_logloss: 6.78668e-05
[1434]	training's binary_logloss: 6

[1636]	training's binary_logloss: 2.61192e-05
[1637]	training's binary_logloss: 2.60025e-05
[1638]	training's binary_logloss: 2.58853e-05
[1639]	training's binary_logloss: 2.57791e-05
[1640]	training's binary_logloss: 2.56637e-05
[1641]	training's binary_logloss: 2.55582e-05
[1642]	training's binary_logloss: 2.54468e-05
[1643]	training's binary_logloss: 2.53401e-05
[1644]	training's binary_logloss: 2.52375e-05
[1645]	training's binary_logloss: 2.5133e-05
[1646]	training's binary_logloss: 2.50298e-05
[1647]	training's binary_logloss: 2.49357e-05
[1648]	training's binary_logloss: 2.48416e-05
[1649]	training's binary_logloss: 2.47426e-05
[1650]	training's binary_logloss: 2.46404e-05
[1651]	training's binary_logloss: 2.4536e-05
[1652]	training's binary_logloss: 2.44429e-05
[1653]	training's binary_logloss: 2.43509e-05
[1654]	training's binary_logloss: 2.42655e-05
[1655]	training's binary_logloss: 2.41641e-05
[1656]	training's binary_logloss: 2.4071e-05
[1657]	training's binary_logloss: 2.3

[1870]	training's binary_logloss: 1.24342e-05
[1871]	training's binary_logloss: 1.24039e-05
[1872]	training's binary_logloss: 1.23791e-05
[1873]	training's binary_logloss: 1.23518e-05
[1874]	training's binary_logloss: 1.23231e-05
[1875]	training's binary_logloss: 1.22991e-05
[1876]	training's binary_logloss: 1.22736e-05
[1877]	training's binary_logloss: 1.22464e-05
[1878]	training's binary_logloss: 1.22221e-05
[1879]	training's binary_logloss: 1.21976e-05
[1880]	training's binary_logloss: 1.21701e-05
[1881]	training's binary_logloss: 1.21434e-05
[1882]	training's binary_logloss: 1.21202e-05
[1883]	training's binary_logloss: 1.20954e-05
[1884]	training's binary_logloss: 1.20623e-05
[1885]	training's binary_logloss: 1.20342e-05
[1886]	training's binary_logloss: 1.20074e-05
[1887]	training's binary_logloss: 1.19826e-05
[1888]	training's binary_logloss: 1.1954e-05
[1889]	training's binary_logloss: 1.19308e-05
[1890]	training's binary_logloss: 1.19068e-05
[1891]	training's binary_logloss: 1

In [120]:
predict_lgb = lgb_clf.predict(X_test)

In [121]:
predict_lgb = np.array([0 if i < 0.6 else 1 for i in predict_lgb])

In [122]:
roc_auc_score(predict_lgb, y_test)

0.7988009557998792

In [160]:
# CV based auroc score
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf, X, y, cv=10)

In [161]:
scores.mean()

0.8596627858253821

### Grid Search, tune parameters

In [125]:
# Create parameters to search
gridParams = {
    'learning_rate': [0.0125, 0.0175, 0.0225],
    'n_estimators': [40],
    'num_leaves': [170, 220, 270, 320],
    #'max_depth': [15, 25, 35],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'feature_fraction': [0.4, 0.5, 0.6]
    }

In [56]:
# Create classifier to use. Note that parameters have to be input manually
# not as a dict!
mdl = lgb.LGBMClassifier(boosting_type= 'gbdt', objective = 'binary')

In [57]:
grid = GridSearchCV(mdl, gridParams, verbose=0, cv=5, n_jobs=2)

In [58]:
grid.fit(X_res, y_res)

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


GridSearchCV(cv=5, error_score='raise',
       estimator=LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective='binary', random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0),
       fit_params=None, iid=True, n_jobs=2,
       param_grid={'learning_rate': [0.0125, 0.0175, 0.0225], 'n_estimators': [40], 'num_leaves': [170, 220, 270, 320], 'boosting_type': ['gbdt'], 'objective': ['binary'], 'feature_fraction': [0.4, 0.5, 0.6]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [59]:
# Print the best parameters found
print(grid.best_params_)
print(grid.best_score_)

{'boosting_type': 'gbdt', 'feature_fraction': 0.5, 'learning_rate': 0.0225, 'n_estimators': 40, 'num_leaves': 320, 'objective': 'binary'}
0.8844150432336702


In [91]:
# Using parameters already set above, replace in the best from the grid search

# params['max_bin'] = grid.best_params_['max_bin']
lgbm_params['feature_fraction'] = grid.best_params_['feature_fraction']
lgbm_params['learning_rate'] = grid.best_params_['learning_rate']
lgbm_params['num_leaves'] = grid.best_params_['num_leaves']
#lgbm_params['max_depth'] = grid.best_params_['max_depth']
#lgbm_params['reg_alpha'] = grid.best_params_['reg_alpha']
#lgbm_params['reg_lambda'] = grid.best_params_['reg_lambda']

In [70]:
print('Fitting with params: ')
print(lgbm_params)

Fitting with params: 
{'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 320, 'feature_fraction': 0.5, 'bagging_fraction': 0.75, 'bagging_freq': 2, 'learning_rate': 0.0225, 'verbose': 0}


In [126]:
lgbm_params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'binary_logloss', 'num_leaves': 320, 'feature_fraction': 0.5, 'bagging_fraction': 0.75, 'bagging_freq': 2, 'learning_rate': 0.0225, 'verbose': 0}

In [127]:
lgtrain = lgb.Dataset(X_train, y_train)
lgtest = lgb.Dataset(X_test, y_test)

In [128]:
lgb_clf = lgb.train(
    lgbm_params,
    lgtrain,
    valid_sets=lgtrain,
    num_boost_round=2000
)

[1]	training's binary_logloss: 0.48389
[2]	training's binary_logloss: 0.474278
[3]	training's binary_logloss: 0.465419
[4]	training's binary_logloss: 0.457028
[5]	training's binary_logloss: 0.449707
[6]	training's binary_logloss: 0.441757
[7]	training's binary_logloss: 0.434344
[8]	training's binary_logloss: 0.426919
[9]	training's binary_logloss: 0.420335
[10]	training's binary_logloss: 0.413895
[11]	training's binary_logloss: 0.407566
[12]	training's binary_logloss: 0.40167
[13]	training's binary_logloss: 0.39614
[14]	training's binary_logloss: 0.390756
[15]	training's binary_logloss: 0.385582
[16]	training's binary_logloss: 0.38014
[17]	training's binary_logloss: 0.375172
[18]	training's binary_logloss: 0.370514
[19]	training's binary_logloss: 0.365811
[20]	training's binary_logloss: 0.361176
[21]	training's binary_logloss: 0.356688
[22]	training's binary_logloss: 0.352116
[23]	training's binary_logloss: 0.34796
[24]	training's binary_logloss: 0.343785
[25]	training's binary_logloss

[220]	training's binary_logloss: 0.078174
[221]	training's binary_logloss: 0.0776206
[222]	training's binary_logloss: 0.0770574
[223]	training's binary_logloss: 0.0765112
[224]	training's binary_logloss: 0.0760052
[225]	training's binary_logloss: 0.0754996
[226]	training's binary_logloss: 0.0749912
[227]	training's binary_logloss: 0.0744813
[228]	training's binary_logloss: 0.0739605
[229]	training's binary_logloss: 0.0735121
[230]	training's binary_logloss: 0.0730239
[231]	training's binary_logloss: 0.0725395
[232]	training's binary_logloss: 0.0720216
[233]	training's binary_logloss: 0.0715387
[234]	training's binary_logloss: 0.0710778
[235]	training's binary_logloss: 0.0706146
[236]	training's binary_logloss: 0.0701231
[237]	training's binary_logloss: 0.0696934
[238]	training's binary_logloss: 0.0691961
[239]	training's binary_logloss: 0.0687162
[240]	training's binary_logloss: 0.0682545
[241]	training's binary_logloss: 0.0678161
[242]	training's binary_logloss: 0.0673949
[243]	traini

[422]	training's binary_logloss: 0.0215401
[423]	training's binary_logloss: 0.0214098
[424]	training's binary_logloss: 0.0212868
[425]	training's binary_logloss: 0.0211618
[426]	training's binary_logloss: 0.0210383
[427]	training's binary_logloss: 0.0209066
[428]	training's binary_logloss: 0.0207839
[429]	training's binary_logloss: 0.0206506
[430]	training's binary_logloss: 0.0205182
[431]	training's binary_logloss: 0.0203889
[432]	training's binary_logloss: 0.0202745
[433]	training's binary_logloss: 0.020153
[434]	training's binary_logloss: 0.020022
[435]	training's binary_logloss: 0.0199043
[436]	training's binary_logloss: 0.0197824
[437]	training's binary_logloss: 0.0196756
[438]	training's binary_logloss: 0.0195644
[439]	training's binary_logloss: 0.0194422
[440]	training's binary_logloss: 0.0193272
[441]	training's binary_logloss: 0.0192142
[442]	training's binary_logloss: 0.01912
[443]	training's binary_logloss: 0.0189857
[444]	training's binary_logloss: 0.0188637
[445]	training'

[630]	training's binary_logloss: 0.00621669
[631]	training's binary_logloss: 0.006182
[632]	training's binary_logloss: 0.0061465
[633]	training's binary_logloss: 0.00610969
[634]	training's binary_logloss: 0.0060752
[635]	training's binary_logloss: 0.00603915
[636]	training's binary_logloss: 0.00600874
[637]	training's binary_logloss: 0.00597243
[638]	training's binary_logloss: 0.00593535
[639]	training's binary_logloss: 0.00589821
[640]	training's binary_logloss: 0.00586767
[641]	training's binary_logloss: 0.00583341
[642]	training's binary_logloss: 0.00580063
[643]	training's binary_logloss: 0.00575977
[644]	training's binary_logloss: 0.00572943
[645]	training's binary_logloss: 0.00569473
[646]	training's binary_logloss: 0.00566273
[647]	training's binary_logloss: 0.00562896
[648]	training's binary_logloss: 0.00559706
[649]	training's binary_logloss: 0.00556078
[650]	training's binary_logloss: 0.00553222
[651]	training's binary_logloss: 0.00549751
[652]	training's binary_logloss: 0.0

[837]	training's binary_logloss: 0.00186615
[838]	training's binary_logloss: 0.00185577
[839]	training's binary_logloss: 0.00184575
[840]	training's binary_logloss: 0.00183529
[841]	training's binary_logloss: 0.00182382
[842]	training's binary_logloss: 0.00181351
[843]	training's binary_logloss: 0.00180279
[844]	training's binary_logloss: 0.00179381
[845]	training's binary_logloss: 0.00178375
[846]	training's binary_logloss: 0.00177296
[847]	training's binary_logloss: 0.00176383
[848]	training's binary_logloss: 0.00175437
[849]	training's binary_logloss: 0.00174424
[850]	training's binary_logloss: 0.00173306
[851]	training's binary_logloss: 0.00172428
[852]	training's binary_logloss: 0.00171539
[853]	training's binary_logloss: 0.00170646
[854]	training's binary_logloss: 0.0016959
[855]	training's binary_logloss: 0.0016873
[856]	training's binary_logloss: 0.00167714
[857]	training's binary_logloss: 0.00166795
[858]	training's binary_logloss: 0.00165821
[859]	training's binary_logloss: 0

[1025]	training's binary_logloss: 0.000653633
[1026]	training's binary_logloss: 0.000649204
[1027]	training's binary_logloss: 0.000645639
[1028]	training's binary_logloss: 0.000641737
[1029]	training's binary_logloss: 0.000638342
[1030]	training's binary_logloss: 0.000635344
[1031]	training's binary_logloss: 0.000631368
[1032]	training's binary_logloss: 0.00062728
[1033]	training's binary_logloss: 0.000624224
[1034]	training's binary_logloss: 0.000621362
[1035]	training's binary_logloss: 0.000618327
[1036]	training's binary_logloss: 0.000615863
[1037]	training's binary_logloss: 0.000613209
[1038]	training's binary_logloss: 0.000609447
[1039]	training's binary_logloss: 0.000605924
[1040]	training's binary_logloss: 0.00060314
[1041]	training's binary_logloss: 0.000599547
[1042]	training's binary_logloss: 0.000597547
[1043]	training's binary_logloss: 0.000594629
[1044]	training's binary_logloss: 0.000591481
[1045]	training's binary_logloss: 0.000588076
[1046]	training's binary_logloss: 0.

[1207]	training's binary_logloss: 0.000226022
[1208]	training's binary_logloss: 0.000224856
[1209]	training's binary_logloss: 0.000223187
[1210]	training's binary_logloss: 0.000221392
[1211]	training's binary_logloss: 0.000220283
[1212]	training's binary_logloss: 0.000219598
[1213]	training's binary_logloss: 0.000218094
[1214]	training's binary_logloss: 0.000216936
[1215]	training's binary_logloss: 0.000214951
[1216]	training's binary_logloss: 0.000213478
[1217]	training's binary_logloss: 0.000211924
[1218]	training's binary_logloss: 0.00021031
[1219]	training's binary_logloss: 0.000209056
[1220]	training's binary_logloss: 0.000208011
[1221]	training's binary_logloss: 0.000207135
[1222]	training's binary_logloss: 0.00020636
[1223]	training's binary_logloss: 0.000205743
[1224]	training's binary_logloss: 0.000204816
[1225]	training's binary_logloss: 0.000203463
[1226]	training's binary_logloss: 0.000202777
[1227]	training's binary_logloss: 0.000201697
[1228]	training's binary_logloss: 0.

[1402]	training's binary_logloss: 6.99636e-05
[1403]	training's binary_logloss: 6.95481e-05
[1404]	training's binary_logloss: 6.91949e-05
[1405]	training's binary_logloss: 6.88043e-05
[1406]	training's binary_logloss: 6.8381e-05
[1407]	training's binary_logloss: 6.80552e-05
[1408]	training's binary_logloss: 6.76914e-05
[1409]	training's binary_logloss: 6.7334e-05
[1410]	training's binary_logloss: 6.69918e-05
[1411]	training's binary_logloss: 6.66696e-05
[1412]	training's binary_logloss: 6.629e-05
[1413]	training's binary_logloss: 6.59239e-05
[1414]	training's binary_logloss: 6.55501e-05
[1415]	training's binary_logloss: 6.51702e-05
[1416]	training's binary_logloss: 6.48149e-05
[1417]	training's binary_logloss: 6.44228e-05
[1418]	training's binary_logloss: 6.40813e-05
[1419]	training's binary_logloss: 6.37522e-05
[1420]	training's binary_logloss: 6.34546e-05
[1421]	training's binary_logloss: 6.31226e-05
[1422]	training's binary_logloss: 6.28474e-05
[1423]	training's binary_logloss: 6.24

[1586]	training's binary_logloss: 2.85442e-05
[1587]	training's binary_logloss: 2.84278e-05
[1588]	training's binary_logloss: 2.83204e-05
[1589]	training's binary_logloss: 2.82165e-05
[1590]	training's binary_logloss: 2.80916e-05
[1591]	training's binary_logloss: 2.79647e-05
[1592]	training's binary_logloss: 2.78469e-05
[1593]	training's binary_logloss: 2.77338e-05
[1594]	training's binary_logloss: 2.76058e-05
[1595]	training's binary_logloss: 2.74874e-05
[1596]	training's binary_logloss: 2.73679e-05
[1597]	training's binary_logloss: 2.7242e-05
[1598]	training's binary_logloss: 2.71284e-05
[1599]	training's binary_logloss: 2.70151e-05
[1600]	training's binary_logloss: 2.6904e-05
[1601]	training's binary_logloss: 2.67999e-05
[1602]	training's binary_logloss: 2.66951e-05
[1603]	training's binary_logloss: 2.65822e-05
[1604]	training's binary_logloss: 2.64844e-05
[1605]	training's binary_logloss: 2.63776e-05
[1606]	training's binary_logloss: 2.62792e-05
[1607]	training's binary_logloss: 2.

[1828]	training's binary_logloss: 1.30811e-05
[1829]	training's binary_logloss: 1.30507e-05
[1830]	training's binary_logloss: 1.30215e-05
[1831]	training's binary_logloss: 1.29946e-05
[1832]	training's binary_logloss: 1.29687e-05
[1833]	training's binary_logloss: 1.29378e-05
[1834]	training's binary_logloss: 1.29093e-05
[1835]	training's binary_logloss: 1.28829e-05
[1836]	training's binary_logloss: 1.28548e-05
[1837]	training's binary_logloss: 1.28232e-05
[1838]	training's binary_logloss: 1.27929e-05
[1839]	training's binary_logloss: 1.27612e-05
[1840]	training's binary_logloss: 1.27349e-05
[1841]	training's binary_logloss: 1.27091e-05
[1842]	training's binary_logloss: 1.26818e-05
[1843]	training's binary_logloss: 1.26575e-05
[1844]	training's binary_logloss: 1.263e-05
[1845]	training's binary_logloss: 1.26017e-05
[1846]	training's binary_logloss: 1.25739e-05
[1847]	training's binary_logloss: 1.25449e-05
[1848]	training's binary_logloss: 1.252e-05
[1849]	training's binary_logloss: 1.24

In [129]:
predict_lgb = lgb_clf.predict(X_test)
predict_lgb = np.array([0 if i < 0.6 else 1 for i in predict_lgb])
roc_auc_score(predict_lgb, y_test)

0.7884536364881524

In [32]:
from sklearn.externals import joblib
joblib.dump(clf, 'random_forest.pkl')

['random_forest.pkl']

In [96]:
# deploy model
lgtrain = lgb.Dataset(X_res, y_res)

final_lgb = lgb.train(
    lgbm_params,
    lgtrain,
    valid_sets=lgtrain,
    num_boost_round=2000
)

[1]	training's binary_logloss: 0.679963
[2]	training's binary_logloss: 0.66497
[3]	training's binary_logloss: 0.652721
[4]	training's binary_logloss: 0.638789
[5]	training's binary_logloss: 0.625434
[6]	training's binary_logloss: 0.61472
[7]	training's binary_logloss: 0.603808
[8]	training's binary_logloss: 0.591721
[9]	training's binary_logloss: 0.581847
[10]	training's binary_logloss: 0.57042
[11]	training's binary_logloss: 0.561189
[12]	training's binary_logloss: 0.550441
[13]	training's binary_logloss: 0.541623
[14]	training's binary_logloss: 0.53153
[15]	training's binary_logloss: 0.52316
[16]	training's binary_logloss: 0.515254
[17]	training's binary_logloss: 0.505888
[18]	training's binary_logloss: 0.498422
[19]	training's binary_logloss: 0.489725
[20]	training's binary_logloss: 0.482526
[21]	training's binary_logloss: 0.47419
[22]	training's binary_logloss: 0.467509
[23]	training's binary_logloss: 0.460875
[24]	training's binary_logloss: 0.454472
[25]	training's binary_logloss:

[199]	training's binary_logloss: 0.0772638
[200]	training's binary_logloss: 0.0766379
[201]	training's binary_logloss: 0.0759774
[202]	training's binary_logloss: 0.0753103
[203]	training's binary_logloss: 0.0747256
[204]	training's binary_logloss: 0.0741488
[205]	training's binary_logloss: 0.0734999
[206]	training's binary_logloss: 0.0729097
[207]	training's binary_logloss: 0.07227
[208]	training's binary_logloss: 0.0715998
[209]	training's binary_logloss: 0.0709609
[210]	training's binary_logloss: 0.0703132
[211]	training's binary_logloss: 0.0697548
[212]	training's binary_logloss: 0.0691784
[213]	training's binary_logloss: 0.0685933
[214]	training's binary_logloss: 0.0680614
[215]	training's binary_logloss: 0.0674941
[216]	training's binary_logloss: 0.0669178
[217]	training's binary_logloss: 0.066328
[218]	training's binary_logloss: 0.0658082
[219]	training's binary_logloss: 0.0653198
[220]	training's binary_logloss: 0.0647656
[221]	training's binary_logloss: 0.0642212
[222]	training

[396]	training's binary_logloss: 0.0163086
[397]	training's binary_logloss: 0.0161884
[398]	training's binary_logloss: 0.0160625
[399]	training's binary_logloss: 0.0159455
[400]	training's binary_logloss: 0.0158206
[401]	training's binary_logloss: 0.0157
[402]	training's binary_logloss: 0.0155789
[403]	training's binary_logloss: 0.0154607
[404]	training's binary_logloss: 0.0153426
[405]	training's binary_logloss: 0.0152259
[406]	training's binary_logloss: 0.0151091
[407]	training's binary_logloss: 0.0149929
[408]	training's binary_logloss: 0.0148796
[409]	training's binary_logloss: 0.0147623
[410]	training's binary_logloss: 0.0146502
[411]	training's binary_logloss: 0.0145413
[412]	training's binary_logloss: 0.0144362
[413]	training's binary_logloss: 0.0143286
[414]	training's binary_logloss: 0.0142198
[415]	training's binary_logloss: 0.0141239
[416]	training's binary_logloss: 0.0140186
[417]	training's binary_logloss: 0.0139128
[418]	training's binary_logloss: 0.0138017
[419]	training

[586]	training's binary_logloss: 0.00391371
[587]	training's binary_logloss: 0.00388607
[588]	training's binary_logloss: 0.00385704
[589]	training's binary_logloss: 0.00382918
[590]	training's binary_logloss: 0.0038032
[591]	training's binary_logloss: 0.00377537
[592]	training's binary_logloss: 0.00374882
[593]	training's binary_logloss: 0.00372338
[594]	training's binary_logloss: 0.00369763
[595]	training's binary_logloss: 0.00367219
[596]	training's binary_logloss: 0.00364397
[597]	training's binary_logloss: 0.00361821
[598]	training's binary_logloss: 0.00359339
[599]	training's binary_logloss: 0.0035683
[600]	training's binary_logloss: 0.00354372
[601]	training's binary_logloss: 0.00351886
[602]	training's binary_logloss: 0.00349407
[603]	training's binary_logloss: 0.00346782
[604]	training's binary_logloss: 0.00344381
[605]	training's binary_logloss: 0.0034181
[606]	training's binary_logloss: 0.0033936
[607]	training's binary_logloss: 0.00337161
[608]	training's binary_logloss: 0.0

[778]	training's binary_logloss: 0.000917744
[779]	training's binary_logloss: 0.000911321
[780]	training's binary_logloss: 0.000904474
[781]	training's binary_logloss: 0.000897863
[782]	training's binary_logloss: 0.000891265
[783]	training's binary_logloss: 0.000884333
[784]	training's binary_logloss: 0.000877636
[785]	training's binary_logloss: 0.000871198
[786]	training's binary_logloss: 0.000864699
[787]	training's binary_logloss: 0.000857901
[788]	training's binary_logloss: 0.000851647
[789]	training's binary_logloss: 0.000845216
[790]	training's binary_logloss: 0.000838978
[791]	training's binary_logloss: 0.000832331
[792]	training's binary_logloss: 0.000826494
[793]	training's binary_logloss: 0.000820733
[794]	training's binary_logloss: 0.000814784
[795]	training's binary_logloss: 0.000808437
[796]	training's binary_logloss: 0.000802418
[797]	training's binary_logloss: 0.000796565
[798]	training's binary_logloss: 0.000790911
[799]	training's binary_logloss: 0.000784638
[800]	trai

[966]	training's binary_logloss: 0.000237133
[967]	training's binary_logloss: 0.000235747
[968]	training's binary_logloss: 0.000234227
[969]	training's binary_logloss: 0.000232402
[970]	training's binary_logloss: 0.000231023
[971]	training's binary_logloss: 0.000229196
[972]	training's binary_logloss: 0.000227612
[973]	training's binary_logloss: 0.00022639
[974]	training's binary_logloss: 0.000225063
[975]	training's binary_logloss: 0.000223813
[976]	training's binary_logloss: 0.000222086
[977]	training's binary_logloss: 0.000220834
[978]	training's binary_logloss: 0.000219167
[979]	training's binary_logloss: 0.000217519
[980]	training's binary_logloss: 0.000216224
[981]	training's binary_logloss: 0.000214986
[982]	training's binary_logloss: 0.000213927
[983]	training's binary_logloss: 0.000212762
[984]	training's binary_logloss: 0.000211161
[985]	training's binary_logloss: 0.000209501
[986]	training's binary_logloss: 0.000207771
[987]	training's binary_logloss: 0.000206628
[988]	train

[1151]	training's binary_logloss: 7.28559e-05
[1152]	training's binary_logloss: 7.2054e-05
[1153]	training's binary_logloss: 7.12271e-05
[1154]	training's binary_logloss: 7.04427e-05
[1155]	training's binary_logloss: 6.96573e-05
[1156]	training's binary_logloss: 6.8886e-05
[1157]	training's binary_logloss: 6.81261e-05
[1158]	training's binary_logloss: 6.74114e-05
[1159]	training's binary_logloss: 6.67075e-05
[1160]	training's binary_logloss: 6.60037e-05
[1161]	training's binary_logloss: 6.53071e-05
[1162]	training's binary_logloss: 6.46151e-05
[1163]	training's binary_logloss: 6.39462e-05
[1164]	training's binary_logloss: 6.32633e-05
[1165]	training's binary_logloss: 6.25969e-05
[1166]	training's binary_logloss: 6.19462e-05
[1167]	training's binary_logloss: 6.13293e-05
[1168]	training's binary_logloss: 6.06975e-05
[1169]	training's binary_logloss: 6.00733e-05
[1170]	training's binary_logloss: 5.94693e-05
[1171]	training's binary_logloss: 5.88608e-05
[1172]	training's binary_logloss: 5.

[1338]	training's binary_logloss: 1.86435e-05
[1339]	training's binary_logloss: 1.85559e-05
[1340]	training's binary_logloss: 1.84692e-05
[1341]	training's binary_logloss: 1.83826e-05
[1342]	training's binary_logloss: 1.83039e-05
[1343]	training's binary_logloss: 1.82201e-05
[1344]	training's binary_logloss: 1.81407e-05
[1345]	training's binary_logloss: 1.80676e-05
[1346]	training's binary_logloss: 1.79917e-05
[1347]	training's binary_logloss: 1.79126e-05
[1348]	training's binary_logloss: 1.7833e-05
[1349]	training's binary_logloss: 1.77569e-05
[1350]	training's binary_logloss: 1.7679e-05
[1351]	training's binary_logloss: 1.75983e-05
[1352]	training's binary_logloss: 1.75279e-05
[1353]	training's binary_logloss: 1.74535e-05
[1354]	training's binary_logloss: 1.73776e-05
[1355]	training's binary_logloss: 1.7304e-05
[1356]	training's binary_logloss: 1.7228e-05
[1357]	training's binary_logloss: 1.71516e-05
[1358]	training's binary_logloss: 1.70838e-05
[1359]	training's binary_logloss: 1.70

[1523]	training's binary_logloss: 1.01089e-05
[1524]	training's binary_logloss: 1.00826e-05
[1525]	training's binary_logloss: 1.00551e-05
[1526]	training's binary_logloss: 1.0032e-05
[1527]	training's binary_logloss: 1.00089e-05
[1528]	training's binary_logloss: 9.98494e-06
[1529]	training's binary_logloss: 9.961e-06
[1530]	training's binary_logloss: 9.93624e-06
[1531]	training's binary_logloss: 9.91411e-06
[1532]	training's binary_logloss: 9.89048e-06
[1533]	training's binary_logloss: 9.86731e-06
[1534]	training's binary_logloss: 9.84421e-06
[1535]	training's binary_logloss: 9.82192e-06
[1536]	training's binary_logloss: 9.79646e-06
[1537]	training's binary_logloss: 9.77435e-06
[1538]	training's binary_logloss: 9.75123e-06
[1539]	training's binary_logloss: 9.72864e-06
[1540]	training's binary_logloss: 9.70609e-06
[1541]	training's binary_logloss: 9.68156e-06
[1542]	training's binary_logloss: 9.65961e-06
[1543]	training's binary_logloss: 9.63833e-06
[1544]	training's binary_logloss: 9.6

[1703]	training's binary_logloss: 7.0865e-06
[1704]	training's binary_logloss: 7.07601e-06
[1705]	training's binary_logloss: 7.06373e-06
[1706]	training's binary_logloss: 7.05289e-06
[1707]	training's binary_logloss: 7.04251e-06
[1708]	training's binary_logloss: 7.03189e-06
[1709]	training's binary_logloss: 7.02024e-06
[1710]	training's binary_logloss: 7.00908e-06
[1711]	training's binary_logloss: 6.99807e-06
[1712]	training's binary_logloss: 6.98759e-06
[1713]	training's binary_logloss: 6.97678e-06
[1714]	training's binary_logloss: 6.96561e-06
[1715]	training's binary_logloss: 6.95475e-06
[1716]	training's binary_logloss: 6.94468e-06
[1717]	training's binary_logloss: 6.93301e-06
[1718]	training's binary_logloss: 6.92261e-06
[1719]	training's binary_logloss: 6.91172e-06
[1720]	training's binary_logloss: 6.90023e-06
[1721]	training's binary_logloss: 6.8906e-06
[1722]	training's binary_logloss: 6.88054e-06
[1723]	training's binary_logloss: 6.8698e-06
[1724]	training's binary_logloss: 6.8

[1885]	training's binary_logloss: 5.52485e-06
[1886]	training's binary_logloss: 5.51804e-06
[1887]	training's binary_logloss: 5.51168e-06
[1888]	training's binary_logloss: 5.50587e-06
[1889]	training's binary_logloss: 5.49901e-06
[1890]	training's binary_logloss: 5.49267e-06
[1891]	training's binary_logloss: 5.48657e-06
[1892]	training's binary_logloss: 5.47892e-06
[1893]	training's binary_logloss: 5.47216e-06
[1894]	training's binary_logloss: 5.466e-06
[1895]	training's binary_logloss: 5.4598e-06
[1896]	training's binary_logloss: 5.45356e-06
[1897]	training's binary_logloss: 5.44636e-06
[1898]	training's binary_logloss: 5.43952e-06
[1899]	training's binary_logloss: 5.43297e-06
[1900]	training's binary_logloss: 5.42741e-06
[1901]	training's binary_logloss: 5.42125e-06
[1902]	training's binary_logloss: 5.4146e-06
[1903]	training's binary_logloss: 5.40882e-06
[1904]	training's binary_logloss: 5.40225e-06
[1905]	training's binary_logloss: 5.39626e-06
[1906]	training's binary_logloss: 5.39

### Fit the final model to the testing data

In [94]:
preprocess = DataPreprocess(label_encoder)
processed_payment_test = preprocess.preprocess_payment(payment_test)
billing_test = preprocess.initialize_billing(billing_test)
processed_billing_test = preprocess.preprocess_billing(billing_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [95]:
processed_test = preprocess.merge(processed_payment_test, processed_billing_test)

In [96]:
for col in transaction_col:
    replace_value = processed_test[processed_test[col].notna()][col].mean()
    processed_test[col] = processed_test[col].fillna(replace_value)

In [97]:
processed_test = processed_test.fillna(0)

In [98]:
processed_test.head()

Unnamed: 0_level_0,transaction_01_month,transaction_02_month,transaction_03_month,transaction_04_month,transaction_05_month,transaction_06_month,transaction_07_month,transaction_08_month,transaction_09_month,transaction_10_month,...,cash_balance_06_month,cash_balance_07_month,cash_balance_08_month,cash_balance_09_month,cash_balance_10_month,cash_balance_11_month,cash_balance_12_month,MaxDelqCycle,AvgDelqCycle,LateCount
ID_CPTE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10074849,411.0,340.26,993.92,0.0,906.38,363.0,915.54,609.5,626.85,396.93,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
10086539,0.0,556.92,832.0,642.0,661.26,1880.0,950.86,1591.0,1048.6,500.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0
10140908,214.1,88.36,200.0,206.0,239.99,160.5,418.0,428.72,202.0,163.2,...,1.03,0.0,0.0,0.0,3.06,2.04,0.0,2,1.0,12
10147994,38.11,39.52,218.4,227.9,224.7,229.69,226.0,504.29,4.24,256.2,...,17.85,55.12,28.35,0.0,0.0,0.0,0.0,0,0.0,10
10152808,420.0,1030.0,510.0,423.3,877.5,1157.44,709.0,995.0,1015.0,515.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,12


In [100]:
X_test = np.array(processed_test)

In [101]:
X_tmp = normalize(X_test[:, :-3])
X_test = np.hstack((X_tmp, X_test[:, -3:]))

In [102]:
X_test.shape

(5100, 40)

In [103]:
predict_lgb = rf.predict(X_test)
#predict_lgb = np.array([0 if i < 0.6 else 1 for i in predict_lgb])

In [104]:
processed_test['Default'] = predict_lgb

In [105]:
results = processed_test.reset_index()[['ID_CPTE', 'Default']]

In [106]:
submission = pd.read_csv('../raw_data/performance_test.csv')

In [107]:
submission.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,71424379,2014-12-01,
1,64887111,2015-12-01,
2,69431075,2014-12-01,
3,31823308,2016-12-01,
4,39407834,2012-12-01,


In [108]:
results.head()

Unnamed: 0,ID_CPTE,Default
0,10074849,0
1,10086539,0
2,10140908,0
3,10147994,0
4,10152808,0


In [109]:
submission = submission[['ID_CPTE', 'Default']].merge(results, on='ID_CPTE')

In [110]:
submission = submission[['ID_CPTE', 'Default_y']]

In [111]:
submission = submission.rename(columns={'Default_y': 'Default'})

In [112]:
submission.head()

Unnamed: 0,ID_CPTE,Default
0,71424379,0
1,64887111,0
2,69431075,0
3,31823308,0
4,39407834,0


In [113]:
submission.to_csv('submission.csv', index_label=False)

In [79]:
tmp = pd.read_csv('submission.csv')

In [80]:
tmp.head()

Unnamed: 0,ID_CPTE,Default
0,71424379,0
1,64887111,0
2,69431075,0
3,31823308,0
4,39407834,0


In [168]:
len(results)

5100

In [169]:
performance_test.head()

Unnamed: 0,ID_CPTE,PERIODID_MY,Default
0,71424379,2014-12-01,
1,64887111,2015-12-01,
2,69431075,2014-12-01,
3,31823308,2016-12-01,
4,39407834,2012-12-01,
