# Model training

In [27]:
import re
import sys
import gc
import numpy as np
import pandas as pd
from pprint import pprint
from time import time
import time as time_m
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import  FeatureUnion #Pipeline,
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
# from sklearn.externals import joblib
from datetime import datetime

In [48]:
data = pd.read_csv('dataset/feature_selected_train&test.csv')
cate = pd.read_csv('dataset/feature_selected_train&test_cate_woe.csv')
kmeans = pd.read_csv('dataset/kmeans_label.csv')[['User_id','kmeans_group','time']]

In [49]:
cols = cate.columns
data[cols] = cate[cols]

In [50]:
cols = []
with open('dataset/feature_names.txt') as f:
    for line in f.readlines():
        cols.append(line.strip()) 
# print(cols)

In [60]:
data=pd.merge(data,kmeans,left_on=['User_id','Month_received'],right_on=['User_id','time'], how='left')
data= data.drop(['time'],axis=1)

0          2016-05-01
1                 NaN
2                 NaN
3          2016-06-01
4          2016-05-01
              ...    
1166917    2016-07-01
1166918    2016-07-01
1166919    2016-07-01
1166920    2016-07-01
1166921    2016-07-01
Name: time, Length: 1166922, dtype: object

In [32]:
train = data[(data['data']=='Train') & (data['target'] != -1)]
test = data[(data['data']=='Test') & (data['target'] != -1)]
train = train.drop('data',axis=1)
test = test.drop('data',axis=1)

In [33]:
del data, cate, kmeans
gc.collect()

1522

In [34]:
train_x = train[cols]
test_x = test[cols]
# columns_to_drop = list(train_x.columns[:6])
# train_x = train_x.drop(columns_to_drop,axis = 1)
# test_x = test_x.drop(columns_to_drop, axis = 1)
train_y = train.target.astype("int")

In [35]:
train_x,test_x,train_y,test_y = train_test_split(train_x,train_y, test_size=0.3,random_state=5,stratify=train_y)

In [36]:
train_x.shape

(847315, 85)

In [37]:
train_y.value_counts()

0    792103
1     55212
Name: target, dtype: int64

In [38]:
train_x = train_x.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
test_x = test_x.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [39]:
del train,test
gc.collect()

102

In [40]:
# svm = SVC(probability=True) #class_weight=svmweight, C=svmc, kernel=svmkernel, 
rf = RandomForestClassifier(random_state=114514)
lr = LogisticRegression(solver = 'saga',random_state =114514)
xgb = XGBClassifier(booster='gbtree',gamma=1,seed=114514)
smote = SMOTE(random_state=114514)
ros = RandomOverSampler(sampling_strategy = 0.7,random_state=114514)
lgb = LGBMClassifier(n_jobs=-1)

In [41]:
def get_param_grid(clr):
    if clr == 'lgb':
        param_grid = dict(
            classifier__metric = ['auc', 'binary_logloss'],
            classifier__num_iterations=[50,100,150],
            classifier__seed = [114514],
            classifier__max_depth = [10, 20],
            classifier__objective = ['binary'],
            upsample__sampling_strategy = [0.1,0.7],
            classifier__num_leaves = [15,35] #be smaller than 2^(max_depth)
        )
    elif clr == 'rf':
        param_grid = dict(
            classifier__n_estimators = [10,15],
            upsample__sampling_strategy = [0.1,0.7]
        )
    elif clr == 'lr':
        param_grid = dict(
            classifier__penalty = ['none','l1','l2'],
            classifier__class_weight = ['balanced'],
            upsample__sampling_strategy = [0.1,0.7]
        )
    elif clr == 'xgb':
        param_grid = dict(
            classifier__n_estimators=[50,100,150],
            classifier__learning_rate=[0.1],
            classifier__max_depth=[5, 10, 20],
            classifier__tree_method = ['hist'],
            classifier__eval_metric = ['auc', 'binary_logloss'],
            classifier__seed = [0],
            upsample__sampling_strategy = [0.1,0.7]
        )
    return param_grid
        

In [42]:
param_grid = 0
models = []
for classifier in zip([lgb,lr,rf,xgb],['lgb','lr','rf','xgb']):
# for classifier in zip([lgb],['lgb']):
    print("Performing grid search...")
    print("Classifier:", [classifier[1]])
    param_grid = get_param_grid(classifier[1])
    pipeline = Pipeline([('upsample',ros),('classifier',classifier[0])])
    grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=10, n_jobs=6,scoring='roc_auc')
    if classifier[0] == xgb:
        grid_search.fit(train_x, train_y)
    else:
        cache = train_x.fillna(0)
        grid_search.fit(cache, train_y)
        del cache
        gc.collect()
    models.append((grid_search.best_score_,grid_search.best_estimator_))

Performing grid search...
Classifier: ['lgb']
Fitting 5 folds for each of 48 candidates, totalling 240 fits




Performing grid search...
Classifier: ['lr']
Fitting 5 folds for each of 6 candidates, totalling 30 fits




Performing grid search...
Classifier: ['rf']
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Performing grid search...
Classifier: ['xgb']
Fitting 5 folds for each of 36 candidates, totalling 180 fits


 0.7783791  0.77896256 0.78221739 0.78027044 0.78393788 0.78000769
 0.7849263  0.76028929 0.78078002 0.75324018 0.77879416 0.75154119
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan
        nan        nan        nan        nan        nan        nan]


In [43]:
models

[(0.7740519248829528,
  Pipeline(steps=[('upsample',
                   RandomOverSampler(random_state=114514, sampling_strategy=0.7)),
                  ('classifier',
                   LGBMClassifier(max_depth=10, metric='auc', num_iterations=150,
                                  num_leaves=35, objective='binary',
                                  seed=114514))])),
 (0.6204357542571204,
  Pipeline(steps=[('upsample',
                   RandomOverSampler(random_state=114514, sampling_strategy=0.7)),
                  ('classifier',
                   LogisticRegression(class_weight='balanced',
                                      random_state=114514, solver='saga'))])),
 (0.7699390422738324,
  Pipeline(steps=[('upsample',
                   RandomOverSampler(random_state=114514, sampling_strategy=0.1)),
                  ('classifier',
                   RandomForestClassifier(n_estimators=15, random_state=114514))])),
 (0.7849262995417513,
  Pipeline(steps=[('upsample',
          

In [44]:
grid_search

GridSearchCV(estimator=Pipeline(steps=[('upsample',
                                        RandomOverSampler(random_state=114514,
                                                          sampling_strategy=0.7)),
                                       ('classifier',
                                        XGBClassifier(base_score=None,
                                                      booster='gbtree',
                                                      colsample_bylevel=None,
                                                      colsample_bynode=None,
                                                      colsample_bytree=None,
                                                      gamma=1, gpu_id=None,
                                                      importance_type='gain',
                                                      interaction_constraints=None,
                                                      learning_rate=None,
                                             

In [45]:
def get_predict(models, train_x, train_y, test_x):
    total_score = sum([i[0] for i in models])
    res = []
    for i in models:
        if 'xgb' in str(i[1]):
            print('1',i[1])
            i[1].fit(train_x, train_y)
            y = i[1].predict_proba(test_x)
            print(y)
            y = y*i[0]/total_score
            res.append(y)
        else:
            print('2' , i[1])
            cache = train_x.fillna(0)
            i[1].fit(cache, train_y)
            cache = test_x.fillna(0)
            y = i[1].predict_proba(cache)
            print(y)
            y = y*i[0]/total_score
            res.append(y)
            del cache
            gc.collect()
        print(res)
    return res

In [46]:
res = get_predict(models, train_x, train_y, test_x)

2 Pipeline(steps=[('upsample',
                 RandomOverSampler(random_state=114514, sampling_strategy=0.7)),
                ('classifier',
                 LGBMClassifier(max_depth=10, metric='auc', num_iterations=150,
                                num_leaves=35, objective='binary',
                                seed=114514))])




[[0.6539372  0.3460628 ]
 [0.6539372  0.3460628 ]
 [0.6539372  0.3460628 ]
 ...
 [0.52695281 0.47304719]
 [0.6539372  0.3460628 ]
 [0.68324023 0.31675977]]
[array([[0.17162454, 0.0908235 ],
       [0.17162454, 0.0908235 ],
       [0.17162454, 0.0908235 ],
       ...,
       [0.13829773, 0.12415031],
       [0.17162454, 0.0908235 ],
       [0.17931506, 0.08313298]])]
2 Pipeline(steps=[('upsample',
                 RandomOverSampler(random_state=114514, sampling_strategy=0.7)),
                ('classifier',
                 LogisticRegression(class_weight='balanced',
                                    random_state=114514, solver='saga'))])




[[0.50000006 0.49999994]
 [0.50000006 0.49999994]
 [0.50000006 0.49999994]
 ...
 [0.50224658 0.49775342]
 [0.50000006 0.49999994]
 [0.35415349 0.64584651]]
[array([[0.17162454, 0.0908235 ],
       [0.17162454, 0.0908235 ],
       [0.17162454, 0.0908235 ],
       ...,
       [0.13829773, 0.12415031],
       [0.17162454, 0.0908235 ],
       [0.17931506, 0.08313298]]), array([[0.10518168, 0.10518166],
       [0.10518168, 0.10518166],
       [0.10518168, 0.10518166],
       ...,
       [0.10565427, 0.10470907],
       [0.10518168, 0.10518166],
       [0.07450091, 0.13586243]])]
2 Pipeline(steps=[('upsample',
                 RandomOverSampler(random_state=114514, sampling_strategy=0.1)),
                ('classifier',
                 RandomForestClassifier(n_estimators=15, random_state=114514))])
[[0.92974728 0.07025272]
 [0.92974728 0.07025272]
 [0.92974728 0.07025272]
 ...
 [1.         0.        ]
 [0.92974728 0.07025272]
 [0.6        0.4       ]]
[array([[0.17162454, 0.0908235 ],
     



[[0.92725354 0.07274644]
 [0.92725354 0.07274644]
 [0.92725354 0.07274644]
 ...
 [0.8944893  0.10551071]
 [0.92725354 0.07274644]
 [0.96165735 0.03834264]]
[array([[0.17162454, 0.0908235 ],
       [0.17162454, 0.0908235 ],
       [0.17162454, 0.0908235 ],
       ...,
       [0.13829773, 0.12415031],
       [0.17162454, 0.0908235 ],
       [0.17931506, 0.08313298]]), array([[0.10518168, 0.10518166],
       [0.10518168, 0.10518166],
       [0.10518168, 0.10518166],
       ...,
       [0.10565427, 0.10470907],
       [0.10518168, 0.10518166],
       [0.07450091, 0.13586243]]), array([[0.24271382, 0.01833972],
       [0.24271382, 0.01833972],
       [0.24271382, 0.01833972],
       ...,
       [0.26105354, 0.        ],
       [0.24271382, 0.01833972],
       [0.15663212, 0.10442142]]), array([[0.2467747 , 0.01936038],
       [0.2467747 , 0.01936038],
       [0.2467747 , 0.01936038],
       ...,
       [0.23805498, 0.0280801 ],
       [0.2467747 , 0.01936038],
       [0.25593075, 0.01020432

In [47]:
([i[0] for i in models])

[0.7740519248829528,
 0.6204357542571204,
 0.7699390422738324,
 0.7849262995417513]