# Model training

In [74]:
import sys
import gc
import numpy as np
import pandas as pd
from pprint import pprint
from time import time
import time as time_m
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model.logistic import LogisticRegression
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.externals import joblib
from datetime import datetime

In [54]:
data = pd.read_csv('dataset/feature_selected_train&test.csv')
cate = pd.read_csv('dataset/feature_selected_train&test_cate_woe.csv')

In [55]:
cols = cate.columns
data[cols] = cate[cols]

In [56]:
train = data[(data['data']=='Train') & (data['target'] != -1)]
test = data[(data['data']=='Test') & (data['target'] != -1)]
train = train.drop('data',axis=1)
test = test.drop('data',axis=1)

In [62]:
del data, cate
gc.collect()

716

In [57]:
train_x = train.drop('target',axis = 1)
test_x = test.drop('target',axis = 1)
train_y = train.target.astype("int")
test_y = test.target.astype("int")

In [60]:
train_x,test_x,train_y,test_y = train_test_split(train_x,train_y, test_size=0.3,random_state=5,stratify=train_y)

In [61]:
train_x.Month_received = train_x.Month_received.apply(lambda x:time_m.mktime(time_m.strptime(x,'%Y-%m-%d')))
test_x.Month_received = test_x.Month_received.apply(lambda x:time_m.mktime(time_m.strptime(x,'%Y-%m-%d')))

In [63]:
train_y.shape

(729606,)

In [81]:
train_y.value_counts()

0    684530
1     45076
Name: target, dtype: int64

In [64]:
del train,test
gc.collect()

40

In [47]:
svm = SVC(probability=True) #class_weight=svmweight, C=svmc, kernel=svmkernel, 
rf = RandomForestClassifier(random_state=114514)
lr = LogisticRegression(solver = 'saga',random_state =114514)
xgb = XGBClassifier(seed=114514)

In [75]:
def get_param_grid(clr):
    if clr == 'svm':
        param_grid = dict(
            C = [0.5,1,2],
            kernel=['linear', 'poly', 'rbf'],
            gamma=[0.5,1,3,5]
        )
    elif clr == 'rf':
        param_grid = dict(
            n_estimators = [10,15]
        )
    elif clr == 'lr':
        param_grid = dict(
            penalty = ['none','l1','l2'],
            class_weight = ['balanced']
        )
    elif clr == 'xgb':
        param_grid = dict(
            n_estimators=[50],
            learning_rate=[0.1],
            max_depth=[10]
        )
    return param_grid
        

In [92]:
# for classifier in zip([svm, rf, lr, xgb],['svm', 'rf', 'lr', 'xgb']):
for classifier in zip([lr,rf,xgb],['lr','rf','xgb']):
    param_grid = get_param_grid(classifier[1])

    # The CV indicates it does cross_validation. By default 5-fold cross validation
    grid_search = GridSearchCV(classifier[0], param_grid=param_grid, verbose=10, n_jobs=4,scoring='roc_auc')

    print("Performing grid search...")
    print("Classifier:", [classifier[1]])
    print("Parameters and HyperParameters:")
    pprint(param_grid)

    # Run for 1 classifier. I always time these, and log the time
    t0 = time()
    if classifier[0] == xgb:
        grid_search.fit(train_x, train_y)
    else:
        cache = train_x.fillna(0)
        grid_search.fit(cache, train_y)
        del cache
        gc.collect()
    print("Grid search done in %0.3fs \n" % (time() - t0))

    # Now print the score (the actual # is not important), and best param values
    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))

    sys.stdout.flush()

    # and run the predictions with the best parameters,sanity-check again
    if classifier[0] == xgb:
        pred_y = grid_search.predict(test_x)
    else:
        cache = test_x.fillna(0)
        pred_y = grid_search.predict(cache)
        del cache
        gc.collect()

    # Now compute metrics
    print("Classification Report")
    print(classification_report(test_y, pred_y))

    print("Confusion Matrix")
    print(confusion_matrix(test_y, pred_y))
    
    print("AUC")
    fpr, tpr, thresholds = metrics.roc_curve(test_y, pred_y)
    print(metrics.auc(fpr, tpr))

    print('\n')
    sys.stdout.flush()

Performing grid search...
Classifier: ['lr']
Parameters and HyperParameters:
{'class_weight': ['balanced'], 'penalty': ['none', 'l1', 'l2']}


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=4)]: Done   3 out of   9 | elapsed:  1.2min remaining:  2.4min
[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed:  1.8min remaining:  2.2min
[Parallel(n_jobs=4)]: Done   5 out of   9 | elapsed:  2.5min remaining:  2.0min
[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed:  3.0min remaining:  1.5min
[Parallel(n_jobs=4)]: Done   7 out of   9 | elapsed:  3.0min remaining:   51.6s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed:  3.5min remaining:    0.0s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed:  3.5min finished


Grid search done in 300.343s 

Best score: 0.584
Best parameters set:
	class_weight: 'balanced'
	penalty: 'none'




Classification Report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97    293370
           1       0.16      0.00      0.00     19319

    accuracy                           0.94    312689
   macro avg       0.55      0.50      0.49    312689
weighted avg       0.89      0.94      0.91    312689

Confusion Matrix
[[293250    120]
 [ 19296     23]]
AUC
0.5003907490166988


