In [21]:
import pandas as pd
import numpy as np
import scipy
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
from lightgbm import LGBMClassifier 
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, BaseEnsemble, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import eli5
import shap
from sklearn import model_selection

# Let wide/long frames render in full: raise both display limits to 1000.
for _opt in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_opt, 1000)

In [2]:
# Single frame that is later sliced into train and test by row position.
data_path = '../data/data_full_categorical_1705.csv'
data = pd.read_csv(data_path)

In [3]:
# Partition the columns of `data` into three name lists (column order preserved):
#   cat_cols     - object-dtype columns
#   cat_num_cols - non-object columns named 'RESULT' or containing 'ordinal'
#   cnt_cols     - every other non-object column
cat_cols = [name for name in data.columns if data[name].dtype == object]
non_object_cols = [name for name in data.columns if not (data[name].dtype == object)]
cnt_cols = [name for name in non_object_cols if name != 'RESULT' and 'ordinal' not in name]
cat_num_cols = [name for name in non_object_cols if not (name != 'RESULT' and 'ordinal' not in name)]

In [4]:
# Display the names of the object-dtype columns collected in cat_cols.
cat_cols

In [5]:
# Display cat_num_cols: non-object columns that are 'RESULT' or have 'ordinal' in their name.
cat_num_cols

In [6]:
# Display cnt_cols: the non-object columns that are neither 'RESULT' nor 'ordinal' columns.
cnt_cols

In [7]:
# Drop the row identifier and the target from the feature-name lists.
# Filtering (instead of list.remove) keeps this cell idempotent: re-running it
# after the names are already gone is a no-op rather than a ValueError.
cat_cols = [name for name in cat_cols if name != 'Unique_ID']
cat_num_cols = [name for name in cat_num_cols if name != 'RESULT']

In [8]:
# Read the raw training file; its row count marks where the training rows end
# inside the combined `data` frame.
train_path = '../data/CAX_MortgageModeling_Train.csv'
train = pd.read_csv(train_path)
train_len = len(train)

In [9]:
# First `train_len` rows of `data` are the training set, the remainder is the test set.
train, test = data.iloc[:train_len], data.iloc[train_len:]

In [10]:
# Undersample: keep 30% of the RESULT == 0 rows plus every RESULT == 1 row,
# then shuffle the combined frame (frac=1) with a fixed seed.
result0_sample = train[train.RESULT == 0].sample(frac=.3, random_state=123)
result1_all = train[train.RESULT == 1]
train2 = pd.concat([result0_sample, result1_all], axis=0)
train2 = train2.sample(frac=1, random_state=123)

In [11]:
# Only the first of the five shuffled folds is used as a fixed hold-out split,
# once for the full training frame and once for the undersampled one.
kf = KFold(n_splits=5, shuffle=True, random_state=123)
train_index, test_index = next(kf.split(train))
train_index2, test_index2 = next(kf.split(train2))

In [12]:
# Baseline random forest on the undersampled frame (train2), fitted on the
# training part of the first fold and scored on its held-out part.
feature_cols = cnt_cols + cat_num_cols
X_fit = train2[feature_cols].iloc[train_index2]
y_fit = train2.RESULT.iloc[train_index2]
X_val = train2[feature_cols].iloc[test_index2]
y_val = train2.RESULT.iloc[test_index2]

# random_state pins the forest's internal randomness so a re-run prints the
# same metrics; without it every execution gives different numbers.
rf = RandomForestClassifier(n_estimators=1000, random_state=123)
rf.fit(X_fit, y_fit)
pred = rf.predict(X_val)
print(accuracy_score(y_val, pred))
print(confusion_matrix(y_val, pred))
print(f1_score(y_val, pred))

0.6297112090063632
[[1478  688]
 [ 825 1095]]
0.5914123683499865


In [18]:
# Class-weighted random forest on the full (imbalanced) training frame,
# fitted on the training part of the first fold and scored on its held-out part.
feature_cols = cnt_cols + cat_num_cols
X_fit = train[feature_cols].iloc[train_index]
y_fit = train.RESULT.iloc[train_index]
X_val = train[feature_cols].iloc[test_index]
y_val = train.RESULT.iloc[test_index]

# random_state makes the printed metrics reproducible across re-runs.
rf = RandomForestClassifier(
    n_estimators=10000,
    class_weight={0:1,1:3},
    max_depth=6,
    random_state=123,
)
rf.fit(X_fit, y_fit)
pred = rf.predict(X_val)
print(accuracy_score(y_val, pred))
print(confusion_matrix(y_val, pred))
print(f1_score(y_val, pred))

# Sweep the decision threshold on the second predict_proba column over
# [0.10, 0.50) in steps of 0.01 and keep the one with the highest F1.
# NOTE(review): the cutoff is selected and then scored on the same held-out
# fold, so the figures printed below are optimistic.
pred_proba = rf.predict_proba(X_val)[:,1]
cutoffs = list(np.arange(.1,.5,.01))
f1s = [f1_score(y_val, pred_proba > cutoff) for cutoff in cutoffs]

# Locate the best cutoff once instead of recomputing argmax for every print.
best_idx = int(np.argmax(f1s))
best_cutoff = cutoffs[best_idx]
print(f1s[best_idx], best_cutoff)
print(accuracy_score(y_val, pred_proba > best_cutoff))
print(confusion_matrix(y_val, pred_proba > best_cutoff))

0.7315149523496549
[[5982 1252]
 [1199  696]]
0.3622170179547229
0.4019017432646593 0.4199999999999998
0.5865921787709497
[[4087 3147]
 [ 627 1268]]


### Hyperparameter optimization

In [19]:
from bayes_opt import BayesianOptimization

In [41]:
def train_rf(max_depth,n_estimators,max_features,colsample_bytree,learning_rate):
    """Fit a RandomForestClassifier on the train2 fold and return its hold-out F1.

    Used as the objective `f` of a BayesianOptimization run. `max_depth` and
    `n_estimators` are truncated to int before use. `colsample_bytree` and
    `learning_rate` are ignored; they are accepted so the signature matches
    the keys of the shared `bounds` dict.

    Reads the notebook globals train2, cnt_cols, cat_num_cols, train_index2
    and test_index2. Prints the score and the parameters used.
    """
    feature_cols = cnt_cols + cat_num_cols
    params = dict(
        max_depth=int(max_depth),
        n_estimators=int(n_estimators),
        max_features=max_features,
    )
    model = RandomForestClassifier(random_state=42, **params)
    model.fit(train2[feature_cols].iloc[train_index2], train2.RESULT.iloc[train_index2])
    holdout_pred = model.predict(train2[feature_cols].iloc[test_index2])
    score = f1_score(train2.RESULT.iloc[test_index2], holdout_pred)
    print("F1 {:.3f} params {}".format(score, params))
    return score

In [42]:
def train_gbm(max_depth,n_estimators,max_features,colsample_bytree,learning_rate):
    """Fit a GradientBoostingClassifier on the train2 fold and return its hold-out F1.

    Used as the objective `f` of a BayesianOptimization run. `max_depth` and
    `n_estimators` are truncated to int before use. `colsample_bytree` is
    ignored; it is accepted so the signature matches the keys of the shared
    `bounds` dict.

    Reads the notebook globals train2, cnt_cols, cat_num_cols, train_index2
    and test_index2. Prints the score and the parameters used.
    """
    params = dict(
        max_depth=int(max_depth),
        n_estimators=int(n_estimators),
        max_features=max_features,
        learning_rate=learning_rate,
    )
    features = train2[cnt_cols+cat_num_cols]
    target = train2.RESULT
    booster = GradientBoostingClassifier(random_state=42, **params)
    booster.fit(features.iloc[train_index2], target.iloc[train_index2])
    score = f1_score(target.iloc[test_index2], booster.predict(features.iloc[test_index2]))
    print("F1 {:.3f} params {}".format(score, params))
    return score

In [50]:
def train_xgb(max_depth,n_estimators,max_features,colsample_bytree,learning_rate):
    """Fit an XGBClassifier on the train2 fold and return its hold-out F1.

    Used as the objective `f` of a BayesianOptimization run. `max_depth` and
    `n_estimators` are truncated to int before use. `max_features` is ignored;
    it is accepted so the signature matches the keys of the shared `bounds`
    dict.

    Reads the notebook globals train2, cnt_cols, cat_num_cols, train_index2
    and test_index2. Prints the score and the parameters used.
    """
    params = dict(
        max_depth=int(max_depth),
        n_estimators=int(n_estimators),
        colsample_bytree=colsample_bytree,
        learning_rate=learning_rate,
    )
    columns = cnt_cols + cat_num_cols
    fit_rows = train2.iloc[train_index2]
    holdout_rows = train2.iloc[test_index2]
    xgb_model = XGBClassifier(random_state=42, **params)
    xgb_model.fit(fit_rows[columns], fit_rows.RESULT)
    score = f1_score(holdout_rows.RESULT, xgb_model.predict(holdout_rows[columns]))
    print("F1 {:.3f} params {}".format(score, params))
    return score

In [51]:
def train_lgb(max_depth,n_estimators,max_features,colsample_bytree,learning_rate):
    """Fit an LGBMClassifier on the train2 fold and return its hold-out F1.

    Used as the objective `f` of a BayesianOptimization run. `max_depth` and
    `n_estimators` are truncated to int before use. `max_features` is ignored;
    it is accepted so the signature matches the keys of the shared `bounds`
    dict.

    Reads the notebook globals train2, cnt_cols, cat_num_cols, train_index2
    and test_index2. Prints the score and the parameters used.
    """
    params = {
        'max_depth': int(max_depth),
        'n_estimators': int(n_estimators),
        'colsample_bytree': colsample_bytree,
        'learning_rate': learning_rate
    }
    feature_cols = cnt_cols + cat_num_cols
    clf = LGBMClassifier(**params, random_state=42)
    # The original fit call was missing its closing parenthesis (SyntaxError).
    clf.fit(train2[feature_cols].iloc[train_index2], train2.RESULT.iloc[train_index2])
    pred = clf.predict(train2[feature_cols].iloc[test_index2])
    score = f1_score(train2.RESULT.iloc[test_index2],pred)
    print("F1 {:.3f} params {}".format(score, params))
    return score

In [44]:
# Search space passed as `pbounds` to every BayesianOptimization run below:
# parameter name -> (lower, upper). Each train_* objective uses only the
# subset of these parameters that applies to its model.
bounds = dict(
    max_depth=(5, 20),
    n_estimators=(100, 5000),
    max_features=(.5, 1),
    colsample_bytree=(.5, 1),
    learning_rate=(.05, .15),
)

In [43]:
# Bayesian search over `bounds`, maximising the hold-out F1 returned by train_rf.
rf_search_args = dict(f=train_rf, pbounds=bounds, random_state=1)
optimizer = BayesianOptimization(**rf_search_args)
optimizer.maximize(n_iter=20, init_points=10)

|   iter    |  target   | colsam... | learni... | max_depth | max_fe... | n_esti... |
-------------------------------------------------------------------------------------
F1 0.458 params {'max_depth': 2, 'n_estimators': 819, 'max_features': 0.6511662863159199}
|  1        |  0.4581   |  0.7085   |  0.122    |  2.001    |  0.6512   |  819.1    |
F1 0.572 params {'max_depth': 4, 'n_estimators': 2740, 'max_features': 0.698383737115335}
|  2        |  0.5721   |  0.5462   |  0.06863  |  4.764    |  0.6984   |  2.74e+03 |
F1 0.561 params {'max_depth': 3, 'n_estimators': 234, 'max_features': 0.9390587181954727}
|  3        |  0.5609   |  0.7096   |  0.1185   |  3.636    |  0.9391   |  234.2    |
F1 0.567 params {'max_depth': 6, 'n_estimators': 1070, 'max_features': 0.5701934692976169}
|  4        |  0.5667   |  0.8352   |  0.09173  |  6.47     |  0.5702   |  1.071e+0 |
F1 0.569 params {'max_depth': 4, 'n_estimators': 4394, 'max_features': 0.846161307834657}
|  5        |  0.5695   |  0.9004

KeyboardInterrupt: 

In [52]:
# Bayesian search over `bounds`, maximising the hold-out F1 returned by train_lgb.
lgb_search_args = dict(f=train_lgb, pbounds=bounds, random_state=1)
optimizer = BayesianOptimization(**lgb_search_args)
optimizer.maximize(n_iter=20, init_points=10)

|   iter    |  target   | colsam... | learni... | max_depth | max_fe... | n_esti... |
-------------------------------------------------------------------------------------
F1 0.608 params {'max_depth': 5, 'n_estimators': 819, 'colsample_bytree': 0.7085110023512871, 'learning_rate': 0.12203244934421581}
|  1        |  0.6077   |  0.7085   |  0.122    |  5.002    |  0.6512   |  819.1    |
F1 0.604 params {'max_depth': 10, 'n_estimators': 2740, 'colsample_bytree': 0.5461692973843989, 'learning_rate': 0.06862602113776708}
|  2        |  0.6041   |  0.5462   |  0.06863  |  10.18    |  0.6984   |  2.74e+03 |
F1 0.607 params {'max_depth': 8, 'n_estimators': 234, 'colsample_bytree': 0.7095972572016473, 'learning_rate': 0.11852195003967594}
|  3        |  0.6069   |  0.7096   |  0.1185   |  8.067    |  0.9391   |  234.2    |
F1 0.606 params {'max_depth': 13, 'n_estimators': 1070, 'colsample_bytree': 0.8352337550892011, 'learning_rate': 0.0917304802367127}
|  4        |  0.6065   |  0.8352   |  

KeyboardInterrupt: 

In [53]:
# Bayesian search over `bounds`, maximising the hold-out F1 returned by train_xgb.
xgb_search_args = dict(f=train_xgb, pbounds=bounds, random_state=1)
optimizer = BayesianOptimization(**xgb_search_args)
optimizer.maximize(n_iter=20, init_points=10)

|   iter    |  target   | colsam... | learni... | max_depth | max_fe... | n_esti... |
-------------------------------------------------------------------------------------
F1 0.611 params {'max_depth': 5, 'n_estimators': 819, 'colsample_bytree': 0.7085110023512871, 'learning_rate': 0.12203244934421581}
|  1        |  0.6107   |  0.7085   |  0.122    |  5.002    |  0.6512   |  819.1    |
F1 0.599 params {'max_depth': 10, 'n_estimators': 2740, 'colsample_bytree': 0.5461692973843989, 'learning_rate': 0.06862602113776708}
|  2        |  0.599    |  0.5462   |  0.06863  |  10.18    |  0.6984   |  2.74e+03 |
F1 0.613 params {'max_depth': 8, 'n_estimators': 234, 'colsample_bytree': 0.7095972572016473, 'learning_rate': 0.11852195003967594}
|  3        |  0.6131   |  0.7096   |  0.1185   |  8.067    |  0.9391   |  234.2    |
F1 0.598 params {'max_depth': 13, 'n_estimators': 1070, 'colsample_bytree': 0.8352337550892011, 'learning_rate': 0.0917304802367127}
|  4        |  0.5977   |  0.8352   |  

KeyboardInterrupt: 

In [None]:
# Bayesian search over `bounds`, maximising the hold-out F1 returned by train_gbm.
gbm_search_args = dict(f=train_gbm, pbounds=bounds, random_state=1)
optimizer = BayesianOptimization(**gbm_search_args)
optimizer.maximize(n_iter=20, init_points=10)