In [2]:
import pandas as pd
import numpy as np
import scipy
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, confusion_matrix, precision_score, recall_score
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, BaseEnsemble, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier
import eli5
import shap
from sklearn import model_selection

pd.options.display.max_columns = 1000
pd.options.display.max_rows = 1000

In [2]:
train = pd.read_csv('../data/CAX_MortgageModeling_Train.csv')

In [3]:
train.RESULT = train.RESULT.apply(lambda x: 1 if x == 'FUNDED' else 0)
train.RESULT = train.RESULT.apply(lambda x: 1-x)

In [4]:
test = pd.read_csv('../data/CAX_MortgageModeling_Test.csv')

In [5]:
train_len = train.shape[0]
traincols = list(train.columns[2:-1])

In [6]:
data = pd.concat([train[['Unique_ID'] + traincols+['RESULT']],test[['Unique_ID']+traincols+['RESULT']]],axis=0)

In [7]:
data['GDS'] = np.abs(data['GDS'])
data['GDS'] = np.clip(data.GDS,0,data['GDS'].max())

In [8]:
data['TDS'] = np.abs(data['TDS'])
data['TDS'] = np.clip(data.TDS,0,data['TDS'].max())

In [9]:
data['GDS'] = data['GDS']/100
data['LTV'] = data['LTV']/100
data['TDS'] = data['TDS']/100
data['RATE'] = data['RATE']/100

In [10]:
data['APPRAISED PROPERTY VALUE'] = data['MORTGAGE AMOUNT']/data['LTV']
data['CHANGE IN PROPERTY VALUE'] = (data['APPRAISED PROPERTY VALUE']-data['PROPERTY VALUE'])/data['PROPERTY VALUE']
data['ANNUALIZED HOUSING EXPENSE'] = data['GDS'] * data['INCOME']
data['MORTGAGE RATIO'] = data['MORTGAGE AMOUNT']/data['PROPERTY VALUE']
data['OTHER EXPENSE'] = data['INCOME'] * data['TDS'] - data['ANNUALIZED HOUSING EXPENSE']
data['RENT INCOME'] = data.apply(lambda x: np.abs(x['OTHER EXPENSE']) if x.GDS > x.TDS else 0, axis=1)
data['OTHER EXPENSE'] = np.clip(data['OTHER EXPENSE'],0,data['OTHER EXPENSE'].max())

In [11]:
def f(x):
    if x['PAYMENT FREQUENCY'] in ['Bi-Weekly','Bi-Weekly Accelerated','Semi-Monthly'] :
        return 2
    elif x['PAYMENT FREQUENCY'] in ['Weekly','Weekly Accelerated']:
        return 4
    else:
        return x['PAYMENT FREQUENCY IN MONTHS']
data['PAYMENT FREQUENCY IN MONTHS'] = 1
data['PAYMENT FREQUENCY IN MONTHS'] = data.apply(lambda x: f(x), axis=1)

In [12]:
data['MORTGAGE PAYMENT'] = data['PAYMENT FREQUENCY IN MONTHS'] * data['MORTGAGE PAYMENT']

In [13]:
data['ACTUAL MORTGAGE PAYMENT'] = data.apply(lambda x: x['MORTGAGE AMOUNT'] * np.exp(x.RATE*x.AMORTIZATION*1.0/12), axis=1)
data['TOTAL INTEREST'] = data.apply(lambda x: x['ACTUAL MORTGAGE PAYMENT'] - x['MORTGAGE AMOUNT'], axis=1)

In [14]:
data['MORTGAGE AMOUNT PER INCOME'] = data['MORTGAGE AMOUNT']/data['INCOME']
data['MORTGAGE PAYMENT PER INCOME'] = data['MORTGAGE PAYMENT']*12/data['INCOME']
data['PROPERTY VALUE PER INCOME'] = data['PROPERTY VALUE']/data['INCOME']
data['TOTAL INTEREST PER INCOME'] = data['TOTAL INTEREST']/data['INCOME']
data['OTHER EXPENSE PER INCOME'] = data['OTHER EXPENSE']/data['INCOME']
data['ANNUALIZED HOUSING EXPENSE PER INCOME'] = data['ANNUALIZED HOUSING EXPENSE']/data['INCOME']

In [15]:
data['ACTUAL MORTGAGE PAYMENT BY MORTGAGE'] = data['ACTUAL MORTGAGE PAYMENT']/data['MORTGAGE AMOUNT']

In [16]:
cnt_cols = []
cat_cols = []
for col in data.columns:
    if data[col].dtype != object and col != 'RESULT':
        cnt_cols.append(col)
    else:
        cat_cols.append(col)

In [17]:
data['FSA'] = data['FSA'].apply(lambda x: str(x[:2]))

In [18]:
data.replace('Under 25',25,inplace=True)
data.replace('25-29',29,inplace=True)
data.replace('30-34',34,inplace=True)
data.replace('35-39',39,inplace=True)
data.replace('40-44',44,inplace=True)
data.replace('45-49',49,inplace=True)
data.replace('50-54',54,inplace=True)
data.replace('55-59',59,inplace=True)
data.replace('60-64',64,inplace=True)
data.replace('65-69',69,inplace=True)
data.replace('70 and over',75,inplace=True)

In [19]:
data['Is unemployed'] = data['AGE RANGE'].apply(lambda x: 1 if x == 25 else 0)
data['Is recently married'] = data['AGE RANGE'].apply(lambda x: 1 if x > 25 and x <= 34 else 0)
data['Is married'] = data['AGE RANGE'].apply(lambda x: 1 if x > 34 else 0)
data['Is established'] = data['AGE RANGE'].apply(lambda x: 1 if x >= 39 else 0)
data['Is about to retire'] = data['AGE RANGE'].apply(lambda x: 1 if x >= 55 else 0)
data['Is retired'] = data['AGE RANGE'].apply(lambda x: 1 if x >= 64 else 0)

In [20]:
data['REPAYMENT AGE'] = data.AMORTIZATION/12 + data['AGE RANGE']

In [21]:
for i in ['AGE RANGE','GENDER','FSA','NAICS CODE']:
    col = i + '_' + 'MORTGAGE PURPOSE' + '_' + 'PROPERTY TYPE'
    data[col] = data.apply(lambda x: str(x[i]) + '_' + str(x['MORTGAGE PURPOSE']) + '_' + str(x['PROPERTY TYPE']), axis=1)
    cat_cols.append(col)

In [22]:
data.replace(np.nan,0,inplace=True)
data.replace(np.inf,0,inplace=True)

In [23]:
#train = data.iloc[:train_len]
#test = data.iloc[train_len:]

In [24]:
#train2 = pd.concat([train[train.RESULT == 0].sample(frac=.3),train[train.RESULT == 1]],axis=0).sample(frac=1)

In [25]:
#kf = KFold(n_splits=5,random_state=123,shuffle=True)
#for train_index2, test_index2 in kf.split(train2):
#    break

In [26]:
'''
#dt = DecisionTreeClassifier(max_depth=3,random_state=123)
#dt.fit(train2[cnt_cols].iloc[train_index2],train2.RESULT.iloc[train_index2])
#pred = dt.predict(train2[cnt_cols].iloc[test_index2])
#print (accuracy_score(train2.RESULT.iloc[test_index2],pred))
#print (confusion_matrix(train2.RESULT.iloc[test_index2],pred))
#print (f1_score(train2.RESULT.iloc[test_index2],pred))

dot_data = StringIO()
export_graphviz(dt, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

from sklearn.tree import _tree
from sklearn.utils import check_array
def tree_to_code(tree, feature_names):

    tree_ = tree.tree_
    feature_name = [
        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"
        for i in tree_.feature
    ]

    def recurse(node, depth):
        indent = "  " * depth
        if tree_.feature[node] != _tree.TREE_UNDEFINED:
            name = feature_name[node]
            threshold = tree_.threshold[node]
            print ("{}if {} <= {}:".format(indent, name, threshold))
            recurse(tree_.children_left[node], depth + 1)
            print ("{}else:  # if {} > {}".format(indent, name, threshold))
            recurse(tree_.children_right[node], depth + 1)
        else:
            print ("{}return {}".format(indent, tree_.value[node]))

    recurse(0, 1)

#print ("Decision rules: ")
#tree_to_code(dt,feature_names=cnt_cols)
'''

'\n#dt = DecisionTreeClassifier(max_depth=3,random_state=123)\n#dt.fit(train2[cnt_cols].iloc[train_index2],train2.RESULT.iloc[train_index2])\n#pred = dt.predict(train2[cnt_cols].iloc[test_index2])\n#print (accuracy_score(train2.RESULT.iloc[test_index2],pred))\n#print (confusion_matrix(train2.RESULT.iloc[test_index2],pred))\n#print (f1_score(train2.RESULT.iloc[test_index2],pred))\n\ndot_data = StringIO()\nexport_graphviz(dt, out_file=dot_data,  \n                filled=True, rounded=True,\n                special_characters=True)\ngraph = pydotplus.graph_from_dot_data(dot_data.getvalue())  \nImage(graph.create_png())\n\nfrom sklearn.tree import _tree\nfrom sklearn.utils import check_array\ndef tree_to_code(tree, feature_names):\n\n    tree_ = tree.tree_\n    feature_name = [\n        feature_names[i] if i != _tree.TREE_UNDEFINED else "undefined!"\n        for i in tree_.feature\n    ]\n\n    def recurse(node, depth):\n        indent = "  " * depth\n        if tree_.feature[node] != _tre

In [27]:
cnt_cols = []
cat_cols = []
for col in data.columns:
    if data[col].dtype != object and col != 'RESULT':
        cnt_cols.append(col)
    else:
        cat_cols.append(col)

In [48]:
for i in cnt_cols :
    if data[i].nunique() > 10 and 'Z SCORE' not in i:
        data[i+"_ordinal"] = data[i].rank(pct=True)*10
        data[i+"_ordinal"] = data[i+"_ordinal"].astype(int)
        cat_cols.append(i+"_ordinal")

In [49]:
cnt_cols = []
cat_cols = []
for col in data.columns:
    if data[col].dtype != object and col != 'RESULT' and 'ordinal' not in col and data[col].nunique() > 10:
        cnt_cols.append(col)
    elif 'Z SCORE' not in col:
        cat_cols.append(col)

In [50]:
cat_cols

In [51]:
cnt_cols

In [52]:
data.to_csv('../data/data_full_categorical_1705.csv',index=False)

In [53]:
train = data.iloc[:train_len]
test = data.iloc[train_len:]

In [54]:
cat_cols.remove('Unique_ID')
cat_cols.remove('RESULT')

In [55]:
train2 = pd.concat([train[train.RESULT == 0].sample(frac=.3),train[train.RESULT == 1]],axis=0).sample(frac=1)

In [56]:
train2.shape

(20426, 75)

In [57]:
kf = KFold(n_splits=5,random_state=123,shuffle=True)
for train_index, test_index in kf.split(train):
    break

In [58]:
model = CatBoostClassifier(iterations=5000,learning_rate=0.15, loss_function= 'Logloss', eval_metric='F1',use_best_model=True,random_seed=42)

In [59]:
#cat_cols = ['MORTGAGE PURPOSE', 'PAYMENT FREQUENCY', 'PROPERTY TYPE', 'TERM', 'FSA', 'AGE RANGE', 'GENDER', 'INCOME TYPE', 'NAICS CODE','AGE RANGE_MORTGAGE PURPOSE_PROPERTY TYPE', 'GENDER_MORTGAGE PURPOSE_PROPERTY TYPE', 'FSA_MORTGAGE PURPOSE_PROPERTY TYPE', 'NAICS CODE_MORTGAGE PURPOSE_PROPERTY TYPE']

In [60]:
for train_index2, test_index2 in kf.split(train2):
    break

In [None]:
model.fit(train2[cat_cols+cnt_cols].iloc[train_index2],train2.RESULT.iloc[train_index2],cat_features=np.arange(len(cat_cols)),eval_set=(train2[cat_cols+cnt_cols].iloc[test_index2],train2.RESULT.iloc[test_index2]))

0:	learn: 0.6142483	test: 0.5978443	best: 0.5978443 (0)	total: 267ms	remaining: 22m 12s
1:	learn: 0.5779993	test: 0.5686539	best: 0.5978443 (0)	total: 755ms	remaining: 31m 26s
2:	learn: 0.6028752	test: 0.5884211	best: 0.5978443 (0)	total: 1.23s	remaining: 34m 3s
3:	learn: 0.6053256	test: 0.5942221	best: 0.5978443 (0)	total: 1.78s	remaining: 37m 9s
4:	learn: 0.6054344	test: 0.5825190	best: 0.5978443 (0)	total: 2.22s	remaining: 36m 56s
5:	learn: 0.6074703	test: 0.5794802	best: 0.5978443 (0)	total: 2.44s	remaining: 33m 50s
6:	learn: 0.6065176	test: 0.5821955	best: 0.5978443 (0)	total: 2.97s	remaining: 35m 15s
7:	learn: 0.6108314	test: 0.5909592	best: 0.5978443 (0)	total: 3.41s	remaining: 35m 30s
8:	learn: 0.6100902	test: 0.5896213	best: 0.5978443 (0)	total: 3.83s	remaining: 35m 27s
9:	learn: 0.6134431	test: 0.5923034	best: 0.5978443 (0)	total: 4.29s	remaining: 35m 40s
10:	learn: 0.6153109	test: 0.5960155	best: 0.5978443 (0)	total: 4.71s	remaining: 35m 35s
11:	learn: 0.6179084	test: 0.5983

92:	learn: 0.6560861	test: 0.6181003	best: 0.6251744 (48)	total: 30.3s	remaining: 26m 40s
93:	learn: 0.6567673	test: 0.6177130	best: 0.6251744 (48)	total: 30.8s	remaining: 26m 45s
94:	learn: 0.6568935	test: 0.6188341	best: 0.6251744 (48)	total: 31.2s	remaining: 26m 48s
95:	learn: 0.6569383	test: 0.6188341	best: 0.6251744 (48)	total: 31.5s	remaining: 26m 50s
96:	learn: 0.6567510	test: 0.6197420	best: 0.6251744 (48)	total: 31.9s	remaining: 26m 51s
97:	learn: 0.6571156	test: 0.6204769	best: 0.6251744 (48)	total: 32.3s	remaining: 26m 57s
98:	learn: 0.6575268	test: 0.6203418	best: 0.6251744 (48)	total: 32.8s	remaining: 27m 2s
99:	learn: 0.6573026	test: 0.6204380	best: 0.6251744 (48)	total: 33.1s	remaining: 27m 3s
100:	learn: 0.6581191	test: 0.6212121	best: 0.6251744 (48)	total: 33.7s	remaining: 27m 15s
101:	learn: 0.6587578	test: 0.6199158	best: 0.6251744 (48)	total: 34.1s	remaining: 27m 18s
102:	learn: 0.6580724	test: 0.6228316	best: 0.6251744 (48)	total: 34.5s	remaining: 27m 19s
103:	lear

183:	learn: 0.6804363	test: 0.6250000	best: 0.6281872 (160)	total: 1m 5s	remaining: 28m 41s
184:	learn: 0.6807061	test: 0.6238429	best: 0.6281872 (160)	total: 1m 6s	remaining: 28m 43s
185:	learn: 0.6810686	test: 0.6241931	best: 0.6281872 (160)	total: 1m 6s	remaining: 28m 43s
186:	learn: 0.6813846	test: 0.6239820	best: 0.6281872 (160)	total: 1m 6s	remaining: 28m 44s
187:	learn: 0.6812917	test: 0.6254913	best: 0.6281872 (160)	total: 1m 7s	remaining: 28m 48s
188:	learn: 0.6811733	test: 0.6258771	best: 0.6281872 (160)	total: 1m 7s	remaining: 28m 43s
189:	learn: 0.6816786	test: 0.6262626	best: 0.6281872 (160)	total: 1m 8s	remaining: 28m 44s
190:	learn: 0.6819981	test: 0.6262626	best: 0.6281872 (160)	total: 1m 8s	remaining: 28m 43s
191:	learn: 0.6820880	test: 0.6266480	best: 0.6281872 (160)	total: 1m 8s	remaining: 28m 45s
192:	learn: 0.6823144	test: 0.6266480	best: 0.6281872 (160)	total: 1m 9s	remaining: 28m 46s
193:	learn: 0.6828137	test: 0.6258771	best: 0.6281872 (160)	total: 1m 9s	remaini

272:	learn: 0.6958118	test: 0.6266892	best: 0.6281872 (160)	total: 1m 37s	remaining: 28m 13s
273:	learn: 0.6955575	test: 0.6270423	best: 0.6281872 (160)	total: 1m 38s	remaining: 28m 13s
274:	learn: 0.6955575	test: 0.6270423	best: 0.6281872 (160)	total: 1m 38s	remaining: 28m 13s
275:	learn: 0.6966522	test: 0.6267229	best: 0.6281872 (160)	total: 1m 39s	remaining: 28m 14s
276:	learn: 0.6971429	test: 0.6265128	best: 0.6281872 (160)	total: 1m 39s	remaining: 28m 14s
277:	learn: 0.6968357	test: 0.6263025	best: 0.6281872 (160)	total: 1m 39s	remaining: 28m 14s
278:	learn: 0.6968357	test: 0.6259155	best: 0.6281872 (160)	total: 1m 40s	remaining: 28m 13s
279:	learn: 0.6970130	test: 0.6263025	best: 0.6281872 (160)	total: 1m 40s	remaining: 28m 12s
280:	learn: 0.6967347	test: 0.6270423	best: 0.6281872 (160)	total: 1m 40s	remaining: 28m 12s
281:	learn: 0.6967821	test: 0.6270423	best: 0.6281872 (160)	total: 1m 41s	remaining: 28m 11s
282:	learn: 0.6967882	test: 0.6268657	best: 0.6281872 (160)	total: 1m 

361:	learn: 0.7049548	test: 0.6340915	best: 0.6342697 (360)	total: 2m 11s	remaining: 28m 1s
362:	learn: 0.7049147	test: 0.6339135	best: 0.6342697 (360)	total: 2m 11s	remaining: 28m
363:	learn: 0.7049459	test: 0.6341463	best: 0.6342697 (360)	total: 2m 11s	remaining: 28m
364:	learn: 0.7049459	test: 0.6341463	best: 0.6342697 (360)	total: 2m 12s	remaining: 27m 59s
365:	learn: 0.7049459	test: 0.6341463	best: 0.6342697 (360)	total: 2m 12s	remaining: 27m 59s
366:	learn: 0.7048099	test: 0.6343242	best: 0.6343242 (366)	total: 2m 13s	remaining: 27m 59s
367:	learn: 0.7048099	test: 0.6343242	best: 0.6343242 (366)	total: 2m 13s	remaining: 27m 59s
368:	learn: 0.7051779	test: 0.6341463	best: 0.6343242 (366)	total: 2m 13s	remaining: 27m 59s
369:	learn: 0.7051779	test: 0.6341463	best: 0.6343242 (366)	total: 2m 14s	remaining: 27m 58s
370:	learn: 0.7051779	test: 0.6341463	best: 0.6343242 (366)	total: 2m 14s	remaining: 27m 58s
371:	learn: 0.7052259	test: 0.6341463	best: 0.6343242 (366)	total: 2m 14s	remai

450:	learn: 0.7136749	test: 0.6325503	best: 0.6343514 (375)	total: 2m 43s	remaining: 27m 33s
451:	learn: 0.7138299	test: 0.6332867	best: 0.6343514 (375)	total: 2m 44s	remaining: 27m 33s
452:	learn: 0.7138299	test: 0.6332867	best: 0.6343514 (375)	total: 2m 44s	remaining: 27m 33s
453:	learn: 0.7141499	test: 0.6325503	best: 0.6343514 (375)	total: 2m 45s	remaining: 27m 33s
454:	learn: 0.7141596	test: 0.6330814	best: 0.6343514 (375)	total: 2m 45s	remaining: 27m 34s
455:	learn: 0.7143051	test: 0.6328759	best: 0.6343514 (375)	total: 2m 46s	remaining: 27m 34s
456:	learn: 0.7149936	test: 0.6319911	best: 0.6343514 (375)	total: 2m 46s	remaining: 27m 35s
457:	learn: 0.7150906	test: 0.6319911	best: 0.6343514 (375)	total: 2m 46s	remaining: 27m 35s
458:	learn: 0.7149936	test: 0.6319911	best: 0.6343514 (375)	total: 2m 47s	remaining: 27m 36s
459:	learn: 0.7149936	test: 0.6319911	best: 0.6343514 (375)	total: 2m 47s	remaining: 27m 36s
460:	learn: 0.7149936	test: 0.6319911	best: 0.6343514 (375)	total: 2m 

539:	learn: 0.7199186	test: 0.6321678	best: 0.6343514 (375)	total: 3m 15s	remaining: 26m 57s
540:	learn: 0.7199186	test: 0.6321678	best: 0.6343514 (375)	total: 3m 16s	remaining: 26m 56s
541:	learn: 0.7199186	test: 0.6321678	best: 0.6343514 (375)	total: 3m 16s	remaining: 26m 55s
542:	learn: 0.7200543	test: 0.6308123	best: 0.6343514 (375)	total: 3m 16s	remaining: 26m 54s
543:	learn: 0.7200543	test: 0.6308123	best: 0.6343514 (375)	total: 3m 16s	remaining: 26m 53s
544:	learn: 0.7200543	test: 0.6308123	best: 0.6343514 (375)	total: 3m 17s	remaining: 26m 52s
545:	learn: 0.7208576	test: 0.6315199	best: 0.6343514 (375)	total: 3m 17s	remaining: 26m 51s
546:	learn: 0.7207708	test: 0.6313429	best: 0.6343514 (375)	total: 3m 17s	remaining: 26m 50s
547:	learn: 0.7207708	test: 0.6315199	best: 0.6343514 (375)	total: 3m 18s	remaining: 26m 49s
548:	learn: 0.7207708	test: 0.6315199	best: 0.6343514 (375)	total: 3m 18s	remaining: 26m 48s
549:	learn: 0.7216229	test: 0.6330532	best: 0.6343514 (375)	total: 3m 

628:	learn: 0.7268167	test: 0.6300530	best: 0.6343514 (375)	total: 3m 42s	remaining: 25m 47s
629:	learn: 0.7265206	test: 0.6301982	best: 0.6343514 (375)	total: 3m 42s	remaining: 25m 46s
630:	learn: 0.7264957	test: 0.6305198	best: 0.6343514 (375)	total: 3m 43s	remaining: 25m 46s
631:	learn: 0.7272851	test: 0.6310788	best: 0.6343514 (375)	total: 3m 43s	remaining: 25m 45s
632:	learn: 0.7276426	test: 0.6310788	best: 0.6343514 (375)	total: 3m 43s	remaining: 25m 44s
633:	learn: 0.7275563	test: 0.6310788	best: 0.6343514 (375)	total: 3m 44s	remaining: 25m 43s
634:	learn: 0.7276426	test: 0.6310788	best: 0.6343514 (375)	total: 3m 44s	remaining: 25m 42s
635:	learn: 0.7276426	test: 0.6310788	best: 0.6343514 (375)	total: 3m 44s	remaining: 25m 42s
636:	learn: 0.7275563	test: 0.6310788	best: 0.6343514 (375)	total: 3m 45s	remaining: 25m 41s
637:	learn: 0.7277906	test: 0.6320201	best: 0.6343514 (375)	total: 3m 45s	remaining: 25m 40s
638:	learn: 0.7277906	test: 0.6320201	best: 0.6343514 (375)	total: 3m 

717:	learn: 0.7300855	test: 0.6301370	best: 0.6343514 (375)	total: 4m 9s	remaining: 24m 46s
718:	learn: 0.7299498	test: 0.6301370	best: 0.6343514 (375)	total: 4m 9s	remaining: 24m 45s
719:	learn: 0.7299498	test: 0.6301370	best: 0.6343514 (375)	total: 4m 9s	remaining: 24m 44s
720:	learn: 0.7299498	test: 0.6301370	best: 0.6343514 (375)	total: 4m 10s	remaining: 24m 44s
721:	learn: 0.7299864	test: 0.6301370	best: 0.6343514 (375)	total: 4m 10s	remaining: 24m 43s
722:	learn: 0.7300122	test: 0.6299609	best: 0.6343514 (375)	total: 4m 10s	remaining: 24m 43s
723:	learn: 0.7302471	test: 0.6297539	best: 0.6343514 (375)	total: 4m 11s	remaining: 24m 42s
724:	learn: 0.7302471	test: 0.6297539	best: 0.6343514 (375)	total: 4m 11s	remaining: 24m 41s
725:	learn: 0.7302471	test: 0.6297539	best: 0.6343514 (375)	total: 4m 11s	remaining: 24m 41s
726:	learn: 0.7301609	test: 0.6297539	best: 0.6343514 (375)	total: 4m 11s	remaining: 24m 40s
727:	learn: 0.7302212	test: 0.6303132	best: 0.6343514 (375)	total: 4m 12s

806:	learn: 0.7342871	test: 0.6308164	best: 0.6343514 (375)	total: 4m 36s	remaining: 23m 57s
807:	learn: 0.7342871	test: 0.6308164	best: 0.6343514 (375)	total: 4m 36s	remaining: 23m 57s
808:	learn: 0.7343729	test: 0.6308164	best: 0.6343514 (375)	total: 4m 37s	remaining: 23m 56s
809:	learn: 0.7342510	test: 0.6311978	best: 0.6343514 (375)	total: 4m 37s	remaining: 23m 55s
810:	learn: 0.7342510	test: 0.6311978	best: 0.6343514 (375)	total: 4m 37s	remaining: 23m 55s
811:	learn: 0.7343008	test: 0.6311978	best: 0.6343514 (375)	total: 4m 38s	remaining: 23m 54s
812:	learn: 0.7343008	test: 0.6311978	best: 0.6343514 (375)	total: 4m 38s	remaining: 23m 53s
813:	learn: 0.7343008	test: 0.6311978	best: 0.6343514 (375)	total: 4m 38s	remaining: 23m 53s
814:	learn: 0.7343008	test: 0.6315789	best: 0.6343514 (375)	total: 4m 38s	remaining: 23m 52s
815:	learn: 0.7343008	test: 0.6317549	best: 0.6343514 (375)	total: 4m 39s	remaining: 23m 52s
816:	learn: 0.7345001	test: 0.6309324	best: 0.6343514 (375)	total: 4m 

895:	learn: 0.7396739	test: 0.6305804	best: 0.6343514 (375)	total: 5m 7s	remaining: 23m 29s
896:	learn: 0.7399660	test: 0.6299916	best: 0.6343514 (375)	total: 5m 8s	remaining: 23m 29s
897:	learn: 0.7399158	test: 0.6299916	best: 0.6343514 (375)	total: 5m 8s	remaining: 23m 28s
898:	learn: 0.7394022	test: 0.6299916	best: 0.6343514 (375)	total: 5m 8s	remaining: 23m 28s
899:	learn: 0.7394022	test: 0.6299916	best: 0.6343514 (375)	total: 5m 9s	remaining: 23m 27s
900:	learn: 0.7393725	test: 0.6303741	best: 0.6343514 (375)	total: 5m 9s	remaining: 23m 27s
901:	learn: 0.7394787	test: 0.6307563	best: 0.6343514 (375)	total: 5m 9s	remaining: 23m 27s
902:	learn: 0.7397000	test: 0.6307563	best: 0.6343514 (375)	total: 5m 10s	remaining: 23m 27s
903:	learn: 0.7397502	test: 0.6307563	best: 0.6343514 (375)	total: 5m 10s	remaining: 23m 26s
904:	learn: 0.7400923	test: 0.6307563	best: 0.6343514 (375)	total: 5m 10s	remaining: 23m 26s
905:	learn: 0.7399009	test: 0.6307263	best: 0.6343514 (375)	total: 5m 11s	rem

984:	learn: 0.7432341	test: 0.6299916	best: 0.6343514 (375)	total: 5m 45s	remaining: 23m 29s
985:	learn: 0.7433039	test: 0.6301676	best: 0.6343514 (375)	total: 5m 46s	remaining: 23m 29s
986:	learn: 0.7434904	test: 0.6301676	best: 0.6343514 (375)	total: 5m 46s	remaining: 23m 29s
987:	learn: 0.7435096	test: 0.6299916	best: 0.6343514 (375)	total: 5m 47s	remaining: 23m 29s
988:	learn: 0.7434591	test: 0.6299916	best: 0.6343514 (375)	total: 5m 47s	remaining: 23m 29s
989:	learn: 0.7438825	test: 0.6301982	best: 0.6343514 (375)	total: 5m 47s	remaining: 23m 29s
990:	learn: 0.7440185	test: 0.6301982	best: 0.6343514 (375)	total: 5m 48s	remaining: 23m 29s
991:	learn: 0.7440185	test: 0.6301982	best: 0.6343514 (375)	total: 5m 48s	remaining: 23m 28s
992:	learn: 0.7440691	test: 0.6303741	best: 0.6343514 (375)	total: 5m 49s	remaining: 23m 28s
993:	learn: 0.7440691	test: 0.6303741	best: 0.6343514 (375)	total: 5m 49s	remaining: 23m 28s
994:	learn: 0.7440691	test: 0.6303741	best: 0.6343514 (375)	total: 5m 

1072:	learn: 0.7488274	test: 0.6313145	best: 0.6343514 (375)	total: 6m 21s	remaining: 23m 16s
1073:	learn: 0.7488274	test: 0.6313145	best: 0.6343514 (375)	total: 6m 21s	remaining: 23m 16s
1074:	learn: 0.7489292	test: 0.6309324	best: 0.6343514 (375)	total: 6m 22s	remaining: 23m 17s
1075:	learn: 0.7489292	test: 0.6309324	best: 0.6343514 (375)	total: 6m 23s	remaining: 23m 17s
1076:	learn: 0.7492862	test: 0.6309324	best: 0.6343514 (375)	total: 6m 23s	remaining: 23m 18s
1077:	learn: 0.7493712	test: 0.6309324	best: 0.6343514 (375)	total: 6m 24s	remaining: 23m 19s
1078:	learn: 0.7492862	test: 0.6309324	best: 0.6343514 (375)	total: 6m 25s	remaining: 23m 19s
1079:	learn: 0.7492862	test: 0.6313145	best: 0.6343514 (375)	total: 6m 25s	remaining: 23m 20s
1080:	learn: 0.7492862	test: 0.6313145	best: 0.6343514 (375)	total: 6m 26s	remaining: 23m 20s
1081:	learn: 0.7492862	test: 0.6313145	best: 0.6343514 (375)	total: 6m 26s	remaining: 23m 21s
1082:	learn: 0.7492184	test: 0.6307563	best: 0.6343514 (375)

1160:	learn: 0.7522400	test: 0.6297849	best: 0.6343514 (375)	total: 7m 1s	remaining: 23m 12s
1161:	learn: 0.7522400	test: 0.6297849	best: 0.6343514 (375)	total: 7m 1s	remaining: 23m 12s
1162:	learn: 0.7522400	test: 0.6297849	best: 0.6343514 (375)	total: 7m 2s	remaining: 23m 12s
1163:	learn: 0.7523247	test: 0.6297849	best: 0.6343514 (375)	total: 7m 2s	remaining: 23m 12s
1164:	learn: 0.7522911	test: 0.6297849	best: 0.6343514 (375)	total: 7m 2s	remaining: 23m 11s
1165:	learn: 0.7523758	test: 0.6294331	best: 0.6343514 (375)	total: 7m 3s	remaining: 23m 11s
1166:	learn: 0.7523758	test: 0.6294331	best: 0.6343514 (375)	total: 7m 3s	remaining: 23m 10s
1167:	learn: 0.7522737	test: 0.6303437	best: 0.6343514 (375)	total: 7m 3s	remaining: 23m 10s
1168:	learn: 0.7522737	test: 0.6303437	best: 0.6343514 (375)	total: 7m 4s	remaining: 23m 10s
1169:	learn: 0.7522737	test: 0.6303437	best: 0.6343514 (375)	total: 7m 4s	remaining: 23m 9s
1170:	learn: 0.7527159	test: 0.6295779	best: 0.6343514 (375)	total: 7m 

1249:	learn: 0.7538514	test: 0.6279005	best: 0.6343514 (375)	total: 7m 37s	remaining: 22m 52s
1250:	learn: 0.7538514	test: 0.6279005	best: 0.6343514 (375)	total: 7m 37s	remaining: 22m 52s
1251:	learn: 0.7540205	test: 0.6290503	best: 0.6343514 (375)	total: 7m 38s	remaining: 22m 51s
1252:	learn: 0.7540205	test: 0.6290503	best: 0.6343514 (375)	total: 7m 38s	remaining: 22m 51s
1253:	learn: 0.7539693	test: 0.6290503	best: 0.6343514 (375)	total: 7m 38s	remaining: 22m 51s
1254:	learn: 0.7538848	test: 0.6290503	best: 0.6343514 (375)	total: 7m 39s	remaining: 22m 50s
1255:	learn: 0.7538159	test: 0.6288429	best: 0.6343514 (375)	total: 7m 39s	remaining: 22m 50s
1256:	learn: 0.7538159	test: 0.6288429	best: 0.6343514 (375)	total: 7m 40s	remaining: 22m 50s
1257:	learn: 0.7538159	test: 0.6288429	best: 0.6343514 (375)	total: 7m 40s	remaining: 22m 49s
1258:	learn: 0.7538315	test: 0.6297849	best: 0.6343514 (375)	total: 7m 40s	remaining: 22m 49s
1259:	learn: 0.7539672	test: 0.6301370	best: 0.6343514 (375)

1337:	learn: 0.7554953	test: 0.6306961	best: 0.6343514 (375)	total: 8m 8s	remaining: 22m 17s
1338:	learn: 0.7554953	test: 0.6306961	best: 0.6343514 (375)	total: 8m 9s	remaining: 22m 16s
1339:	learn: 0.7554953	test: 0.6306961	best: 0.6343514 (375)	total: 8m 9s	remaining: 22m 16s
1340:	learn: 0.7554953	test: 0.6306961	best: 0.6343514 (375)	total: 8m 9s	remaining: 22m 15s
1341:	learn: 0.7554953	test: 0.6306961	best: 0.6343514 (375)	total: 8m 9s	remaining: 22m 15s
1342:	learn: 0.7554953	test: 0.6306961	best: 0.6343514 (375)	total: 8m 10s	remaining: 22m 15s
1343:	learn: 0.7554953	test: 0.6306961	best: 0.6343514 (375)	total: 8m 10s	remaining: 22m 14s
1344:	learn: 0.7554953	test: 0.6306961	best: 0.6343514 (375)	total: 8m 11s	remaining: 22m 14s
1345:	learn: 0.7556309	test: 0.6316378	best: 0.6343514 (375)	total: 8m 11s	remaining: 22m 14s
1346:	learn: 0.7556822	test: 0.6316378	best: 0.6343514 (375)	total: 8m 11s	remaining: 22m 13s
1347:	learn: 0.7556822	test: 0.6316378	best: 0.6343514 (375)	tota

1425:	learn: 0.7580831	test: 0.6312552	best: 0.6343514 (375)	total: 8m 41s	remaining: 21m 45s
1426:	learn: 0.7580317	test: 0.6312552	best: 0.6343514 (375)	total: 8m 41s	remaining: 21m 45s
1427:	learn: 0.7580317	test: 0.6312552	best: 0.6343514 (375)	total: 8m 41s	remaining: 21m 45s
1428:	learn: 0.7581345	test: 0.6310788	best: 0.6343514 (375)	total: 8m 42s	remaining: 21m 44s
1429:	learn: 0.7586300	test: 0.6295467	best: 0.6343514 (375)	total: 8m 42s	remaining: 21m 44s
1430:	learn: 0.7587142	test: 0.6295467	best: 0.6343514 (375)	total: 8m 42s	remaining: 21m 43s
1431:	learn: 0.7586300	test: 0.6295467	best: 0.6343514 (375)	total: 8m 43s	remaining: 21m 43s
1432:	learn: 0.7587984	test: 0.6293706	best: 0.6343514 (375)	total: 8m 43s	remaining: 21m 42s
1433:	learn: 0.7586628	test: 0.6301370	best: 0.6343514 (375)	total: 8m 43s	remaining: 21m 42s
1434:	learn: 0.7588965	test: 0.6291946	best: 0.6343514 (375)	total: 8m 44s	remaining: 21m 42s
1435:	learn: 0.7588124	test: 0.6291946	best: 0.6343514 (375)

1513:	learn: 0.7622984	test: 0.6298157	best: 0.6343514 (375)	total: 9m 11s	remaining: 21m 10s
1514:	learn: 0.7626855	test: 0.6299916	best: 0.6343514 (375)	total: 9m 12s	remaining: 21m 10s
1515:	learn: 0.7630207	test: 0.6296089	best: 0.6343514 (375)	total: 9m 12s	remaining: 21m 10s
1516:	learn: 0.7630011	test: 0.6296089	best: 0.6343514 (375)	total: 9m 13s	remaining: 21m 9s
1517:	learn: 0.7628852	test: 0.6292260	best: 0.6343514 (375)	total: 9m 13s	remaining: 21m 9s
1518:	learn: 0.7628852	test: 0.6292260	best: 0.6343514 (375)	total: 9m 13s	remaining: 21m 9s
1519:	learn: 0.7628852	test: 0.6292260	best: 0.6343514 (375)	total: 9m 14s	remaining: 21m 8s
1520:	learn: 0.7630402	test: 0.6305501	best: 0.6343514 (375)	total: 9m 14s	remaining: 21m 8s
1521:	learn: 0.7631240	test: 0.6305501	best: 0.6343514 (375)	total: 9m 14s	remaining: 21m 7s
1522:	learn: 0.7632523	test: 0.6297849	best: 0.6343514 (375)	total: 9m 15s	remaining: 21m 7s
1523:	learn: 0.7633360	test: 0.6296089	best: 0.6343514 (375)	total:

1601:	learn: 0.7661651	test: 0.6298992	best: 0.6343514 (375)	total: 9m 44s	remaining: 20m 40s
1602:	learn: 0.7658940	test: 0.6300756	best: 0.6343514 (375)	total: 9m 44s	remaining: 20m 39s
1603:	learn: 0.7655593	test: 0.6304287	best: 0.6343514 (375)	total: 9m 45s	remaining: 20m 39s
1604:	learn: 0.7656430	test: 0.6304287	best: 0.6343514 (375)	total: 9m 45s	remaining: 20m 38s
1605:	learn: 0.7655911	test: 0.6304287	best: 0.6343514 (375)	total: 9m 46s	remaining: 20m 38s
1606:	learn: 0.7656430	test: 0.6304287	best: 0.6343514 (375)	total: 9m 46s	remaining: 20m 38s
1607:	learn: 0.7656430	test: 0.6304287	best: 0.6343514 (375)	total: 9m 46s	remaining: 20m 37s
1608:	learn: 0.7656430	test: 0.6304287	best: 0.6343514 (375)	total: 9m 47s	remaining: 20m 37s
1609:	learn: 0.7658738	test: 0.6304287	best: 0.6343514 (375)	total: 9m 47s	remaining: 20m 37s
1610:	learn: 0.7658738	test: 0.6304287	best: 0.6343514 (375)	total: 9m 47s	remaining: 20m 36s
1611:	learn: 0.7659055	test: 0.6304287	best: 0.6343514 (375)

1688:	learn: 0.7673978	test: 0.6297849	best: 0.6343514 (375)	total: 10m 19s	remaining: 20m 13s
1689:	learn: 0.7674497	test: 0.6297849	best: 0.6343514 (375)	total: 10m 19s	remaining: 20m 13s
1690:	learn: 0.7674497	test: 0.6297849	best: 0.6343514 (375)	total: 10m 19s	remaining: 20m 12s
1691:	learn: 0.7676166	test: 0.6299609	best: 0.6343514 (375)	total: 10m 20s	remaining: 20m 12s
1692:	learn: 0.7676166	test: 0.6299609	best: 0.6343514 (375)	total: 10m 20s	remaining: 20m 12s
1693:	learn: 0.7676166	test: 0.6299609	best: 0.6343514 (375)	total: 10m 21s	remaining: 20m 12s
1694:	learn: 0.7677000	test: 0.6303437	best: 0.6343514 (375)	total: 10m 21s	remaining: 20m 12s
1695:	learn: 0.7683455	test: 0.6305198	best: 0.6343514 (375)	total: 10m 22s	remaining: 20m 12s
1696:	learn: 0.7681787	test: 0.6303437	best: 0.6343514 (375)	total: 10m 22s	remaining: 20m 11s
1697:	learn: 0.7681581	test: 0.6299916	best: 0.6343514 (375)	total: 10m 23s	remaining: 20m 11s
1698:	learn: 0.7681061	test: 0.6303741	best: 0.634

1775:	learn: 0.7718775	test: 0.6265060	best: 0.6343514 (375)	total: 10m 54s	remaining: 19m 48s
1776:	learn: 0.7718775	test: 0.6265060	best: 0.6343514 (375)	total: 10m 54s	remaining: 19m 47s
1777:	learn: 0.7718775	test: 0.6265060	best: 0.6343514 (375)	total: 10m 55s	remaining: 19m 47s
1778:	learn: 0.7719512	test: 0.6274510	best: 0.6343514 (375)	total: 10m 55s	remaining: 19m 47s
1779:	learn: 0.7720130	test: 0.6263644	best: 0.6343514 (375)	total: 10m 56s	remaining: 19m 46s
1780:	learn: 0.7721485	test: 0.6270997	best: 0.6343514 (375)	total: 10m 56s	remaining: 19m 46s
1781:	learn: 0.7721485	test: 0.6270997	best: 0.6343514 (375)	total: 10m 57s	remaining: 19m 46s
1782:	learn: 0.7718371	test: 0.6265398	best: 0.6343514 (375)	total: 10m 57s	remaining: 19m 46s
1783:	learn: 0.7718371	test: 0.6265398	best: 0.6343514 (375)	total: 10m 57s	remaining: 19m 45s
1784:	learn: 0.7727427	test: 0.6290187	best: 0.6343514 (375)	total: 10m 58s	remaining: 19m 45s
1785:	learn: 0.7728258	test: 0.6290187	best: 0.634

1862:	learn: 0.7742722	test: 0.6292887	best: 0.6343514 (375)	total: 11m 33s	remaining: 19m 28s
1863:	learn: 0.7744381	test: 0.6291132	best: 0.6343514 (375)	total: 11m 34s	remaining: 19m 28s
1864:	learn: 0.7742809	test: 0.6294954	best: 0.6343514 (375)	total: 11m 34s	remaining: 19m 28s
1865:	learn: 0.7745649	test: 0.6283482	best: 0.6343514 (375)	total: 11m 35s	remaining: 19m 27s
1866:	learn: 0.7748138	test: 0.6277902	best: 0.6343514 (375)	total: 11m 35s	remaining: 19m 27s
1867:	learn: 0.7746870	test: 0.6281729	best: 0.6343514 (375)	total: 11m 36s	remaining: 19m 27s
1868:	learn: 0.7746870	test: 0.6281729	best: 0.6343514 (375)	total: 11m 36s	remaining: 19m 26s
1869:	learn: 0.7746870	test: 0.6281729	best: 0.6343514 (375)	total: 11m 36s	remaining: 19m 26s
1870:	learn: 0.7747223	test: 0.6295575	best: 0.6343514 (375)	total: 11m 37s	remaining: 19m 26s
1871:	learn: 0.7746698	test: 0.6293512	best: 0.6343514 (375)	total: 11m 37s	remaining: 19m 25s
1872:	learn: 0.7745344	test: 0.6295575	best: 0.634

1949:	learn: 0.7773186	test: 0.6279654	best: 0.6343514 (375)	total: 12m 10s	remaining: 19m 2s
1950:	learn: 0.7773186	test: 0.6279654	best: 0.6343514 (375)	total: 12m 11s	remaining: 19m 2s
1951:	learn: 0.7773186	test: 0.6279654	best: 0.6343514 (375)	total: 12m 11s	remaining: 19m 2s
1952:	learn: 0.7770174	test: 0.6289378	best: 0.6343514 (375)	total: 12m 12s	remaining: 19m 2s
1953:	learn: 0.7771607	test: 0.6287625	best: 0.6343514 (375)	total: 12m 12s	remaining: 19m 1s
1954:	learn: 0.7772133	test: 0.6287625	best: 0.6343514 (375)	total: 12m 12s	remaining: 19m 1s
1955:	learn: 0.7772133	test: 0.6287625	best: 0.6343514 (375)	total: 12m 13s	remaining: 19m 1s
1956:	learn: 0.7772133	test: 0.6287625	best: 0.6343514 (375)	total: 12m 13s	remaining: 19m
1957:	learn: 0.7776197	test: 0.6293512	best: 0.6343514 (375)	total: 12m 14s	remaining: 19m
1958:	learn: 0.7775896	test: 0.6293512	best: 0.6343514 (375)	total: 12m 14s	remaining: 19m
1959:	learn: 0.7775896	test: 0.6293512	best: 0.6343514 (375)	total: 1

In [175]:
pred = model.predict(train2[cat_cols+cnt_cols].iloc[test_index2])
print (accuracy_score(train2.RESULT.iloc[test_index2],pred))
print (confusion_matrix(train2.RESULT.iloc[test_index2],pred))
print (f1_score(train2.RESULT.iloc[test_index2],pred))

0.6833088595203133
[[1579  527]
 [ 767 1213]]
0.6521505376344087


In [176]:
testpred = model.predict(test[cat_cols+cnt_cols])
submission1 = pd.read_csv('../data/CAX_MortgageModeling_SubmissionFormat.csv')
submission1.Result_Predicted = testpred
submission1.Result_Predicted = submission1.Result_Predicted.apply(lambda x: "FUNDED" if x == 0 else "NOT FUNDED")
print (submission1.Result_Predicted.value_counts(normalize=True))
submission1.to_csv('../submissions/submission20.csv',index=False)

FUNDED        0.675299
NOT FUNDED    0.324701
Name: Result_Predicted, dtype: float64
