In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import *

In [2]:
MISSING = -9999

In [3]:
df = pd.read_csv("../data/processed/data_encoded.csv")

In [6]:
df.shape

(39325, 274)

In [7]:
df

Unnamed: 0,id,v1,v2,v3,v4,v5,v6,v7,v8,v9,...,v264,v265,v266,v267,v268,v269,v270,cntry,satisfied,test
0,9948,2.0,2.0,74.0,11010.0,,0.0,0.0,0.0,0.0,...,,,,,,,,AT,0.0,0
1,25601,4.0,2.0,58.0,11010.0,,0.0,0.0,0.0,0.0,...,,,,,,,,AT,0.0,0
2,8592,6.0,2.0,47.0,11010.0,11010.0,0.0,0.0,1.0,0.0,...,1993.0,1995.0,,,,,,AT,1.0,0
3,29593,10.0,2.0,22.0,11010.0,,0.0,0.0,0.0,0.0,...,,,,,,,,AT,0.0,0
4,4252,0.0,1.0,24.0,11010.0,,0.0,0.0,0.0,0.0,...,,,,,,,,AT,1.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39320,31430,0.0,2.0,27.0,14120.0,,0.0,0.0,0.0,0.0,...,2010.0,2014.0,,,,,,SI,,1
39321,3876,1.0,3.0,82.0,14120.0,,0.0,0.0,0.0,0.0,...,,,,,,,,SI,,1
39322,20710,5.0,1.0,41.0,14120.0,,0.0,0.0,0.0,0.0,...,1998.0,2001.0,,,,,,SI,,1
39323,26015,8.0,1.0,19.0,14120.0,,0.0,1.0,0.0,1.0,...,1974.0,,,,,,,SI,,1


In [8]:
df_train = df[df["test"] == 0].drop(columns=["test"]).set_index("id")
df_test = df[df["test"] == 1].drop(columns=["test"]).set_index("id")

In [9]:
X = df_train.drop(columns=["satisfied"])
y = df_train["satisfied"]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1001)
# X_train_id, X_test_id, _, _ = train_test_split(X_id, y, test_size=0.30, random_state=1001)

In [11]:
X_final = df_test.drop(columns=["satisfied"])

In [12]:
categorical = ['v17', 'v20', 'v25', 'v78', 'v154', 'v155', 'v161', 'cntry']
categorical += ['v70', 'v71', 'v72', 'v73', 'v102', 'v103', 
                           'v158', 'v159', 'v160', 'v163', 'v164', 
                           'v169', 'v170', 'v190', 'v191',
                          'v216', 'v231']

In [14]:
model = CatBoostClassifier(iterations=500,
                                learning_rate=0.1,
                                cat_features=categorical,
                                loss_function='CrossEntropy')

In [15]:
model

<catboost.core.CatBoostClassifier at 0x10fae0f10>

In [16]:
params = {
    'depth': [2, 3, 4, 5, 8, 10],
    'l2_leaf_reg': [1, 3, 4, 5, 9],
}

In [17]:
folds = 3
param_comb = 20

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(model, 
                                   param_distributions=params, 
                                   n_iter=param_comb, 
                                   scoring='roc_auc', 
                                   n_jobs=-1, 
                                   cv=skf.split(X_train,y_train), 
                                   verbose=3, 
                                   random_state=1001 )

In [18]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 22.5min finished


0:	learn: 0.6498250	total: 88ms	remaining: 43.9s
1:	learn: 0.6136519	total: 108ms	remaining: 26.8s
2:	learn: 0.5839152	total: 132ms	remaining: 21.8s
3:	learn: 0.5565719	total: 148ms	remaining: 18.3s
4:	learn: 0.5358831	total: 168ms	remaining: 16.6s
5:	learn: 0.5196365	total: 185ms	remaining: 15.3s
6:	learn: 0.5070470	total: 206ms	remaining: 14.5s
7:	learn: 0.4964636	total: 223ms	remaining: 13.7s
8:	learn: 0.4863950	total: 243ms	remaining: 13.2s
9:	learn: 0.4779192	total: 260ms	remaining: 12.8s
10:	learn: 0.4713493	total: 278ms	remaining: 12.4s
11:	learn: 0.4648275	total: 301ms	remaining: 12.3s
12:	learn: 0.4595840	total: 326ms	remaining: 12.2s
13:	learn: 0.4560457	total: 343ms	remaining: 11.9s
14:	learn: 0.4525999	total: 360ms	remaining: 11.6s
15:	learn: 0.4491562	total: 380ms	remaining: 11.5s
16:	learn: 0.4460044	total: 397ms	remaining: 11.3s
17:	learn: 0.4434745	total: 415ms	remaining: 11.1s
18:	learn: 0.4417626	total: 432ms	remaining: 10.9s
19:	learn: 0.4400937	total: 448ms	remainin

165:	learn: 0.4003812	total: 3.25s	remaining: 6.53s
166:	learn: 0.4003140	total: 3.27s	remaining: 6.51s
167:	learn: 0.4002292	total: 3.28s	remaining: 6.49s
168:	learn: 0.4001250	total: 3.3s	remaining: 6.46s
169:	learn: 0.4000229	total: 3.32s	remaining: 6.44s
170:	learn: 0.3998439	total: 3.34s	remaining: 6.42s
171:	learn: 0.3997360	total: 3.36s	remaining: 6.4s
172:	learn: 0.3995546	total: 3.38s	remaining: 6.38s
173:	learn: 0.3994492	total: 3.4s	remaining: 6.37s
174:	learn: 0.3993012	total: 3.42s	remaining: 6.35s
175:	learn: 0.3992242	total: 3.44s	remaining: 6.33s
176:	learn: 0.3991364	total: 3.46s	remaining: 6.32s
177:	learn: 0.3990481	total: 3.48s	remaining: 6.3s
178:	learn: 0.3989118	total: 3.5s	remaining: 6.28s
179:	learn: 0.3987354	total: 3.52s	remaining: 6.26s
180:	learn: 0.3986598	total: 3.54s	remaining: 6.24s
181:	learn: 0.3986322	total: 3.56s	remaining: 6.22s
182:	learn: 0.3985145	total: 3.58s	remaining: 6.2s
183:	learn: 0.3984521	total: 3.6s	remaining: 6.18s
184:	learn: 0.39833

327:	learn: 0.3870767	total: 6.46s	remaining: 3.39s
328:	learn: 0.3870534	total: 6.49s	remaining: 3.37s
329:	learn: 0.3869326	total: 6.5s	remaining: 3.35s
330:	learn: 0.3867802	total: 6.52s	remaining: 3.33s
331:	learn: 0.3867800	total: 6.54s	remaining: 3.31s
332:	learn: 0.3866218	total: 6.56s	remaining: 3.29s
333:	learn: 0.3865652	total: 6.58s	remaining: 3.27s
334:	learn: 0.3864499	total: 6.6s	remaining: 3.25s
335:	learn: 0.3864348	total: 6.62s	remaining: 3.23s
336:	learn: 0.3863028	total: 6.64s	remaining: 3.21s
337:	learn: 0.3862562	total: 6.66s	remaining: 3.19s
338:	learn: 0.3862091	total: 6.68s	remaining: 3.17s
339:	learn: 0.3862015	total: 6.71s	remaining: 3.16s
340:	learn: 0.3861585	total: 6.73s	remaining: 3.14s
341:	learn: 0.3860366	total: 6.76s	remaining: 3.12s
342:	learn: 0.3859569	total: 6.79s	remaining: 3.11s
343:	learn: 0.3858489	total: 6.81s	remaining: 3.09s
344:	learn: 0.3857909	total: 6.84s	remaining: 3.07s
345:	learn: 0.3857048	total: 6.86s	remaining: 3.05s
346:	learn: 0.

496:	learn: 0.3748151	total: 9.82s	remaining: 59.3ms
497:	learn: 0.3747937	total: 9.84s	remaining: 39.5ms
498:	learn: 0.3747686	total: 9.86s	remaining: 19.8ms
499:	learn: 0.3747549	total: 9.88s	remaining: 0us


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x10ed4b4d0>,
                   error_score=nan,
                   estimator=<catboost.core.CatBoostClassifier object at 0x10fae0f10>,
                   iid='deprecated', n_iter=20, n_jobs=-1,
                   param_distributions={'depth': [2, 3, 4, 5, 8, 10],
                                        'l2_leaf_reg': [1, 3, 4, 5, 9]},
                   pre_dispatch='2*n_jobs', random_state=1001, refit=True,
                   return_train_score=False, scoring='roc_auc', verbose=3)

In [19]:
df_scores = pd.DataFrame(random_search.cv_results_).sort_values("mean_test_score", ascending=False)

In [20]:
df_scores.to_csv("../data/result/tuning_scores_03_07_local.csv", index=False)

In [22]:
df_scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_l2_leaf_reg,param_depth,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
16,111.23326,1.481184,0.134068,0.002508,3,3,"{'l2_leaf_reg': 3, 'depth': 3}",0.893432,0.892199,0.89535,0.89366,0.001296,1
10,78.913835,6.583619,0.127325,0.002188,4,2,"{'l2_leaf_reg': 4, 'depth': 2}",0.892999,0.892109,0.895225,0.893444,0.001311,2
2,119.636265,1.34494,0.137124,0.004432,1,4,"{'l2_leaf_reg': 1, 'depth': 4}",0.892596,0.891844,0.895409,0.893283,0.001534,3
4,91.262704,2.345841,0.128871,0.001664,4,3,"{'l2_leaf_reg': 4, 'depth': 3}",0.892596,0.891898,0.89523,0.893241,0.001435,4
8,175.334679,3.226587,0.138276,0.00514,9,5,"{'l2_leaf_reg': 9, 'depth': 5}",0.892634,0.892512,0.894565,0.893237,0.00094,5
11,68.391465,1.299253,0.1282,0.001903,1,2,"{'l2_leaf_reg': 1, 'depth': 2}",0.892716,0.891809,0.895021,0.893182,0.001352,6
5,122.673431,3.774446,0.132624,0.000513,3,4,"{'l2_leaf_reg': 3, 'depth': 4}",0.893403,0.891649,0.894436,0.893163,0.00115,7
12,70.628919,0.348666,0.131031,0.00125,3,2,"{'l2_leaf_reg': 3, 'depth': 2}",0.892428,0.891777,0.894889,0.893031,0.00134,8
3,138.079429,1.118814,0.132056,0.002585,3,5,"{'l2_leaf_reg': 3, 'depth': 5}",0.892606,0.891629,0.894835,0.893023,0.001342,9
18,118.397828,1.251387,0.134556,0.000913,1,3,"{'l2_leaf_reg': 1, 'depth': 3}",0.893207,0.891163,0.894603,0.892991,0.001413,10


In [None]:
random_search.best_params_

In [None]:
final_model = random_search.best_estimator_

In [20]:
y_pred_train = final_model.predict(X_train)
y_prob_train = final_model.predict_proba(X_train)[:,1]

In [21]:
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:,1]

In [22]:
roc_auc_score(y_test, y_prob), roc_auc_score(y_train, y_prob_train)

(0.8907292697739166, 0.9325559750856212)

In [None]:
df_scores = pd.read_csv("../data/result/tuning_scores_03_07.csv")

In [None]:
df_scores

In [45]:
params = {"max_depth": 6}

In [46]:
final_model = xgb.XGBClassifier(
    learning_rate=0.1,
    objective='binary:logistic',
    silent=True, 
    missing=MISSING,
    nthread=8,
    **params
    )

In [47]:
final_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=-9999, n_estimators=100, n_jobs=1,
              nthread=8, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1, verbosity=1)

In [48]:
y_pred_train = final_model.predict(X_train)
y_prob_train = final_model.predict_proba(X_train)[:,1]

In [49]:
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:,1]

In [50]:
roc_auc_score(y_test, y_prob), roc_auc_score(y_train, y_prob_train)

(0.8879432548687523, 0.950724064047026)

In [23]:
0.8833639755978565 # baseline (fill missing)

0.8833639755978565

In [24]:
0.8839356100336466 # categorical variable

0.8839356100336466

In [25]:
0.8839356100336466 # type of missing using one-hot + indicator of missing

0.8839356100336466

In [26]:
pd.Series(final_model.feature_importances_, index=df.drop(columns=["id", "satisfied"]).columns).sort_values(ascending=False)

v98         0.267800
v79         0.100945
v101        0.046722
v224        0.040056
v223        0.028993
              ...   
v154_MON    0.000000
v154_NAP    0.000000
v154_NEP    0.000000
v154_NOR    0.000000
v1          0.000000
Length: 1507, dtype: float32