In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import *

In [2]:
MISSING = -9999

In [3]:
df = pd.read_csv("../data/processed/data_encoded.csv")

In [4]:
# df = df.iloc[:13000]

In [5]:
# df = df.iloc[13000:]

In [6]:
df_train = df[df["test"] == 0].drop(columns=["test"])

In [7]:
X = df_train.drop(columns=["id", "satisfied"]).values
y = df_train["satisfied"].values

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1001)

In [9]:
X_train.shape

(21056, 1563)

In [10]:
X_test.shape

(9024, 1563)

In [11]:
model = xgb.XGBClassifier(learning_rate=0.1, 
                          objective='binary:logistic',
                          silent=True, 
                          missing=MISSING,
                          nthread=1)

In [12]:
model

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=-9999, n_estimators=100, n_jobs=1,
              nthread=1, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1, verbosity=1)

In [13]:
params = {
    'max_depth': [2, 3, 4, 5],
    'subsample': [0.25, 0.5, 1.],
    'colsample_bytree': [0.5, 1.],
    'colsample_bylevel': [0.5, 1.],
    'colsample_bynode': [0.5, 1.],
    'scale_pos_weight': [0.5, 1., 1.5],
}

In [14]:
folds = 3
param_comb = 300

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(model, 
                                   param_distributions=params, 
                                   n_iter=param_comb, 
                                   scoring='roc_auc', 
                                   n_jobs=-1, 
                                   cv=skf.split(X_train,y_train), 
                                   verbose=3, 
                                   random_state=1001 )

In [15]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.
[Parallel(n_jobs=-1)]: Done 160 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 384 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 672 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  9.8min finished


RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x7f0d0e192150>,
                   error_score='raise-deprecating',
                   estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                           colsample_bylevel=1,
                                           colsample_bynode=1,
                                           colsample_bytree=1, gamma=0,
                                           learning_rate=0.1, max_delta_step=0,
                                           max_depth=3, min_child_weight=1,
                                           missing=-9999, n_estimators=100,
                                           n_jobs=1, nthread=1,
                                           objectiv...
                                           seed=None, silent=True, subsample=1,
                                           verbosity=1),
                   iid='warn', n_iter=300, n_jobs=-1,
                   param_distributions={'colsample_by

In [16]:
df_scores = pd.DataFrame(random_search.cv_results_).sort_values("mean_test_score", ascending=False)

In [17]:
df_scores.to_csv("../data/result/tuning_scores.csv", index=False)

In [None]:
df_scores

In [18]:
random_search.best_params_

{'subsample': 1.0,
 'scale_pos_weight': 1.0,
 'max_depth': 5,
 'colsample_bytree': 0.5,
 'colsample_bynode': 1.0,
 'colsample_bylevel': 0.5}

In [19]:
final_model = random_search.best_estimator_

In [20]:
y_pred_train = final_model.predict(X_train)
y_prob_train = final_model.predict_proba(X_train)[:,1]

In [21]:
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:,1]

In [22]:
roc_auc_score(y_test, y_prob), roc_auc_score(y_train, y_prob_train)

(0.8907292697739166, 0.9325559750856212)

In [4]:
df_scores = pd.read_csv("../data/result/tuning_scores.csv")

In [5]:
df_scores

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_scale_pos_weight,param_max_depth,param_colsample_bytree,param_colsample_bynode,param_colsample_bylevel,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,33.390213,0.550079,0.285864,0.048719,1.00,1.0,5,0.5,1.0,0.5,"{'subsample': 1.0, 'scale_pos_weight': 1.0, 'm...",0.890578,0.895939,0.897133,0.894550,0.002851,1
1,35.942610,0.504362,0.279281,0.035230,1.00,1.0,5,0.5,0.5,1.0,"{'subsample': 1.0, 'scale_pos_weight': 1.0, 'm...",0.890578,0.895939,0.897133,0.894550,0.002851,1
2,33.463708,0.607725,0.238096,0.014476,1.00,0.5,5,1.0,0.5,0.5,"{'subsample': 1.0, 'scale_pos_weight': 0.5, 'm...",0.890876,0.895600,0.897116,0.894530,0.002657,3
3,29.643606,0.822510,0.232705,0.003196,1.00,0.5,4,1.0,0.5,0.5,"{'subsample': 1.0, 'scale_pos_weight': 0.5, 'm...",0.890249,0.895997,0.897196,0.894480,0.003032,4
4,21.800600,0.514589,0.424465,0.212107,1.00,1.5,5,0.5,0.5,0.5,"{'subsample': 1.0, 'scale_pos_weight': 1.5, 'm...",0.891352,0.896552,0.895527,0.894477,0.002249,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
283,52.654004,2.129410,0.477463,0.290068,0.25,1.0,5,1.0,1.0,0.5,"{'subsample': 0.25, 'scale_pos_weight': 1.0, '...",0.884359,0.889725,0.889509,0.887864,0.002481,283
284,76.666680,2.653871,0.163980,0.000524,0.25,1.5,5,1.0,1.0,1.0,"{'subsample': 0.25, 'scale_pos_weight': 1.5, '...",0.884026,0.889627,0.889608,0.887754,0.002636,285
285,53.535069,1.344616,0.263582,0.011414,0.25,1.5,5,1.0,1.0,0.5,"{'subsample': 0.25, 'scale_pos_weight': 1.5, '...",0.883275,0.888988,0.889885,0.887382,0.002928,286
286,52.753235,0.932617,0.287539,0.064486,0.25,1.5,5,1.0,0.5,1.0,"{'subsample': 0.25, 'scale_pos_weight': 1.5, '...",0.883275,0.888988,0.889885,0.887382,0.002928,286


In [45]:
params = {"max_depth": 6}

In [46]:
final_model = xgb.XGBClassifier(
    learning_rate=0.1,
    objective='binary:logistic',
    silent=True, 
    missing=MISSING,
    nthread=8,
    **params
    )

In [47]:
final_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=-9999, n_estimators=100, n_jobs=1,
              nthread=8, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1, verbosity=1)

In [48]:
y_pred_train = final_model.predict(X_train)
y_prob_train = final_model.predict_proba(X_train)[:,1]

In [49]:
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:,1]

In [50]:
roc_auc_score(y_test, y_prob), roc_auc_score(y_train, y_prob_train)

(0.8879432548687523, 0.950724064047026)

In [23]:
0.8833639755978565 # baseline (fill missing)

0.8833639755978565

In [24]:
0.8839356100336466 # categorical variable

0.8839356100336466

In [25]:
0.8839356100336466 # type of missing using one-hot + indicator of missing

0.8839356100336466

In [26]:
pd.Series(final_model.feature_importances_, index=df.drop(columns=["id", "satisfied"]).columns).sort_values(ascending=False)

v98         0.267800
v79         0.100945
v101        0.046722
v224        0.040056
v223        0.028993
              ...   
v154_MON    0.000000
v154_NAP    0.000000
v154_NEP    0.000000
v154_NOR    0.000000
v1          0.000000
Length: 1507, dtype: float32