In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from catboost import CatBoostClassifier
from sklearn.metrics import *

In [2]:
MISSING = -9999

In [3]:
df = pd.read_csv("../data/processed/data_encoded.csv")

In [4]:
df_train = df[df["test"] == 0].drop(columns=["test"]).set_index("id")
df_test = df[df["test"] == 1].drop(columns=["test"]).set_index("id")

In [5]:
X = df_train.drop(columns=["satisfied"])
y = df_train["satisfied"]

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1001)
# X_train_id, X_test_id, _, _ = train_test_split(X_id, y, test_size=0.30, random_state=1001)

In [7]:
X_final = df_test.drop(columns=["satisfied"])

In [8]:
categorical = ['v17', 'v20', 'v25', 'v78', 'v154', 'v155', 'v161', 'cntry']
categorical += ['v70', 'v71', 'v72', 'v73', 'v102', 'v103', 
                           'v158', 'v159', 'v160', 'v163', 'v164', 
                           'v169', 'v170', 'v190', 'v191',
                          'v216', 'v231']

In [9]:
model = CatBoostClassifier(iterations=500,
                                learning_rate=0.1,
                                cat_features=categorical,
                                depth=2,
                                loss_function='CrossEntropy')

In [10]:
model

<catboost.core.CatBoostClassifier at 0x7f259b96d090>

In [11]:
params = {
    'depth': [2, 3, 4, 5, 8, 10],
    'l2_leaf_reg': [1, 3, 4, 5, 9],
}

In [12]:
folds = 3
param_comb = 4

skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)

random_search = RandomizedSearchCV(model, 
                                   param_distributions=params, 
                                   n_iter=param_comb, 
                                   scoring='roc_auc', 
                                   n_jobs=-1, 
                                   cv=skf.split(X_train,y_train), 
                                   verbose=3, 
                                   random_state=1001 )

In [None]:
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 64 concurrent workers.


In [None]:
df_scores = pd.DataFrame(random_search.cv_results_).sort_values("mean_test_score", ascending=False)

In [None]:
df_scores.to_csv("../data/result/tuning_scores_03_07.csv", index=False)

In [None]:
df_scores

In [None]:
random_search.best_params_

In [None]:
final_model = random_search.best_estimator_

In [20]:
y_pred_train = final_model.predict(X_train)
y_prob_train = final_model.predict_proba(X_train)[:,1]

In [21]:
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:,1]

In [22]:
roc_auc_score(y_test, y_prob), roc_auc_score(y_train, y_prob_train)

(0.8907292697739166, 0.9325559750856212)

In [None]:
df_scores = pd.read_csv("../data/result/tuning_scores_03_07.csv")

In [None]:
df_scores

In [45]:
params = {"max_depth": 6}

In [46]:
final_model = xgb.XGBClassifier(
    learning_rate=0.1,
    objective='binary:logistic',
    silent=True, 
    missing=MISSING,
    nthread=8,
    **params
    )

In [47]:
final_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=-9999, n_estimators=100, n_jobs=1,
              nthread=8, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=True, subsample=1, verbosity=1)

In [48]:
y_pred_train = final_model.predict(X_train)
y_prob_train = final_model.predict_proba(X_train)[:,1]

In [49]:
y_pred = final_model.predict(X_test)
y_prob = final_model.predict_proba(X_test)[:,1]

In [50]:
roc_auc_score(y_test, y_prob), roc_auc_score(y_train, y_prob_train)

(0.8879432548687523, 0.950724064047026)

In [23]:
0.8833639755978565 # baseline (fill missing)

0.8833639755978565

In [24]:
0.8839356100336466 # categorical variable

0.8839356100336466

In [25]:
0.8839356100336466 # type of missing using one-hot + indicator of missing

0.8839356100336466

In [26]:
pd.Series(final_model.feature_importances_, index=df.drop(columns=["id", "satisfied"]).columns).sort_values(ascending=False)

v98         0.267800
v79         0.100945
v101        0.046722
v224        0.040056
v223        0.028993
              ...   
v154_MON    0.000000
v154_NAP    0.000000
v154_NEP    0.000000
v154_NOR    0.000000
v1          0.000000
Length: 1507, dtype: float32