In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import *

In [2]:
MISSING = -9999

In [3]:
df = pd.read_csv("../data/processed/data_encoded.csv")

In [4]:
df.shape

(39325, 1566)

In [5]:
df_train = df[df["test"] == 0].drop(columns=["test"])
df_test = df[df["test"] == 1].drop(columns=["test"])

In [6]:
X = df_train.drop(columns=["id", "satisfied"]).values
y = df_train["satisfied"].values

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1001)

In [8]:
X_final = df_test.drop(columns=["id", "satisfied"]).values

In [9]:
X_train.shape

(21056, 1563)

In [10]:
X_test.shape

(9024, 1563)

In [11]:
X_final.shape

(9245, 1563)

In [14]:
params = {
    'subsample': 1.0,
    'scale_pos_weight': 1.0,
    'max_depth': 5,
    'colsample_bytree': 0.5,
    'colsample_bynode': 1.0,
    'colsample_bylevel': 0.5
}

In [15]:
final_model = xgb.XGBClassifier(
    learning_rate=0.1,
    objective='binary:logistic',
    silent=True, 
    missing=MISSING,
    nthread=8,
    **params
    )

In [16]:
final_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
              colsample_bynode=1.0, colsample_bytree=0.5, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=-9999, n_estimators=100, n_jobs=1,
              nthread=8, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1.0, seed=None,
              silent=True, subsample=1.0, verbosity=1)

In [17]:
y_pred_train = final_model.predict(X_train)
y_prob_train = final_model.predict_proba(X_train)[:,1]

In [18]:
y_pred_test = final_model.predict(X_test)
y_prob_test = final_model.predict_proba(X_test)[:,1]

In [19]:
roc_auc_score(y_test, y_prob_test), roc_auc_score(y_train, y_prob_train)

(0.8899624722473075, 0.9317559897870336)

In [None]:
(0.8901941061759845, 0.9386534084249205)  #5

In [None]:
(0.8901553362068626, 0.9203028581243273) #4

In [None]:
(0.8896688507329076, 0.9068574909384938) #3

In [None]:
(0.8879588067229542, 0.8971857360901431) #2

In [20]:
y_pred_final = final_model.predict(X_final)
y_prob_final = final_model.predict_proba(X_final)[:,1]

In [21]:
df_result = pd.DataFrame(y_prob_final, columns=["Predicted"])

In [22]:
df_result["Id"] = df_test["id"].values

In [23]:
df_result[["Id", "Predicted"]]

Unnamed: 0,Id,Predicted
0,14061,0.075775
1,16467,0.119587
2,25725,0.909530
3,9100,0.810799
4,32597,0.631361
...,...,...
9240,31430,0.895970
9241,3876,0.707639
9242,20710,0.743036
9243,26015,0.708120


In [24]:
df_result[["Id", "Predicted"]].to_csv("../data/result/result_03_03_submission2.csv", index=False)