In [17]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, loguniform
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
import numpy as np
from hyperopt.pyll import scope
from sklearn.preprocessing import StandardScaler


In [38]:
data = pd.read_csv("./data/competition_data.csv")

data.drop(columns=['benefit', 'category_id', 'deal_print_id','etl_version', 'full_name','product_id'
                   , 'item_id', 'main_picture', 'site_id', 'uid', 'user_id', 'title', 'tags', 'warranty'],
                     axis= "columns", inplace=True)

data['conversion'] = data['conversion'].astype('bool')
data["print_server_timestamp"] = pd.to_datetime(data["print_server_timestamp"])
data["hour"] = data["print_server_timestamp"].dt.hour
data["day"] = data["print_server_timestamp"].dt.day
#data["minute"] = data["print_server_timestamp"].dt.minute
data["month"] = data["print_server_timestamp"].dt.month
data.drop(columns=["print_server_timestamp", 'date', "domain_id"], axis= "columns", inplace=True)
data = pd.get_dummies(data,columns = ["listing_type_id", "logistic_type", "platform"],dummy_na = False, dtype = int)
data = pd.get_dummies(data,columns = ["is_pdp"],dummy_na = True, dtype = bool )

In [39]:
train_data = data[data["ROW_ID"].isna()]
test_data = data[data["ROW_ID"].notna()]

x_train = train_data.drop(columns=["conversion", "ROW_ID"])
y_train = train_data["conversion"]
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=3456)
X_test = test_data.drop(columns=["conversion"])
y_test = test_data["conversion"]

In [40]:
space={
        "n_estimators": hp.choice("n_estimators",np.arange(32, 264, 8, dtype=int)),                   # tune 32 - 256
        "eta":hp.uniform("eta",0.01,0.9),                                   # learning rate # tune 0.01 - 0.9
        "gamma":hp.uniform("gamma",0.01,0.9),                               # tune 0 - 0.9
        "max_depth":hp.choice("max_depth", np.arange(3, 18, 1, dtype=int)),                       # tune 6 - 18
        "min_child_weight":hp.quniform('min_child_weight', 0, 10, 1),       # tune 0 - 10
        "subsample":hp.uniform("subsample",0.5,1),                          # tune 0.5 - 1
        "colsample_bytree":hp.uniform("colsample_bytree",0,1),              # tune 0- 1
        "colsample_bylevel":hp.uniform("colsample_bylevel",0,1),             # tune 0- 1
        "colsample_bynode":hp.uniform("colsample_bynode",0,1),              # tune 0- 1
        "scale_pos_weight": 1,                  # tune by class imbalance: (sum(negative instances) / sum(positive instances)                  
    }

space2 = {
        "n_estimators": scope.int(hp.uniform("n_estimators", 10, 300)),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.3, 0.01),
        "max_depth": scope.int(hp.quniform("max_depth", 1, 15, 1)),
        "min_child_weight": hp.quniform("min_child_weight", 1, 9, 1),
        "subsample": hp.quniform("subsample", 0.6, 1, 0.05),
        "gamma": hp.quniform("gamma", 0.05, 3, 0.05),
        "colsample_bytree": hp.quniform("colsample_bytree", 0.4, 1, 0.05),
        "colsample_bylevel": hp.quniform("colsample_bylevel", 0.4, 1, 0.05),
        "reg_lambda": hp.quniform("reg_lambda", 0.01, 2, 0.01),
        "reg_alpha": hp.quniform("reg_alpha", 0, 10, 1),
        #'monotone_constraints': mon_cons,
    }

def objective(params):
    tree = xgb.XGBClassifier(**params, random_state = 22)
    score = cross_val_score(tree, x_train, y_train, cv = KFold(4)).mean() # Aplicamos validación cruzada con 4 folds.
    return {'loss': 1 - score, 'status': STATUS_OK}




In [41]:
import warnings

# Configura para ignorar los warnings futuros
warnings.simplefilter(action='ignore', category=FutureWarning)

In [42]:
best = fmin(objective, space,
            algo = tpe.suggest,
            max_evals = 6,
            rstate = np.random.default_rng(22))

100%|██████████| 6/6 [02:55<00:00, 29.21s/trial, best loss: 0.08958238056328494]


In [43]:
best_params = space_eval(space, best)
#print("BEST PARAMS: ", best_params)
#0.08900150302033172
#0.08900150302033172
#BEST PARAMS con space1:  {'colsample_bylevel': 0.8219490559795931, 'colsample_bynode': 0.24759470484372925, 'colsample_bytree': 0.7182253566785832, 'eta': 0.10879724556393319, 'gamma': 0.013905544512424909, 'max_depth': 7, 'min_child_weight': 9.0, 'n_estimators': 160, 'scale_pos_weight': 1, 'subsample': 0.7282346960910877}
#BEST PARAMS con space2:  {'colsample_bylevel': 0.8, 'colsample_bytree': 0.9500000000000001, 'gamma': 2.7, 'learning_rate': 0.11, 'max_depth': 9, 'min_child_weight': 5.0, 'n_estimators': 102, 'reg_alpha': 9.0, 'reg_lambda': 1.19, 'subsample': 0.8500000000000001

In [44]:
clf = xgb.XGBClassifier(
    objective = 'binary:logistic',
    seed = 100,
    eval_metric = 'auc',
    **best_params)

clf.fit(x_train, y_train, verbose = True, eval_set = [(X_val, Y_val)])

[0]	validation_0-auc:0.82941
[1]	validation_0-auc:0.83940
[2]	validation_0-auc:0.84720
[3]	validation_0-auc:0.85834
[4]	validation_0-auc:0.85780
[5]	validation_0-auc:0.86196
[6]	validation_0-auc:0.86380
[7]	validation_0-auc:0.86680
[8]	validation_0-auc:0.86623
[9]	validation_0-auc:0.86719
[10]	validation_0-auc:0.86696
[11]	validation_0-auc:0.86782
[12]	validation_0-auc:0.86813
[13]	validation_0-auc:0.86866
[14]	validation_0-auc:0.86856
[15]	validation_0-auc:0.86878
[16]	validation_0-auc:0.86961
[17]	validation_0-auc:0.87027
[18]	validation_0-auc:0.87063
[19]	validation_0-auc:0.87089
[20]	validation_0-auc:0.87117
[21]	validation_0-auc:0.87186
[22]	validation_0-auc:0.87173
[23]	validation_0-auc:0.87129
[24]	validation_0-auc:0.87331
[25]	validation_0-auc:0.87390
[26]	validation_0-auc:0.87463
[27]	validation_0-auc:0.87484
[28]	validation_0-auc:0.87512
[29]	validation_0-auc:0.87582
[30]	validation_0-auc:0.87625
[31]	validation_0-auc:0.87665
[32]	validation_0-auc:0.87680
[33]	validation_0-au

In [45]:
cv = KFold(n_splits=5, random_state=0, shuffle=True)
scores = cross_val_score(clf, x_train, y_train, cv=cv, scoring="roc_auc")

In [46]:
scores.mean()


0.882818465610195

In [49]:
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance': clf.feature_importances_})
feature_importance.sort_values(by='importance', ascending=False, inplace=True)
feature_importance.reset_index(drop=True, inplace=True)
print(feature_importance)

                              feature  importance
0                        is_pdp_False    0.267079
1                          is_pdp_nan    0.216792
2                              offset    0.055522
3                      print_position    0.048872
4            platform_/mobile/android    0.037975
5               platform_/web/desktop    0.029362
6            total_orders_item_30days    0.026761
7                         is_pdp_True    0.024271
8                total_si_item_30days    0.019042
9      avg_qty_orders_item_sel_30days    0.016710
10              total_gmv_item_30days    0.015239
11        logistic_type_not_specified    0.014504
12                  total_visits_item    0.012569
13               platform_/web/mobile    0.011566
14                             health    0.010719
15                               hour    0.010137
16                 available_quantity    0.009784
17           listing_type_id_gold_pro    0.008741
18                      free_shipping    0.008420


In [26]:
y_preds = clf.predict_proba(X_test.drop(columns=["ROW_ID"]))[:, clf.classes_== 1].squeeze()
submission_df = pd.DataFrame({"ROW_ID": X_test["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./outputs/hyperopt_space2.csv", sep=",", index=False)