In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, loguniform
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
import numpy as np
from hyperopt.pyll import scope
from sklearn.preprocessing import StandardScaler




In [2]:
data = pd.read_csv("./data/competition_data.csv")

data.drop(columns=['accepts_mercadopago', 'benefit', 'boosted', 'category_id', 'deal_print_id', 
                   'etl_version', 'domain_id','full_name','product_id', 'item_id', 'main_picture', 'date', 
                   'site_id', 'tags','title','uid', 'user_id', 'warranty'], axis='columns', inplace=True)

data['conversion'] = data['conversion'].astype('bool')
data["print_server_timestamp"] = pd.to_datetime(data["print_server_timestamp"])
data["hour"] = data["print_server_timestamp"].dt.hour
data["day"] = data["print_server_timestamp"].dt.day
#data["minute"] = data["print_server_timestamp"].dt.minute
data["month"] = data["print_server_timestamp"].dt.month
data.drop(columns=["print_server_timestamp"], axis= "columns", inplace=True)
data = pd.get_dummies(data,columns = ["listing_type_id", "logistic_type", "platform"],dummy_na = False, dtype = int)
data = pd.get_dummies(data,columns = ["is_pdp"],dummy_na = True, dtype = bool )

In [3]:
train_data = data[data["ROW_ID"].isna()]
print(train_data.shape)
test_data = data[data["ROW_ID"].notna()]
# train_data = train_data[train_data["available_quantity"] < 50000]
# print(train_data.shape)

x_train = train_data.drop(columns=["conversion", "ROW_ID"])
y_train = train_data["conversion"]
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=3456)
X_test = test_data.drop(columns=["conversion"])

(180761, 53)


In [4]:
train_data['conversion'].value_counts()

conversion
False    164017
True      16744
Name: count, dtype: int64

In [5]:
space={
        "n_estimators": hp.choice("n_estimators",np.arange(32, 264, 8, dtype=int)),                   # tune 32 - 256
        "eta":hp.uniform("eta",0.01,0.9),                                   # learning rate # tune 0.01 - 0.9
        "gamma":hp.uniform("gamma",0.01,0.9),                               # tune 0 - 0.9
        "max_depth":hp.choice("max_depth", np.arange(3, 18, 1, dtype=int)),                       # tune 6 - 18
        "min_child_weight":hp.quniform('min_child_weight', 0, 10, 1),       # tune 0 - 10
        "subsample":hp.uniform("subsample",0.5,1),                          # tune 0.5 - 1
        "colsample_bytree":hp.uniform("colsample_bytree",0,1),              # tune 0- 1
        "colsample_bylevel":hp.uniform("colsample_bylevel",0,1),             # tune 0- 1
        "colsample_bynode":hp.uniform("colsample_bynode",0,1),              # tune 0- 1
        "scale_pos_weight": 1,                  # tune by class imbalance: (sum(negative instances) / sum(positive instances)                  
    }

space2 = {
        "n_estimators": scope.int(hp.uniform("n_estimators", 10, 300)),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.3, 0.01),
        "max_depth": scope.int(hp.quniform("max_depth", 1, 15, 1)),
        "min_child_weight": hp.quniform("min_child_weight", 1, 9, 1),
        "subsample": hp.quniform("subsample", 0.6, 1, 0.05),
        "gamma": hp.quniform("gamma", 0.05, 3, 0.05),
        "colsample_bytree": hp.quniform("colsample_bytree", 0.4, 1, 0.05),
        "colsample_bylevel": hp.quniform("colsample_bylevel", 0.4, 1, 0.05),
        "reg_lambda": hp.quniform("reg_lambda", 0.01, 2, 0.01),
        "reg_alpha": hp.quniform("reg_alpha", 0, 10, 1),
        #'monotone_constraints': mon_cons,
    }

def objective(params):
    tree = xgb.XGBClassifier(**params, random_state = 22)
    score = cross_val_score(tree, x_train, y_train, cv = KFold(4)).mean() # Aplicamos validación cruzada con 4 folds.
    return {'loss': 1 - score, 'status': STATUS_OK}




In [6]:
import warnings

# Configura para ignorar los warnings futuros
warnings.simplefilter(action='ignore', category=FutureWarning)

In [7]:
best = fmin(objective, space,
            algo = tpe.suggest,
            max_evals = 6,
            rstate = np.random.default_rng(22))

100%|██████████| 6/6 [02:45<00:00, 27.63s/trial, best loss: 0.08972621622637633]


In [8]:
best_params = space_eval(space, best)
#print("BEST PARAMS: ", best_params)
#0.08900150302033172
#0.08900150302033172
#BEST PARAMS con space1:  {'colsample_bylevel': 0.8219490559795931, 'colsample_bynode': 0.24759470484372925, 'colsample_bytree': 0.7182253566785832, 'eta': 0.10879724556393319, 'gamma': 0.013905544512424909, 'max_depth': 7, 'min_child_weight': 9.0, 'n_estimators': 160, 'scale_pos_weight': 1, 'subsample': 0.7282346960910877}
#BEST PARAMS con space2:  {'colsample_bylevel': 0.8, 'colsample_bytree': 0.9500000000000001, 'gamma': 2.7, 'learning_rate': 0.11, 'max_depth': 9, 'min_child_weight': 5.0, 'n_estimators': 102, 'reg_alpha': 9.0, 'reg_lambda': 1.19, 'subsample': 0.8500000000000001

In [9]:
clf = xgb.XGBClassifier(
    objective = 'binary:logistic',
    seed = 100,
    eval_metric = 'auc',
    **best_params)

clf.fit(X_train, Y_train, verbose = True, eval_set = [(X_val, Y_val)])

[0]	validation_0-auc:0.84715
[1]	validation_0-auc:0.86263
[2]	validation_0-auc:0.86213
[3]	validation_0-auc:0.86318
[4]	validation_0-auc:0.86659
[5]	validation_0-auc:0.86701
[6]	validation_0-auc:0.86785
[7]	validation_0-auc:0.86751
[8]	validation_0-auc:0.86754
[9]	validation_0-auc:0.86796
[10]	validation_0-auc:0.86841
[11]	validation_0-auc:0.86863
[12]	validation_0-auc:0.86838
[13]	validation_0-auc:0.86835
[14]	validation_0-auc:0.86811
[15]	validation_0-auc:0.86891
[16]	validation_0-auc:0.86949
[17]	validation_0-auc:0.86933
[18]	validation_0-auc:0.86969
[19]	validation_0-auc:0.86973
[20]	validation_0-auc:0.87086
[21]	validation_0-auc:0.87123
[22]	validation_0-auc:0.87132
[23]	validation_0-auc:0.87127
[24]	validation_0-auc:0.87119
[25]	validation_0-auc:0.87196
[26]	validation_0-auc:0.87209
[27]	validation_0-auc:0.87223
[28]	validation_0-auc:0.87245
[29]	validation_0-auc:0.87250
[30]	validation_0-auc:0.87270
[31]	validation_0-auc:0.87306
[32]	validation_0-auc:0.87365
[33]	validation_0-au

In [10]:
cv = KFold(n_splits=5, random_state=0, shuffle=True)
scores = cross_val_score(clf, x_train, y_train, cv=cv, scoring="roc_auc")

In [11]:
scores.mean()


0.8827837412033956

In [12]:
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance': clf.feature_importances_})
feature_importance.sort_values(by='importance', ascending=False, inplace=True)
feature_importance.reset_index(drop=True, inplace=True)
print(feature_importance)

                              feature  importance
0                        is_pdp_False    0.238103
1                          is_pdp_nan    0.166806
2                              offset    0.066819
3                      print_position    0.053885
4                         is_pdp_True    0.039848
5               platform_/web/desktop    0.039093
6                   total_visits_item    0.025440
7            platform_/mobile/android    0.024221
8                total_si_item_30days    0.023642
9            total_orders_item_30days    0.021653
10     avg_qty_orders_item_sel_30days    0.018037
11              total_gmv_item_30days    0.012763
12               platform_/web/mobile    0.011127
13                               hour    0.010784
14         avg_gmv_item_domain_30days    0.010773
15                             health    0.010663
16                 available_quantity    0.010524
17  avg_qty_orders_item_domain_30days    0.010313
18           listing_type_id_gold_pro    0.010095


In [14]:
clf_completo = xgb.XGBClassifier(
    objective = 'binary:logistic',
    seed = 100,
    eval_metric = 'auc',
    **best_params)

clf_completo.fit(x_train, y_train)

In [15]:
y_preds = clf_completo.predict_proba(X_test.drop(columns=["ROW_ID"]))[:, clf_completo.classes_== 1].squeeze()
submission_df = pd.DataFrame({"ROW_ID": X_test["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./outputs/hyperopt_completo.csv", sep=",", index=False)