In [1]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, loguniform
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
import numpy as np
from hyperopt.pyll import scope
from sklearn.preprocessing import StandardScaler




In [2]:
data = pd.read_csv("./data/competition_data.csv")

data.drop(columns=['accepts_mercadopago', 'benefit', 'boosted', 'category_id', 'deal_print_id', 
                   'etl_version', 'domain_id','full_name','product_id', 'item_id', 'main_picture', 'date', 
                   'site_id', 'tags','title','uid', 'user_id', 'warranty'], axis='columns', inplace=True)

data['conversion'] = data['conversion'].astype('bool')
data["print_server_timestamp"] = pd.to_datetime(data["print_server_timestamp"])
data["hour"] = data["print_server_timestamp"].dt.hour
data["day"] = data["print_server_timestamp"].dt.day
#data["minute"] = data["print_server_timestamp"].dt.minute
data["month"] = data["print_server_timestamp"].dt.month
data.drop(columns=["print_server_timestamp"], axis= "columns", inplace=True)
data = pd.get_dummies(data,columns = ["listing_type_id", "logistic_type", "platform"],dummy_na = False, dtype = int)
data = pd.get_dummies(data,columns = ["is_pdp"],dummy_na = True, dtype = bool )

In [13]:
train_data = data[data["ROW_ID"].isna()]
print(train_data.shape)
test_data = data[data["ROW_ID"].notna()]
train_data = train_data[train_data["available_quantity"] < 50000]
print(train_data.shape)

x_train = train_data.drop(columns=["conversion", "ROW_ID"])
y_train = train_data["conversion"]
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=3456)
X_test = test_data.drop(columns=["conversion"])

(180761, 53)
(178366, 53)


In [27]:
train_data['conversion'].value_counts()

conversion
False    161859
True      16507
Name: count, dtype: int64

In [14]:
space={
        "n_estimators": hp.choice("n_estimators",np.arange(32, 264, 8, dtype=int)),                   # tune 32 - 256
        "eta":hp.uniform("eta",0.01,0.9),                                   # learning rate # tune 0.01 - 0.9
        "gamma":hp.uniform("gamma",0.01,0.9),                               # tune 0 - 0.9
        "max_depth":hp.choice("max_depth", np.arange(3, 18, 1, dtype=int)),                       # tune 6 - 18
        "min_child_weight":hp.quniform('min_child_weight', 0, 10, 1),       # tune 0 - 10
        "subsample":hp.uniform("subsample",0.5,1),                          # tune 0.5 - 1
        "colsample_bytree":hp.uniform("colsample_bytree",0,1),              # tune 0- 1
        "colsample_bylevel":hp.uniform("colsample_bylevel",0,1),             # tune 0- 1
        "colsample_bynode":hp.uniform("colsample_bynode",0,1),              # tune 0- 1
        "scale_pos_weight": 1,                  # tune by class imbalance: (sum(negative instances) / sum(positive instances)                  
    }

space2 = {
        "n_estimators": scope.int(hp.uniform("n_estimators", 10, 300)),
        "learning_rate": hp.quniform("learning_rate", 0.01, 0.3, 0.01),
        "max_depth": scope.int(hp.quniform("max_depth", 1, 15, 1)),
        "min_child_weight": hp.quniform("min_child_weight", 1, 9, 1),
        "subsample": hp.quniform("subsample", 0.6, 1, 0.05),
        "gamma": hp.quniform("gamma", 0.05, 3, 0.05),
        "colsample_bytree": hp.quniform("colsample_bytree", 0.4, 1, 0.05),
        "colsample_bylevel": hp.quniform("colsample_bylevel", 0.4, 1, 0.05),
        "reg_lambda": hp.quniform("reg_lambda", 0.01, 2, 0.01),
        "reg_alpha": hp.quniform("reg_alpha", 0, 10, 1),
        #'monotone_constraints': mon_cons,
    }

def objective(params):
    tree = xgb.XGBClassifier(**params, random_state = 22)
    score = cross_val_score(tree, x_train, y_train, cv = KFold(4)).mean() # Aplicamos validación cruzada con 4 folds.
    return {'loss': 1 - score, 'status': STATUS_OK}




In [5]:
import warnings

# Configura para ignorar los warnings futuros
warnings.simplefilter(action='ignore', category=FutureWarning)

In [15]:
best = fmin(objective, space,
            algo = tpe.suggest,
            max_evals = 6,
            rstate = np.random.default_rng(22))

  0%|          | 0/6 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 6/6 [03:05<00:00, 30.84s/trial, best loss: 0.0894060575748643] 


In [16]:
best_params = space_eval(space, best)
#print("BEST PARAMS: ", best_params)
#0.08900150302033172
#0.08900150302033172
#BEST PARAMS con space1:  {'colsample_bylevel': 0.8219490559795931, 'colsample_bynode': 0.24759470484372925, 'colsample_bytree': 0.7182253566785832, 'eta': 0.10879724556393319, 'gamma': 0.013905544512424909, 'max_depth': 7, 'min_child_weight': 9.0, 'n_estimators': 160, 'scale_pos_weight': 1, 'subsample': 0.7282346960910877}
#BEST PARAMS con space2:  {'colsample_bylevel': 0.8, 'colsample_bytree': 0.9500000000000001, 'gamma': 2.7, 'learning_rate': 0.11, 'max_depth': 9, 'min_child_weight': 5.0, 'n_estimators': 102, 'reg_alpha': 9.0, 'reg_lambda': 1.19, 'subsample': 0.8500000000000001

In [17]:
clf = xgb.XGBClassifier(
    objective = 'binary:logistic',
    seed = 100,
    eval_metric = 'auc',
    **best_params)

clf.fit(x_train, y_train, verbose = True, eval_set = [(X_val, Y_val)])

[0]	validation_0-auc:0.84983
[1]	validation_0-auc:0.85540
[2]	validation_0-auc:0.85958
[3]	validation_0-auc:0.86540
[4]	validation_0-auc:0.86548
[5]	validation_0-auc:0.86674
[6]	validation_0-auc:0.86794
[7]	validation_0-auc:0.86723
[8]	validation_0-auc:0.86480
[9]	validation_0-auc:0.86166
[10]	validation_0-auc:0.86609
[11]	validation_0-auc:0.86293
[12]	validation_0-auc:0.86572
[13]	validation_0-auc:0.86790
[14]	validation_0-auc:0.87013
[15]	validation_0-auc:0.87121
[16]	validation_0-auc:0.87209
[17]	validation_0-auc:0.87224
[18]	validation_0-auc:0.87194
[19]	validation_0-auc:0.87263
[20]	validation_0-auc:0.87238
[21]	validation_0-auc:0.87311
[22]	validation_0-auc:0.87291
[23]	validation_0-auc:0.87362
[24]	validation_0-auc:0.87440
[25]	validation_0-auc:0.87462
[26]	validation_0-auc:0.87488
[27]	validation_0-auc:0.87509
[28]	validation_0-auc:0.87509
[29]	validation_0-auc:0.87493
[30]	validation_0-auc:0.87492
[31]	validation_0-auc:0.87477
[32]	validation_0-auc:0.87573
[33]	validation_0-au

In [21]:
cv = KFold(n_splits=5, random_state=0, shuffle=True)
scores = cross_val_score(clf, x_train, y_train, cv=cv, scoring="roc_auc")

In [20]:
scores.mean()


0.882803729008408

In [23]:
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance': clf.feature_importances_})
feature_importance.sort_values(by='importance', ascending=False, inplace=True)
feature_importance.reset_index(drop=True, inplace=True)
print(feature_importance)

                              feature  importance
0                        is_pdp_False    0.276965
1                          is_pdp_nan    0.194493
2                              offset    0.080612
3                         is_pdp_True    0.046829
4                      print_position    0.043457
5               platform_/web/desktop    0.030856
6                   total_visits_item    0.026178
7            total_orders_item_30days    0.025281
8            platform_/mobile/android    0.023896
9                total_si_item_30days    0.014980
10                 available_quantity    0.010744
11       listing_type_id_gold_special    0.010321
12     avg_qty_orders_item_sel_30days    0.009774
13                               hour    0.009347
14                      free_shipping    0.007944
15                              price    0.007811
16              total_gmv_item_30days    0.007523
17         total_orders_domain_30days    0.006879
18          logistic_type_xd_drop_off    0.006621


In [24]:
y_preds = clf.predict_proba(X_test.drop(columns=["ROW_ID"]))[:, clf.classes_== 1].squeeze()
submission_df = pd.DataFrame({"ROW_ID": X_test["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./outputs/hyperopt_sin_avail50.csv", sep=",", index=False)