In [40]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

In [42]:
data = pd.read_csv("./data/competition_data.csv")

data.drop(columns=['benefit', 'category_id', 'deal_print_id','etl_version', 'full_name','product_id'
                   , 'item_id', 'main_picture', 'site_id', 'uid', 'user_id', 'title', 'tags', 'warranty'],
                     axis= "columns", inplace=True)

data['conversion'] = data['conversion'].astype('bool')
data["print_server_timestamp"] = pd.to_datetime(data["print_server_timestamp"])
data["hour"] = data["print_server_timestamp"].dt.hour
data["day"] = data["print_server_timestamp"].dt.day
#data["minute"] = data["print_server_timestamp"].dt.minute
data["month"] = data["print_server_timestamp"].dt.month
data.drop(columns=["print_server_timestamp", 'date', "domain_id"], axis= "columns", inplace=True)
data = pd.get_dummies(data,columns = ["listing_type_id", "logistic_type", "platform"],dummy_na = False, dtype = int)
data = pd.get_dummies(data,columns = ["is_pdp"],dummy_na = True, dtype = bool )

In [49]:
train_data = data[data["ROW_ID"].isna()]
test_data = data[data["ROW_ID"].notna()]

x_train = train_data.drop(columns=["conversion", "ROW_ID"])
y_train = train_data["conversion"]
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=3456)
X_test = test_data.drop(columns=["conversion"])
y_test = test_data["conversion"]

In [44]:
#search space for hyperopt HistGradientBoostingClassifier()
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'max_iter': hp.choice('max_iter', np.arange(100, 500, 10, dtype=int)),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(10, 100, 10, dtype=int)),
    'max_depth': hp.choice('max_depth', np.arange(10, 100, 10, dtype=int)),
    'min_samples_leaf': hp.choice('min_samples_leaf', np.arange(1, 10, 1, dtype=int)),
    'l2_regularization': hp.uniform('l2_regularization', 0.01, 0.5),
    'max_bins': hp.choice('max_bins', np.arange(10, 100, 10, dtype=int)),
    'validation_fraction': hp.uniform('validation_fraction', 0.01, 0.5),
    'n_iter_no_change': hp.choice('n_iter_no_change', np.arange(1, 10, 1, dtype=int)),
    #'tol': hp.uniform('tol', 0.0001, 0.001),
    #'scoring': hp.choice('scoring', ['loss', 'accuracy', 'balanced_accuracy', 'average_precision', 'f1', 'f1_micro', 'f1_macro', 'f1_weighted', 'f1_samples', 'neg_log_loss', 'precision', 'recall', 'roc_auc']),
    #'random_state': hp.choice('random_state', np.arange(1, 100, 1, dtype=int)),
    'warm_start': hp.choice('warm_start', [True, False]),
    'early_stopping': hp.choice('early_stopping', [True, False]),
    #'verbose': hp.choice('verbose', [True, False])

    

}

def objective(params):
    tree = HistGradientBoostingClassifier(**params, random_state = 22, scoring="roc_auc")
    score = cross_val_score(tree, x_train, y_train, cv = KFold(4)).mean() # Aplicamos validación cruzada con 4 folds.
    return {'loss': 1 - score, 'status': STATUS_OK}

In [46]:
best = fmin(objective, space = space,
            algo = tpe.suggest,
            max_evals = 3,
            rstate = np.random.default_rng(22))

100%|██████████| 3/3 [00:58<00:00, 19.42s/trial, best loss: 0.0887580864576425] 


In [47]:
best_params = space_eval(space, best)
print("BEST PARAMS: ", best_params)


BEST PARAMS:  {'early_stopping': False, 'l2_regularization': 0.35979456172848456, 'learning_rate': 0.035719415867667643, 'max_bins': 10, 'max_depth': 60, 'max_iter': 260, 'max_leaf_nodes': 60, 'min_samples_leaf': 9, 'n_iter_no_change': 4, 'validation_fraction': 0.4578017029077751, 'warm_start': False}


In [57]:
hist_gradient = HistGradientBoostingClassifier(**best_params,scoring="roc_auc")
hist_gradient.fit(X_train, Y_train)
print(hist_gradient.score(X_val, Y_val))
print(roc_auc_score(Y_val, hist_gradient.predict_proba(X_val)[:, hist_gradient.classes_== 1]))


0.9110170663568722
0.8925772210844479


In [58]:
cv = KFold(n_splits=5, random_state=0, shuffle=True)
scores = cross_val_score(hist_gradient, x_train, y_train, cv=cv, scoring="roc_auc")
print(scores)

[0.88701696 0.88943346 0.88824374 0.88596927 0.88562665]


In [59]:
scores.mean()

0.8872580170279765

In [55]:
hist_gradient.get_params(deep=True)

{'categorical_features': None,
 'class_weight': None,
 'early_stopping': False,
 'interaction_cst': None,
 'l2_regularization': 0.35979456172848456,
 'learning_rate': 0.035719415867667643,
 'loss': 'log_loss',
 'max_bins': 10,
 'max_depth': 60,
 'max_iter': 260,
 'max_leaf_nodes': 60,
 'min_samples_leaf': 9,
 'monotonic_cst': None,
 'n_iter_no_change': 4,
 'random_state': None,
 'scoring': 'roc_auc',
 'tol': 1e-07,
 'validation_fraction': 0.4578017029077751,
 'verbose': 0,
 'warm_start': False}

In [56]:
y_preds = hist_gradient.predict_proba(X_test.drop(columns=["ROW_ID"]))[:, hist_gradient.classes_== 1].squeeze()
submission_df = pd.DataFrame({"ROW_ID": X_test["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./outputs/hist_gradient2.csv", sep=",", index=False)