In [12]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

Todas las variables


accepts_mercadopago Whether the item accepts Mercado Pago. Todas aceptan, la saco
available_quantity The available stock quantity at that moment
avg_gmv_item_domain_30days Average revenue generated by the items of this domain on the last month
avg_gmv_item_sel Average revenue of items of this seller
avg_gmv_seller_bday Average revenue this seller makes by day
avg_qty_orders_item_domain_30days Average number of orders a random item of this domain made on the last month
avg_qty_orders_item_sel_30days Average number of orders an item of this seller makes on the last 30 days
avg_si_item_sel_30day Average units sold of an item of this seller on the past month
benefit Ignore, should be dropped
boosted Whether the item was boosted
category_id Category of this item
conversion Target variable, it is True if this print has an attributed order
date Print date
deal_print_id Unique id for the print
domain_id Domain id for the item
etl_version Ignore, should be dropped
free_shipping Whether the item has free shipping
fulfillment Whether the item is fulfilled by MeLi
full_name Category full name
health Item health
is_pdp Whether the click landed on a PDP
product_id Product_id of the item
item_id ID of the item, useful for debugging
listing_type_id Whether the item is gold or not
logistic_type Logistic type for the item
main_picture URL for the main item picture
offset On which page the item was rendered
original_price Price from which the discount was done
platform Which platform the user is using
price Item price
print_position Position on the page
print_server_timestamp Timestamp for the print
qty_items_dom Number of items this domain has
qty_items_sel Number of items the seller has
rn Leftover from the ETL, Discard
ROW_ID Row of the submission file
site_id Site ID
sold_quantity Number of items sold at the moment of the print
tags Tags the item had at the moment of the print
title Item title
total_asp_item_domain_30days Average selling price of the items of the domain
total_asp_item_sel_30days Average selling price of all the items the seller sold on the last 30 days
total_gmv_domain_bday total_gmv_domain_30days / 30       Total revenue the domain made on the last 30 days
total_gmv_item_30days Total revenue made by the item on the lasts 30 days
total_items_domain Number of items on the domain
total_items_seller Number of items the seller has
total_orders_domain_30days Total orders on the domain
total_orders_item_30days Total orders the Item had on the last 30 days
total_orders_sel_30days Total orders for the seller
total_si_domain_30days Total units sold of this domain
total_si_item_30days Total units sold of this item
total_si_sel_30days Same for the seller
total_visits_domain Total visits on this domain
total_visits_item Total visits this item had
total_visits_seller Total visits for this seller
uid session id
user_id user id
warranty Whether the item had warranty
conversion should be predicted for those rows where ROW_ID is not missing.

In [48]:
data = pd.read_csv("./data/competition_data.csv")
data.drop(columns=['benefit', 'category_id', 'deal_print_id','etl_version', 'full_name','product_id'
                   , 'item_id', 'main_picture', 'site_id', 'uid', 'user_id', 'title', 'tags', 'warranty'],
                     axis= "columns", inplace=True)

data['conversion'] = data['conversion'].astype('bool')
data["date"] = pd.to_datetime(data["date"])
data["hour"] = data["date"].dt.hour
data["day"] = data["date"].dt.day
#data["minute"] = data["date"].dt.minute
data["month"] = data["date"].dt.month
data.drop(columns=["print_server_timestamp", 'date', "domain_id"], axis= "columns", inplace=True)
data = pd.get_dummies(data,columns = ["listing_type_id", "logistic_type", "platform"],dummy_na = False, dtype = int)
data = pd.get_dummies(data,columns = ["is_pdp"],dummy_na = True, dtype = bool )

In [49]:
train_data = data[data["ROW_ID"].isna()]
test_data = data[data["ROW_ID"].notna()]

x_train = train_data.drop(columns=["conversion", "ROW_ID"])
y_train = train_data["conversion"]
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=3456)
X_test = test_data.drop(columns=["conversion"])


In [44]:
#search space for hyperopt HistGradientBoostingClassifier()
space = {
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.5),
    'max_iter': hp.choice('max_iter', np.arange(100, 500, 10, dtype=int)),
    'max_leaf_nodes': hp.choice('max_leaf_nodes', np.arange(10, 100, 10, dtype=int)),
    'max_depth': hp.choice('max_depth', np.arange(10, 100, 10, dtype=int)),
    'min_samples_leaf': hp.choice('min_samples_leaf', np.arange(1, 10, 1, dtype=int)),
    'l2_regularization': hp.uniform('l2_regularization', 0.01, 0.5),
    'max_bins': hp.choice('max_bins', np.arange(10, 100, 10, dtype=int)),
    'validation_fraction': hp.uniform('validation_fraction', 0.01, 0.5),
    'n_iter_no_change': hp.choice('n_iter_no_change', np.arange(1, 10, 1, dtype=int)),
    #'tol': hp.uniform('tol', 0.0001, 0.001),
    #'scoring': hp.choice('scoring', ['loss', 'accuracy', 'balanced_accuracy', 'average_precision', 'f1', 'f1_micro', 'f1_macro', 'f1_weighted', 'f1_samples', 'neg_log_loss', 'precision', 'recall', 'roc_auc']),
    #'random_state': hp.choice('random_state', np.arange(1, 100, 1, dtype=int)),
    'warm_start': hp.choice('warm_start', [True, False]),
    'early_stopping': hp.choice('early_stopping', [True, False]),
    #'verbose': hp.choice('verbose', [True, False])

    

}

def objective(params):
    tree = HistGradientBoostingClassifier(**params, random_state = 22, scoring="roc_auc")
    score = cross_val_score(tree, x_train, y_train, cv = KFold(4)).mean() # Aplicamos validación cruzada con 4 folds.
    return {'loss': 1 - score, 'status': STATUS_OK}

In [46]:
best = fmin(objective, space = space,
            algo = tpe.suggest,
            max_evals = 5,
            rstate = np.random.default_rng(22))

100%|██████████| 3/3 [00:58<00:00, 19.42s/trial, best loss: 0.0887580864576425] 


In [47]:
best_params = space_eval(space, best)
print("BEST PARAMS: ", best_params)


BEST PARAMS:  {'early_stopping': False, 'l2_regularization': 0.35979456172848456, 'learning_rate': 0.035719415867667643, 'max_bins': 10, 'max_depth': 60, 'max_iter': 260, 'max_leaf_nodes': 60, 'min_samples_leaf': 9, 'n_iter_no_change': 4, 'validation_fraction': 0.4578017029077751, 'warm_start': False}


In [57]:
hist_gradient = HistGradientBoostingClassifier(**best_params,scoring="roc_auc")
hist_gradient.fit(X_train, Y_train)
print(hist_gradient.score(X_val, Y_val))
print(roc_auc_score(Y_val, hist_gradient.predict_proba(X_val)[:, hist_gradient.classes_== 1]))


0.9110170663568722
0.8925772210844479


In [58]:
cv = KFold(n_splits=5, random_state=0, shuffle=True)
scores = cross_val_score(hist_gradient, x_train, y_train, cv=cv, scoring="roc_auc")
print(scores)

[0.88701696 0.88943346 0.88824374 0.88596927 0.88562665]


In [56]:
y_preds = hist_gradient.predict_proba(X_test.drop(columns=["ROW_ID"]))[:, hist_gradient.classes_== 1].squeeze()
submission_df = pd.DataFrame({"ROW_ID": X_test["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./outputs/hist_gradient2.csv", sep=",", index=False)