In [1]:
import pandas as pd
import numpy as np

# Models
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

# Processing and metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.decomposition import PCA, TruncatedSVD

# Optimization
from scipy.stats import uniform, loguniform
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from hyperopt.pyll import scope

# NLP
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import re

In [2]:
data = pd.read_csv("./data/competition_data.csv")

# Procesamiento de datos
Se realiza una selección de variables por filtering. Luego se realizan las conversiones de datos necesarias.

TODO:
- Agregar PolynomialFeatures?

#### Filtering e ingenieria de atributos

In [3]:
data.drop(columns=['benefit', 'deal_print_id','etl_version',
                   'item_id', 'main_picture', 'product_id', 'date',
                   'site_id','uid', 'user_id', 'category_id', 'domain_id'], axis= "columns", inplace=True)

Extraemos la cantidad de dias de *warranty* y reemplazamos columna con esos valores

In [4]:
def extract_warr_days(text):
    if len(text) > 80:
        return 0
    matchD = re.search(r'(\d+)\s+d(i|í)as', text)
    matchM = re.search(r'(\d+)\s+m(e|é)ses', text)
    matchY = re.search(r'(\d+)\s+a(n|ñ)os', text)
    if matchD:
        return int(matchD.group(1))
    elif matchM:
        return int(matchM.group(1))*30
    elif matchY:
        return int(matchY.group(1))*365
    else:
        return None
    
data['warranty'] = data['warranty'].fillna("Sin garantia").apply(lambda x: extract_warr_days(x)).fillna(0)

Convertimos la columna de conversiones a bool. Separamos la fecha en diferente columnas.

In [5]:
# Y casting to bool
data['conversion'] = data['conversion'].astype('bool')

# Date processing
data["print_server_timestamp"] = pd.to_datetime(data["print_server_timestamp"])
data["hour"] = data["print_server_timestamp"].dt.hour
data["day"] = data["print_server_timestamp"].dt.day
data["month"] = data["print_server_timestamp"].dt.month
data["day_of_week"] = data["print_server_timestamp"].dt.dayofweek
data['week_of_month'] = data['print_server_timestamp'].apply(lambda d: (d.day-1) // 7 + 1)
data.drop(columns=["print_server_timestamp"], axis= "columns", inplace=True)

Transformamos las variables categoricas con OHE. Para la variable *is_pdp*, agregamos un encoding para los valores faltantes.

In [6]:
data = pd.get_dummies(data, columns = ["listing_type_id", "logistic_type", "platform"], dummy_na = False, dtype = int)
data = pd.get_dummies(data, columns = ["is_pdp"], dummy_na = True, dtype = int)

La columna tags posee una lista de tags. creamos una columna para cada uno de los 18 tags que existe, y codificamos segun los tags presente en cada observacion

In [7]:
data['tags'] = data['tags'].str.strip('][').str.split(', ')
tags = set()
for i in data['tags']:
    for tag in i:
        tags.add("tag_" + tag)
tags = list(tags)

for tag in tags:
    data[tag] = data['tags'].apply(lambda x: 1 if tag in x else 0)

data = data.drop(columns=['tags'], axis= "columns")

La columna full_name posee una cadena de la categoria y subcategorias en las que se encuentra el producto. Extramos la primera de las categorias (mas amplia) y hacemos OHE.

In [8]:
def extract_first_name(x):
    names = x.split("->")
    if len(names) > 1:
        return names[0][:-1]
    else:
        return None

data['category_name'] = data['full_name'].apply(lambda x: extract_first_name(x))
data = data.drop(columns=['full_name'], axis= "columns")
data = pd.get_dummies(data, columns = ["category_name"], dummy_na = True, dtype = int)

Hacemos polynomial features solo con interacciones con algunas de las columnas de int (las que parecian con algo de sentido)

In [15]:
pf = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
int_cols = data.select_dtypes(include=['int']).columns
pf.fit(data[int_cols[:11]])
pf_data = pf.transform(data[int_cols[:11]])
pf_data = pd.DataFrame(pf_data, columns=pf.get_feature_names_out(int_cols[:11]))
data = pd.concat((data, pf_data.iloc[:, 11:]), axis=1)

#### Procesamiento de columna 'title' con w2v

In [16]:
def tokenizer(raw_text):
    """
    Tokeniza y preprocesa un texto.

    Args:
        raw_text (str): Texto sin procesar.

    Returns:
        list: Lista de oraciones, donde cada oración es una lista de palabras.
    """
    sentences = sent_tokenize(raw_text)
    sentences = [word_tokenize(e) for e in sentences]
    sentences = [[e2 for e2 in e1 if re.compile("[A-Za-z]").search(e2[0])] for e1 in sentences]
    sentences = [[e2.lower() for e2 in e1] for e1 in sentences]
    return(sentences)

def average_vectors(title_tokens, model, stopwords=None):
    """
    Calcula el vector promedio de un conjunto de tokens utilizando un modelo Word2Vec.

    Args:
        title_tokens (list): Lista de tokens.
        model (gensim.models.Word2Vec): Modelo Word2Vec.
        stopwords (set, optional): Conjunto de palabras stopwords. Defaults to None.

    Returns:
        numpy.ndarray: Vector promedio.
    """
    title_tokens = [e2 for e1 in title_tokens for e2 in e1]
    title_tokens = [e for e in title_tokens if e in model.wv]
    if stopwords is not None:
        title_tokens = [e for e in title_tokens if e not in stopwords]
    if len(title_tokens) == 0:
        output = np.zeros(model.wv.vector_size)
    else:
        output = np.array([model.wv.get_vector(e) for e in title_tokens]).mean(0)
    return output

def PCA_K1(dataframe):
    scaler = StandardScaler(with_std=True, with_mean=True)
    pca = PCA()
    pca.fit(scaler.fit_transform(dataframe))
    return pca


In [18]:
STOP_WORDS_SP = set(stopwords.words('spanish'))
data["title_tokens"] = data["title"].map(tokenizer)

# Creación del modelo Word2Vec
w2v_tp = gensim.models.Word2Vec(vector_size=30,
                                window=3,
                                min_count=5,
                                negative=10,
                                sample=0.01,
                                workers=8,
                                sg=1)

# Creación del vocabulario a partir del corpus
w2v_tp.build_vocab([e2 for e1 in data["title_tokens"].values for e2 in e1],
                   progress_per=10000)

# Entrenamiento del modelo Word2Vec
w2v_tp.train([e2 for e1 in data["title_tokens"].values for e2 in e1],
             total_examples=w2v_tp.corpus_count,
             epochs=50, report_delay=1)

(74029964, 74992250)

In [20]:
title_embs = data["title_tokens"].map(lambda x: average_vectors(x, w2v_tp, STOP_WORDS_SP))
title_embs = np.array(title_embs.to_list())
data = pd.concat([data, pd.DataFrame(title_embs, columns=[f'emb_{i}' for i in range(30)])], axis=1)

In [21]:
data.drop(columns=["title", "title_tokens"], axis= "columns", inplace=True)

In [22]:
data.to_csv("./data/competition_data_ready.csv", index=False)

# Entrenamiento del modelo

In [2]:
data = pd.read_csv("./data/competition_data_ready.csv")

train_data = data[data["ROW_ID"].isna()]
test_data = data[data["ROW_ID"].notna()]

train_data.columns = train_data.columns.astype(str)
test_data.columns = test_data.columns.astype(str)

x_train = train_data.drop(columns=["conversion", "ROW_ID"])
y_train = train_data["conversion"]

scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)

X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=3456)
X_test = test_data.drop(columns=["conversion", "ROW_ID"])

X_test = scaler.transform(X_test)

Se crea un conjunto más pequeño para hacer optimización de hiperparametros

In [23]:
space2 = {
        "n_estimators": scope.int(hp.uniform("n_estimators", 10, 1000)),
        "learning_rate": hp.loguniform('learning_rate', -5, 0),
        "max_depth": scope.int(hp.quniform('max_depth', 2, 10, 1)),
        "min_child_weight": hp.quniform("min_child_weight", 1, 10, 1),
        "subsample": hp.uniform('subsample', 0.1, 1),
        "gamma": hp.quniform("gamma", 0.05, 3, 0.05),
        "colsample_bytree": hp.quniform("colsample_bytree", 0.4, 1, 0.05),
        "reg_lambda": hp.quniform("reg_lambda", 0.01, 2, 0.01),
        "reg_alpha": hp.quniform("reg_alpha", 0, 10, 1),
    }

X_train_min = X_train[:10000]
Y_train_min = Y_train[:10000]

def objective(params):
    tree = xgb.XGBClassifier(**params, random_state = 22)
    score = cross_val_score(tree, X_train_min, Y_train_min, cv = KFold(4)).mean() # Aplicamos validación cruzada con 4 folds.
    return {'loss': 1 - score, 'status': STATUS_OK}


In [26]:
best = fmin(objective, space = space2,
            algo = tpe.suggest,
            max_evals = 10,
            rstate = np.random.default_rng(22))

  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

 78%|███████▊  | 39/50 [31:52<09:59, 54.51s/trial, best loss: 0.09099999999999997]

In [14]:
best_params = space_eval(space2, best)
print("BEST PARAMS: ", best_params)

BEST PARAMS:  {'early_stopping': True, 'l2_regularization': 0.23693924419361373, 'learning_rate': 0.03722951048080424, 'max_bins': 20, 'max_depth': 11, 'max_leaf_nodes': 10, 'min_samples_leaf': 55, 'validation_fraction': 0.13753150359451294, 'warm_start': False}


In [3]:
best_params_found = {'early_stopping': False, 'l2_regularization': 0.35979456172848456, 'learning_rate': 0.035719415867667643, 'max_bins': 10, 'max_depth': 60, 'max_iter': 260, 'max_leaf_nodes': 60, 'min_samples_leaf': 9, 'n_iter_no_change': 4, 'validation_fraction': 0.4578017029077751, 'warm_start': False}

clf = xgb.XGBClassifier(**best_params_found, random_state = 22)
clf.fit(X_train, Y_train)
print(clf.score(X_val, Y_val))
print(roc_auc_score(Y_val, clf.predict_proba(X_val)[:, clf.classes_== 1]))

0.9111000470223771
0.8954204202490732


In [6]:
cv = KFold(n_splits=5, random_state=0, shuffle=True)
scores = cross_val_score(clf, x_train, y_train, cv=cv, scoring="roc_auc")
print(scores.mean())

0.8909319935679475


In [21]:
clf.fit(x_train, y_train)

In [24]:
y_preds = clf.predict_proba(X_test.drop(columns=["ROW_ID"]))[:, clf.classes_== 1].squeeze()
submission_df = pd.DataFrame({"ROW_ID": X_test["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./outputs/w2v.csv", sep=",", index=False)



In [67]:
feature_importance = pd.DataFrame({'feature': X_train.columns, 'importance': clf.feature_importances_})
feature_importance.sort_values(by='importance', ascending=False, inplace=True)
feature_importance.reset_index(drop=True, inplace=True)
print(feature_importance)

                          feature  importance
0                      is_pdp_nan    0.311375
1                    is_pdp_False    0.247014
2                          offset    0.028327
3           platform_/web/desktop    0.022255
4                     is_pdp_True    0.018905
..                            ...         ...
149                       boosted    0.000000
150     logistic_type_xd_drop_off    0.000000
151         logistic_type_default    0.000000
152  listing_type_id_gold_special    0.000000
153           accepts_mercadopago    0.000000

[154 rows x 2 columns]
