In [1]:
import pandas as pd
import numpy as np

# Models
import xgboost as xgb
from sklearn.linear_model import LogisticRegression

# Processing and metrics
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, TruncatedSVD

# Optimization
from scipy.stats import uniform, loguniform
from hyperopt import fmin, tpe, hp, STATUS_OK, space_eval
from hyperopt.pyll import scope

# NLP
import gensim
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import nltk
import re

In [2]:
# Load the raw dataset from the project-relative data folder.
COMPETITION_DATA_PATH = "./data/competition_data.csv"
data = pd.read_csv(COMPETITION_DATA_PATH)

# Procesamiento de datos
Se realiza una selección de variables por filtrado y luego se realizan las conversiones de datos necesarias.

TODO:
- full_name
- tags
- title

In [3]:
# Columns removed from the feature set before any conversion.
columns_to_discard = [
    'benefit', 'deal_print_id', 'etl_version', 'full_name', 'tags',
    'item_id', 'main_picture', 'product_id', 'date',
    'site_id', 'uid', 'user_id', 'category_id', 'domain_id',
]
data.drop(columns=columns_to_discard, inplace=True)

# Boolean flag: True for every value different from the literal 'Sin garantía'.
# NOTE(review): missing values also compare as "different", so they end up True — confirm intended.
data['warranty'] = data['warranty'].ne('Sin garantía')
data['conversion'] = data['conversion'].astype(bool)

# Split the print timestamp into calendar parts, then discard the raw column.
print_time = pd.to_datetime(data["print_server_timestamp"])
data["hour"] = print_time.dt.hour
data["day"] = print_time.dt.day
data["month"] = print_time.dt.month
data.drop(columns=["print_server_timestamp"], inplace=True)

# One-hot encode the categorical columns; only `is_pdp` gets an explicit NaN indicator column.
one_hot_columns = ["listing_type_id", "logistic_type", "platform"]
data = pd.get_dummies(data, columns=one_hot_columns, dummy_na=False, dtype=int)
data = pd.get_dummies(data, columns=["is_pdp"], dummy_na=True, dtype=int)

#### Procesamiento de columna 'title' con w2v

In [4]:
# Compiled once instead of once per token; matches a leading ASCII letter.
_LEADING_ASCII_LETTER = re.compile("[A-Za-z]")


def tokenizer(raw_text):
    """
    Tokenize and normalize a raw text.

    The text is split into sentences and each sentence into word tokens.
    Tokens whose first character is not an ASCII letter are discarded and the
    remaining tokens are lower-cased.

    Args:
        raw_text (str): Unprocessed text. Any non-string value (for example
            ``None`` or a ``NaN`` coming from a missing cell) is treated as
            empty text instead of raising inside the NLTK tokenizers.

    Returns:
        list: List of sentences, where each sentence is a list of lower-cased
        word tokens. An empty list is returned for non-string input.
    """
    if not isinstance(raw_text, str):
        return []
    # NOTE(review): the filter only accepts tokens starting with A-Z/a-z, so
    # words that start with an accented letter are dropped as well, and
    # `sent_tokenize` runs with its default language — confirm both are
    # intended for this corpus.
    return [
        [
            token.lower()
            for token in word_tokenize(sentence)
            if _LEADING_ASCII_LETTER.match(token)
        ]
        for sentence in sent_tokenize(raw_text)
    ]

def average_vectors(title_tokens, model, stopwords=None):
    """
    Average the Word2Vec vectors of the tokens of one tokenized text.

    Args:
        title_tokens (list): List of sentences, each one a list of tokens.
        model (gensim.models.Word2Vec): Trained model; only ``model.wv`` is used.
        stopwords (set, optional): Tokens to leave out of the average.
            Defaults to None (nothing is excluded).

    Returns:
        numpy.ndarray: Mean of the vectors of the tokens that are in the model
        vocabulary and are not stopwords, or a zero vector of length
        ``model.wv.vector_size`` when no token qualifies.
    """
    keyed_vectors = model.wv
    excluded = () if stopwords is None else stopwords

    # Walk the nested sentence/token structure once, keeping usable vectors.
    kept_vectors = []
    for sentence in title_tokens:
        for token in sentence:
            if token in keyed_vectors and token not in excluded:
                kept_vectors.append(keyed_vectors.get_vector(token))

    if not kept_vectors:
        return np.zeros(keyed_vectors.vector_size)
    return np.array(kept_vectors).mean(0)

def PCA_K1(dataframe):
    """
    Fit a PCA on the standardized version of ``dataframe``.

    Args:
        dataframe: Data accepted by ``StandardScaler.fit_transform``.

    Returns:
        sklearn.decomposition.PCA: PCA fitted on the centered and
        unit-variance-scaled data, keeping the default number of components.

    NOTE(review): the fitted scaler is discarded, so data passed later to the
    returned PCA is not standardized the same way unless the caller
    re-creates the scaling — confirm this is intended.
    """
    standardizer = StandardScaler(with_mean=True, with_std=True)
    standardized = standardizer.fit_transform(dataframe)
    return PCA().fit(standardized)

def PCA_TRANSFORM(title_tokens, pca):
    """
    Project data with an already fitted transformer.

    Args:
        title_tokens: Data handed unchanged to ``pca.transform``.
        pca: Fitted object exposing a ``transform`` method.

    Returns:
        Whatever ``pca.transform(title_tokens)`` returns.
    """
    projected = pca.transform(title_tokens)
    return projected

In [5]:
# Spanish stopword set from the NLTK corpus.
STOP_WORDS_SP = set(stopwords.words('spanish'))

# Every title becomes a list of sentences, each sentence a list of tokens.
data["title_tokens"] = data["title"].map(tokenizer)

# Flatten titles -> sentences once; the same corpus feeds both the vocabulary
# scan and the training pass, so it is not rebuilt for each call.
title_sentences = [
    sentence
    for title in data["title_tokens"].values
    for sentence in title
]

# Word2Vec model definition (sg=1 selects skip-gram).
# NOTE(review): with workers > 1 the resulting vectors are not guaranteed to be
# identical between runs — see the gensim documentation on reproducibility.
w2v_tp = gensim.models.Word2Vec(vector_size=150,
                                window=3,
                                min_count=5,
                                negative=10,
                                sample=0.01,
                                workers=8,
                                sg=1)

# Build the vocabulary from the corpus.
w2v_tp.build_vocab(title_sentences, progress_per=10000)

# Train the model; kept as the last expression so the cell still displays
# the value returned by `train`.
w2v_tp.train(title_sentences,
             total_examples=w2v_tp.corpus_count,
             epochs=50, report_delay=1)

(74029810, 74992250)

In [6]:
# One averaged Word2Vec vector per title, stacked into a 2-D array (rows follow `data`'s row order).
title_embs = data["title_tokens"].map(lambda x: average_vectors(x, w2v_tp, STOP_WORDS_SP))
title_embs = np.array(title_embs.to_list())

# Reuse `data`'s index so the column-wise concat pairs rows by position even if
# the index is not a plain 0..n-1 range, and give the embedding columns string
# names so the frame does not mix integer and string column labels.
title_emb_df = pd.DataFrame(title_embs, index=data.index).add_prefix("title_w2v_")
data = pd.concat([data, title_emb_df], axis=1)

In [7]:
# The raw title and its tokens are no longer needed once the embedding columns exist.
title_helper_columns = ["title", "title_tokens"]
data.drop(columns=title_helper_columns, inplace=True)

# Entrenamiento del modelo

In [8]:
# Rows without a ROW_ID are used for training; rows with a ROW_ID are the ones scored for the submission.
has_no_row_id = data["ROW_ID"].isna()
train_data = data[has_no_row_id]
test_data = data[~has_no_row_id]

# Target and features for the labelled rows.
y_train = train_data["conversion"]
x_train = train_data.drop(columns=["conversion", "ROW_ID"])

# Hold out 20% of the labelled rows for validation (fixed seed).
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=3456,
)

# ROW_ID stays in X_test because it is needed to build the submission file.
X_test = test_data.drop(columns=["conversion"])

In [9]:
# Fixed XGBoost hyperparameters, one entry per line for readability.
params = {
    'colsample_bylevel': 0.8219490559795931,
    'colsample_bynode': 0.24759470484372925,
    'colsample_bytree': 0.7182253566785832,
    'eta': 0.10879724556393319,
    'gamma': 0.013905544512424909,
    'max_depth': 7,
    'min_child_weight': 9.0,
    'n_estimators': 160,
    'scale_pos_weight': 1,
    'subsample': 0.7282346960910877,
}

In [10]:
# Binary classifier scored with AUC; the remaining settings come from `params`.
clf = xgb.XGBClassifier(objective='binary:logistic', seed=100, eval_metric='auc', **params)

# The validation split is passed as eval_set so its AUC is printed per boosting round.
validation_sets = [(X_val, Y_val)]
clf.fit(X_train, Y_train, eval_set=validation_sets, verbose=True)

[0]	validation_0-auc:0.86347
[1]	validation_0-auc:0.87033
[2]	validation_0-auc:0.86600
[3]	validation_0-auc:0.85524
[4]	validation_0-auc:0.86675
[5]	validation_0-auc:0.87244
[6]	validation_0-auc:0.87506
[7]	validation_0-auc:0.87595
[8]	validation_0-auc:0.87764
[9]	validation_0-auc:0.87804
[10]	validation_0-auc:0.87887
[11]	validation_0-auc:0.87936
[12]	validation_0-auc:0.88010
[13]	validation_0-auc:0.88011
[14]	validation_0-auc:0.88036
[15]	validation_0-auc:0.88112
[16]	validation_0-auc:0.88139
[17]	validation_0-auc:0.88158
[18]	validation_0-auc:0.88184
[19]	validation_0-auc:0.88211
[20]	validation_0-auc:0.88238
[21]	validation_0-auc:0.88234
[22]	validation_0-auc:0.88263
[23]	validation_0-auc:0.88279
[24]	validation_0-auc:0.88289
[25]	validation_0-auc:0.88320
[26]	validation_0-auc:0.88352
[27]	validation_0-auc:0.88364
[28]	validation_0-auc:0.88378
[29]	validation_0-auc:0.88433
[30]	validation_0-auc:0.88484
[31]	validation_0-auc:0.88497
[32]	validation_0-auc:0.88508
[33]	validation_0-au

In [11]:
# Score the test rows; ROW_ID is an identifier, not a feature, so it is removed first.
scoring_features = X_test.drop(columns=["ROW_ID"])
positive_class_mask = clf.classes_ == 1
y_preds = clf.predict_proba(scoring_features)[:, positive_class_mask].squeeze()

# Submission file: integer ROW_ID plus the predicted probability of the class selected above.
submission_df = pd.DataFrame({"ROW_ID": X_test["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./outputs/w2v.csv", sep=",", index=False)

In [67]:
# Importance of every training feature as reported by the fitted model, highest first.
feature_importance = (
    pd.DataFrame({'feature': X_train.columns, 'importance': clf.feature_importances_})
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)
print(feature_importance)

                          feature  importance
0                      is_pdp_nan    0.311375
1                    is_pdp_False    0.247014
2                          offset    0.028327
3           platform_/web/desktop    0.022255
4                     is_pdp_True    0.018905
..                            ...         ...
149                       boosted    0.000000
150     logistic_type_xd_drop_off    0.000000
151         logistic_type_default    0.000000
152  listing_type_id_gold_special    0.000000
153           accepts_mercadopago    0.000000

[154 rows x 2 columns]
