In [2]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,  HistGradientBoostingClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from tqdm import tqdm
from sklearn.metrics import roc_auc_score



In [4]:
# Load the competition table and turn it into a purely numeric/boolean feature
# frame in one chain: drop columns that are not used as model inputs, derive
# calendar features from the print timestamp, then one-hot encode categoricals.
columns_not_modelled = [
    'benefit', 'category_id', 'deal_print_id', 'etl_version', 'full_name',
    'product_id', 'item_id', 'main_picture', 'site_id', 'uid', 'user_id',
    'title', 'tags', 'warranty',
]
columns_dropped_after_parsing = ["print_server_timestamp", 'date', "domain_id"]

data = (
    pd.read_csv("./data/competition_data.csv")
    .drop(columns=columns_not_modelled)
    .assign(
        # NOTE(review): astype('bool') maps NaN to True; presumably only the
        # unlabelled ROW_ID rows are affected and their label is never used.
        conversion=lambda frame: frame['conversion'].astype('bool'),
        print_server_timestamp=lambda frame: pd.to_datetime(frame["print_server_timestamp"]),
        # Later keyword arguments see the parsed timestamp assigned just above.
        hour=lambda frame: frame["print_server_timestamp"].dt.hour,
        day=lambda frame: frame["print_server_timestamp"].dt.day,
        # minute=lambda frame: frame["print_server_timestamp"].dt.minute,
        month=lambda frame: frame["print_server_timestamp"].dt.month,
    )
    .drop(columns=columns_dropped_after_parsing)
    # Integer indicator columns, no extra column for missing values.
    .pipe(pd.get_dummies, columns=["listing_type_id", "logistic_type", "platform"],
          dummy_na=False, dtype=int)
    # is_pdp additionally gets its own indicator column for missing values.
    .pipe(pd.get_dummies, columns=["is_pdp"], dummy_na=True, dtype=bool)
)

In [5]:
# Rows carrying a ROW_ID are the ones to be scored; every other row supplies
# the labels used for fitting.
has_row_id = data["ROW_ID"].notna()
train_data = data[~has_row_id]
test_data = data[has_row_id]

y_train = train_data["conversion"]
x_train = train_data.drop(columns=["conversion", "ROW_ID"])

# 80/20 hold-out split with a fixed seed so the partition is reproducible.
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=3456,
)

# ROW_ID is kept on X_test here and dropped right before each predict call.
X_test = test_data.drop(columns=["conversion"])

In [8]:
# Linear classifier with logistic loss trained by SGD. Features are
# standardised first; the imputer then fills remaining gaps with column means.
sgd_steps = (
    StandardScaler(),
    SimpleImputer(strategy='mean'),
    SGDClassifier(max_iter=1000, tol=1e-3, loss="log_loss", random_state=42),
)
clf_sgd = make_pipeline(*sgd_steps).fit(X_train, Y_train)

# Optional 5-fold CV check (disabled):
#   cv = KFold(n_splits=5, random_state=0, shuffle=True)
#   scores = cross_val_score(clf_sgd, x_train, y_train, cv=cv, scoring="roc_auc")
#   print(scores, scores.mean())

# Keep only the probability column of the positive class as a 1-D vector.
sgd_test_proba = clf_sgd.predict_proba(X_test.drop(columns=["ROW_ID"]))
y_preds_clf_sgd = sgd_test_proba[:, clf_sgd.classes_ == 1].squeeze()

In [9]:
# 50-nearest-neighbour classifier with uniform votes, applied to standardised
# and mean-imputed features.
knn_steps = (
    StandardScaler(),
    SimpleImputer(strategy='mean'),
    KNeighborsClassifier(n_neighbors=50, weights="uniform"),
)
knn = make_pipeline(*knn_steps).fit(X_train, Y_train)

# Optional 5-fold CV check (disabled):
#   cv = KFold(n_splits=5, random_state=0, shuffle=True)
#   scores = cross_val_score(knn, x_train, y_train, cv=cv, scoring="roc_auc")
#   print(scores, scores.mean())

# Keep only the probability column of the positive class as a 1-D vector.
knn_test_proba = knn.predict_proba(X_test.drop(columns=["ROW_ID"]))
y_preds_knn = knn_test_proba[:, knn.classes_ == 1].squeeze()

Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [10]:
# Logistic regression on standardised, mean-imputed features; the iteration
# cap is set to 1000.
log_reg_steps = (
    StandardScaler(),
    SimpleImputer(strategy='mean'),
    LogisticRegression(max_iter = 1000),
)
log_reg = make_pipeline(*log_reg_steps).fit(X_train, Y_train)

# Optional 5-fold CV check (disabled):
#   cv = KFold(n_splits=5, random_state=0, shuffle=True)
#   scores = cross_val_score(log_reg, x_train, y_train, cv=cv, scoring="roc_auc")
#   print(scores, scores.mean())

# Keep only the probability column of the positive class as a 1-D vector.
log_reg_test_proba = log_reg.predict_proba(X_test.drop(columns=["ROW_ID"]))
y_preds_log_reg = log_reg_test_proba[:, log_reg.classes_ == 1].squeeze()

In [None]:
# xgb_clas = make_pipeline(StandardScaler(),
#                     #SimpleImputer(strategy='mean'),
#                     xgb.XGBClassifier(n_estimators=100))
# cv = KFold(n_splits=5, random_state=0, shuffle=True)
# scores = cross_val_score(xgb_clas, x_train, y_train, cv=cv, scoring="roc_auc")
# print(scores, scores.mean())

In [None]:
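# Disabled experiment: RBF-kernel SVM scored on the hold-out split.
# The pipeline is bound to `svm_clf` so that it does not overwrite the imported
# `svm` module (rebinding `svm` would break this cell on a second run).
# svm_clf = make_pipeline(StandardScaler(),
#                     SimpleImputer(strategy='mean'),
#                     svm.SVC(kernel='rbf'))
# svm_clf.fit(X_train, Y_train)
# print(svm_clf.score(X_val, Y_val))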

In [12]:
# Gradient-boosted trees with default hyper-parameters and a fixed seed, fed
# with standardised and mean-imputed features.
grad_boosting_steps = (
    StandardScaler(),
    SimpleImputer(strategy='mean'),
    GradientBoostingClassifier(random_state=0),
)
grad_boosting = make_pipeline(*grad_boosting_steps).fit(X_train, Y_train)

# Optional 5-fold CV check (disabled):
#   cv = KFold(n_splits=5, random_state=0, shuffle=True)
#   scores = cross_val_score(grad_boosting, x_train, y_train, cv=cv, scoring="roc_auc")
#   print(scores, scores.mean())

# Keep only the probability column of the positive class as a 1-D vector.
grad_boosting_test_proba = grad_boosting.predict_proba(X_test.drop(columns=["ROW_ID"]))
y_preds_grad_boosting = grad_boosting_test_proba[:, grad_boosting.classes_ == 1].squeeze()

In [13]:
# Random forest of 200 trees with a fixed seed, fed with standardised and
# mean-imputed features.
rand_for_steps = (
    StandardScaler(),
    SimpleImputer(strategy='mean'),
    RandomForestClassifier(n_estimators=200, random_state=0),
)
rand_for = make_pipeline(*rand_for_steps).fit(X_train, Y_train)

# Optional 5-fold CV check (disabled):
#   cv = KFold(n_splits=5, random_state=0, shuffle=True)
#   scores = cross_val_score(rand_for, x_train, y_train, cv=cv, scoring="roc_auc")
#   print(scores, scores.mean())

# Keep only the probability column of the positive class as a 1-D vector.
rand_for_test_proba = rand_for.predict_proba(X_test.drop(columns=["ROW_ID"]))
y_preds_rand_for = rand_for_test_proba[:, rand_for.classes_ == 1].squeeze()

In [7]:
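# Disabled experiment: 5-fold cross-validated ROC AUC for each candidate model.
# NOTE: `models`, `classifiers` and `KFold_Score` are not defined anywhere in
# the visible notebook, and the scikit-learn scorer name is 'roc_auc'.
# j = 0
# for i in tqdm(models):
#     model = i
#     cv = KFold(n_splits=5, random_state=0, shuffle=True)
#     KFold_Score[classifiers[j]] = (cross_val_score(model, X_train, Y_train, scoring = 'roc_auc', cv=cv))
#     j = j+1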

  0%|          | 0/8 [00:00<?, ?it/s]

In [14]:
# Bring in the "conversion" column of two prediction files saved under
# ./outputs so they can be blended with the models fitted in this notebook.
xgboost_predictions_file = pd.read_csv("./outputs/hyperopt.csv")
hist_grad_predictions_file = pd.read_csv("./outputs/hist_gradient2.csv")

y_preds_xgboost = xgboost_predictions_file["conversion"]
y_preds_hist_grad = hist_grad_predictions_file["conversion"]

# Display the first loaded series as a quick sanity check.
y_preds_xgboost

0        0.030675
1        0.000123
2        0.167858
3        0.000330
4        0.276743
           ...   
19206    0.113807
19207    0.226777
19208    0.000325
19209    0.077963
19210    0.000091
Name: conversion, Length: 19211, dtype: float64

In [17]:
# Average all the per-model predictions (simple unweighted ensemble).
# The members are listed once and the divisor is derived from the list, so
# adding or removing a model cannot leave a stale hard-coded count behind.
ensemble_members = [
    y_preds_clf_sgd,
    y_preds_knn,
    y_preds_log_reg,
    y_preds_grad_boosting,
    y_preds_rand_for,
    y_preds_xgboost,
    y_preds_hist_grad,
]

# The two Series loaded from CSV are added by index label, so a length
# mismatch between them would produce NaN instead of an error. Fail loudly.
member_sizes = {int(np.size(member)) for member in ensemble_members}
assert len(member_sizes) == 1, (
    f"ensemble members have different lengths: {sorted(member_sizes)}"
)

# sum() adds left to right starting from 0, i.e. the same order as the
# original chained `+`, so the result is still a Series named "conversion".
y_preds = sum(ensemble_members) / len(ensemble_members)
y_preds

0        0.049499
1        0.000365
2        0.154394
3        0.000995
4        0.252430
           ...   
19206    0.098674
19207    0.244494
19208    0.000473
19209    0.080408
19210    0.000408
Name: conversion, Length: 19211, dtype: float64

In [24]:
# Build the submission from the identifiers that actually accompany the scored
# rows instead of a fabricated 0..19210 range. The models fitted in this
# notebook predict on X_test in this exact row order, so each prediction stays
# attached to its own ROW_ID even if the test set size or ordering changes.
# NOTE(review): the predictions loaded from ./outputs/*.csv are assumed to be
# in the same row order — confirm against the notebooks that wrote them.
submission_row_ids = X_test["ROW_ID"].astype(int).to_numpy()
submission_scores = np.asarray(y_preds, dtype=float)

assert len(submission_row_ids) == len(submission_scores), (
    f"{len(submission_row_ids)} test rows but {len(submission_scores)} predictions"
)

submission_df = pd.DataFrame({"ROW_ID": submission_row_ids, "conversion": submission_scores})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./outputs/promedios_totales.csv", sep=",", index=False)