In [1]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,  HistGradientBoostingClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from tqdm import tqdm
from sklearn.metrics import roc_auc_score



In [2]:
# Load the raw competition table and turn it into a model-ready feature frame.
data = pd.read_csv("./data/competition_data.csv")

# Columns removed up front; they are not used by any of the models below.
data = data.drop(
    columns=[
        "benefit", "category_id", "deal_print_id", "etl_version", "full_name",
        "product_id", "item_id", "main_picture", "site_id", "uid", "user_id",
        "title", "tags", "warranty",
    ]
)

# NOTE(review): casting to bool maps missing values to True, so any row whose
# `conversion` is NaN ends up labelled True -- confirm no such row feeds training.
data["conversion"] = data["conversion"].astype("bool")

# Parse the print timestamp, derive calendar features from it, then discard the
# raw timestamp together with `date` and `domain_id`.
# (A minute-of-hour feature is intentionally not extracted.)
data = (
    data
    .assign(
        print_server_timestamp=lambda frame: pd.to_datetime(frame["print_server_timestamp"])
    )
    .assign(
        hour=lambda frame: frame["print_server_timestamp"].dt.hour,
        day=lambda frame: frame["print_server_timestamp"].dt.day,
        month=lambda frame: frame["print_server_timestamp"].dt.month,
    )
    .drop(columns=["print_server_timestamp", "date", "domain_id"])
)

# One-hot encode the categorical columns as 0/1 integers (no NaN indicator) ...
data = pd.get_dummies(
    data,
    columns=["listing_type_id", "logistic_type", "platform"],
    dummy_na=False,
    dtype=int,
)
# ... and `is_pdp` as booleans, with an extra indicator column for missing values.
data = pd.get_dummies(data, columns=["is_pdp"], dummy_na=True, dtype=bool)

In [3]:
# Rows without a ROW_ID make up the training set; rows with one make up the test set.
train_data = data.loc[data["ROW_ID"].isna()]
test_data = data.loc[data["ROW_ID"].notna()]

# Target and feature matrix for the training rows.
y_train = train_data["conversion"]
x_train = train_data.drop(["conversion", "ROW_ID"], axis="columns")

# Fixed-seed 80/20 hold-out split of the training rows.
# Note the capitalised names (X_train/Y_train) are the 80% subset, while the
# lowercase names (x_train/y_train) are the full training set.
X_train, X_val, Y_train, Y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=3456,
)

# NOTE(review): unlike x_train, X_test still contains ROW_ID -- presumably kept
# for building a submission; drop it before passing X_test to a fitted model.
y_test = test_data["conversion"]
X_test = test_data.drop(["conversion"], axis="columns")

In [16]:
# Logistic-loss linear model trained with SGD: scale -> mean-impute -> classify.
# Step names match the ones make_pipeline would generate.
clf_sgd = Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("simpleimputer", SimpleImputer(strategy="mean")),
        (
            "sgdclassifier",
            SGDClassifier(loss="log_loss", max_iter=1000, tol=1e-3, random_state=42),
        ),
    ]
)

# 5-fold shuffled cross-validation on the full training set, scored by ROC AUC.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(estimator=clf_sgd, X=x_train, y=y_train, scoring="roc_auc", cv=cv)
print(scores, scores.mean())

[0.85521922 0.85157454 0.8588638  0.85373505 0.85559406] 0.8549973323937413


In [17]:
# k-nearest-neighbours (k=50, uniform weights): scale -> mean-impute -> classify.
# Step names match the ones make_pipeline would generate.
knn = Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("simpleimputer", SimpleImputer(strategy="mean")),
        ("kneighborsclassifier", KNeighborsClassifier(n_neighbors=50, weights="uniform")),
    ]
)

# 5-fold shuffled cross-validation on the full training set, scored by ROC AUC.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(estimator=knn, X=x_train, y=y_train, scoring="roc_auc", cv=cv)
print(scores, scores.mean())

[0.84894193 0.84575146 0.85096472 0.8444464  0.84323644] 0.8466681893446184


In [18]:
# Logistic regression: scale -> mean-impute -> classify.
# Step names match the ones make_pipeline would generate.
log_reg = Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("simpleimputer", SimpleImputer(strategy="mean")),
        ("logisticregression", LogisticRegression(max_iter=1000)),
    ]
)

# 5-fold shuffled cross-validation on the full training set, scored by ROC AUC.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(estimator=log_reg, X=x_train, y=y_train, scoring="roc_auc", cv=cv)
print(scores, scores.mean())

[0.86233858 0.86426922 0.86425809 0.858776   0.8596858 ] 0.8618655366509473


In [19]:
# XGBoost with 100 trees behind a scaler.
# There is deliberately no imputation step in this pipeline, so missing values
# reach the booster as NaN (XGBoost accepts missing values natively).
# Step names match the ones make_pipeline would generate.
xgb_clas = Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("xgbclassifier", xgb.XGBClassifier(n_estimators=100)),
    ]
)

# 5-fold shuffled cross-validation on the full training set, scored by ROC AUC.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(estimator=xgb_clas, X=x_train, y=y_train, scoring="roc_auc", cv=cv)
print(scores, scores.mean())

  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):
  if is_sparse(data):


[0.88778413 0.8890543  0.88820217 0.88539566 0.88694771] 0.8874767960586262


In [10]:
# Disabled experiment: an RBF-kernel SVM fitted on X_train / Y_train and scored
# on the hold-out split (X_val / Y_val) rather than with the 5-fold
# cross-validation used by the other model cells.
# NOTE(review): if this is re-enabled, bind the pipeline to a name other than
# `svm` -- rebinding `svm` shadows the `from sklearn import svm` import, so
# re-running the cell would fail on `svm.SVC`.
# svm = make_pipeline(StandardScaler(),
#                     SimpleImputer(strategy='mean'),
#                     svm.SVC(kernel='rbf'))
# svm.fit(X_train, Y_train)
# print(svm.score(X_val, Y_val))

In [20]:
# Gradient boosting (scikit-learn): scale -> mean-impute -> classify.
# Step names match the ones make_pipeline would generate.
grad_boosting = Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("simpleimputer", SimpleImputer(strategy="mean")),
        ("gradientboostingclassifier", GradientBoostingClassifier(random_state=0)),
    ]
)

# 5-fold shuffled cross-validation on the full training set, scored by ROC AUC.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(estimator=grad_boosting, X=x_train, y=y_train, scoring="roc_auc", cv=cv)
print(scores, scores.mean())

[0.8768275  0.87825239 0.87803394 0.87379062 0.87517396] 0.8764156816695359


In [21]:
# Random forest with 200 trees: scale -> mean-impute -> classify.
# Step names match the ones make_pipeline would generate.
rand_for = Pipeline(
    steps=[
        ("standardscaler", StandardScaler()),
        ("simpleimputer", SimpleImputer(strategy="mean")),
        (
            "randomforestclassifier",
            RandomForestClassifier(n_estimators=200, random_state=0),
        ),
    ]
)

# 5-fold shuffled cross-validation on the full training set, scored by ROC AUC.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
scores = cross_val_score(estimator=rand_for, X=x_train, y=y_train, scoring="roc_auc", cv=cv)
print(scores, scores.mean())

[0.86443793 0.8651955  0.86581387 0.86061732 0.8659184 ] 0.8643966033979809


In [7]:
# Disabled loop: cross-validate every estimator in `models` with 5-fold shuffled
# KFold and store the fold scores in `KFold_Score`, keyed by the label at the
# same position in `classifiers`.
# NOTE(review): `models`, `classifiers` and `KFold_Score` are not defined in the
# visible cells, and the scorer string 'roc-auc' differs from the "roc_auc" used
# by the live cells -- fix both before re-enabling.
# j = 0
# for i in tqdm(models):
#     model = i
#     cv = KFold(n_splits=5, random_state=0, shuffle=True)
#     KFold_Score[classifiers[j]] = (cross_val_score(model, X_train, Y_train, scoring = 'roc-auc', cv=cv))
#     j = j+1

  0%|          | 0/8 [00:00<?, ?it/s]