In [5]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.metrics import roc_auc_score
from scipy.stats import uniform, loguniform
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import seaborn as sns
from google.colab import files

In [6]:
data = pd.read_csv("./competition_data.csv")

data.drop(columns=['benefit', 'deal_print_id','etl_version', 'full_name',
                   'warranty', 'item_id', 'main_picture',
                    'site_id','uid', 'user_id', 'category_id', 'title', 'tags'], axis= "columns", inplace=True)

data['is_pdp'] = data['is_pdp'].astype('bool')
data['conversion'] = data['conversion'].astype('bool')
data["print_server_timestamp"] = pd.to_datetime(data["print_server_timestamp"])
data["hour"] = data["print_server_timestamp"].dt.hour
data["day"] = data["print_server_timestamp"].dt.day
data["minute"] = data["print_server_timestamp"].dt.minute
data["month"] = data["print_server_timestamp"].dt.month
data.drop(columns=["print_server_timestamp", 'date', "domain_id"], axis= "columns", inplace=True)
data = pd.get_dummies(data,columns = ["listing_type_id", "logistic_type", "platform"],dummy_na = False, dtype = int)

In [7]:
train_data = data[data["ROW_ID"].isna()]
test_data = data[data["ROW_ID"].notna()]

x_train = train_data.drop(columns=["conversion", "ROW_ID"])
y_train = train_data["conversion"]
X_train, X_val, Y_train, Y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=3456)
X_test = test_data.drop(columns=["conversion"])

In [8]:
params = {
    'colsample_bytree': uniform(0.5, 0.5),
    'gamma': uniform(0.3, 0.5),
    'learning_rate': loguniform(1e-3, 1e-1),
    'max_depth': list(range(5, 15)),
    'n_estimators': list(range(10, 30, 1)),
    'subsample': uniform(0.5, 0.5)
}

rs = RandomizedSearchCV(estimator = xgb.XGBClassifier(objective = 'binary:logistic', seed = 100, eval_metric = 'auc'),
                        param_distributions = params,
                        n_iter = 100,
                        cv = KFold(4),
                        random_state = 22)

rs.fit(x_train, y_train)
print(rs.best_score_)
print(rs.best_params_)

0.9101963322756718
{'colsample_bytree': 0.8998059277695778, 'gamma': 0.6773416677178381, 'learning_rate': 0.05178879822767806, 'max_depth': 11, 'n_estimators': 28, 'subsample': 0.8543529829350351}


In [10]:
clf = xgb.XGBClassifier(
    objective = 'binary:logistic',
    seed = 100,
    eval_metric = 'auc',
    **rs.best_params_)

clf.fit(x_train, y_train, verbose = True)

In [11]:
y_preds = clf.predict_proba(X_test.drop(columns=["ROW_ID"]))[:, clf.classes_== 1].squeeze()
submission_df = pd.DataFrame({"ROW_ID": X_test["ROW_ID"], "conversion": y_preds})
submission_df["ROW_ID"] = submission_df["ROW_ID"].astype(int)
submission_df.to_csv("./randomized_search.csv", sep=",", index=False)

In [12]:
files.download('./randomized_search.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>