In [1]:
import os
import gc
import json
import time
from datetime import datetime
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

In [2]:
def process_dfs(train_df, test_df):
    print("Processing dfs...")
    print("Dropping repeated columns...")
    columns = [col for col in train_df.columns if train_df[col].nunique() > 1]

    train_df = train_df[columns]
    test_df = test_df[columns]

    train_len = train_df.shape[0]

    merged_df = pd.concat([train_df, test_df])

    merged_df['total_visitId_time'] = merged_df['visitId'] - merged_df['visitStartTime']
    merged_df['total_visitId_time'] = (merged_df['total_visitId_time'] != 0).astype(int)
    del merged_df['visitId']

    print("Generating date columns...")
    
    format_str = '%Y%m%d'
    merged_df['format_date'] = merged_df['date'].apply(lambda x: datetime.strptime(str(x), format_str))
    merged_df['Week_of_Year'] = merged_df['format_date'].apply(lambda x: x.isocalendar()[1])
    merged_df['month'] = merged_df['format_date'].apply(lambda x:x.month)
    merged_df['quarter_month'] = merged_df['format_date'].apply(lambda x:x.day//8)
    merged_df['weekday'] = merged_df['format_date'].apply(lambda x:x.weekday())

    del merged_df['date']
    del merged_df['format_date']

    merged_df['format_visitStartTime'] = merged_df['visitStartTime'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x)))
    merged_df['format_visitStartTime'] = pd.to_datetime(merged_df['format_visitStartTime'])
    merged_df['visit_hour'] = merged_df['format_visitStartTime'].apply(lambda x: x.hour)

    del merged_df['visitStartTime']
    del merged_df['format_visitStartTime']

    print("Encoding columns with pd.factorize()")
    
    for col in merged_df.columns:
        if col in ['fullVisitorId', 'month', 'quarter_month', 'weekday', 'visit_hour', 'Week_of_Year']: continue
        if merged_df[col].dtypes == object or merged_df[col].dtypes == bool: merged_df[col], indexer = pd.factorize(merged_df[col])

    print("Splitting back...")
    train_df = merged_df[:train_len]
    test_df = merged_df[train_len:]
    print("Done!")

    return train_df, test_df

In [3]:
def preprocess():
    train_df = pd.read_csv('train-flattened.csv', dtype = {'fullVisitorId' : np.str})
    test_df = pd.read_csv('test-flattened.csv', dtype = {'fullVisitorId' : np.str})

    target = train_df['totals.transactionRevenue'].fillna(0).astype(float)
    target = target.apply(lambda x: np.log1p(x))

    del train_df['totals.transactionRevenue']

    train_df, test_df = process_dfs(train_df, test_df)
    train_df.to_csv('train-flat-clean.csv', index=False)
    test_df.to_csv('test-flat-clean.csv', index=False)
    target.to_csv('target.csv', index=False)

In [4]:
preprocess()

Processing dfs...
Dropping repeated columns...
Generating date columns...
Encoding columns with pd.factorize()
Splitting back...
Done!


In [5]:
def rmse(y_true, y_pred):
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)

def load_preprocessed_dfs(drop_full_visitor_id=True):

    X_train = pd.read_csv('train-flat-clean.csv', converters={'fullVisitorId': str})
    X_test = pd.read_csv('test-flat-clean.csv', converters={'fullVisitorId': str})
    y_train = pd.read_csv('target.csv', names=['LogRevenue']).T.squeeze()
    
    # This is the only `object` column, we drop it for train and evaluation
    if drop_full_visitor_id: 
        X_train = X_train.drop(['fullVisitorId'], axis=1)
        X_test = X_test.drop(['fullVisitorId'], axis=1)
    return X_train, y_train, X_test

In [6]:
X, y, X_test = load_preprocessed_dfs()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=1)

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
print(f"Test (submit) shape: {X_test.shape}")

Train shape: (1452086, 28)
Validation shape: (256251, 28)
Test (submit) shape: (401589, 28)


In [7]:
def run_lgb(X_train, y_train, X_val, y_val, X_test):
    
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : 1,
        "seed": 42
    }
    
    lgb_train_data = lgb.Dataset(X_train, label=y_train)
    lgb_val_data = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, lgb_train_data, 
                      num_boost_round=5000,
                      valid_sets=[lgb_train_data, lgb_val_data],
                      early_stopping_rounds=100,
                      verbose_eval=500)

    y_pred_train = model.predict(X_train, num_iteration=model.best_iteration)
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred_submit = model.predict(X_test, num_iteration=model.best_iteration)

    print(f"LGBM: RMSE val: {rmse(y_val, y_pred_val)}  - RMSE train: {rmse(y_train, y_pred_train)}")
    return y_pred_submit, model

In [8]:
def run_xgb(X_train, y_train, X_val, y_val, X_test):
    params = {'objective': 'reg:linear',
              'eval_metric': 'rmse',
              'eta': 0.001,
              'max_depth': 10,
              'subsample': 0.6,
              'colsample_bytree': 0.6,
              'alpha':0.001,
              'random_state': 42,
              'silent': True}

    xgb_train_data = xgb.DMatrix(X_train, y_train)
    xgb_val_data = xgb.DMatrix(X_val, y_val)
    xgb_submit_data = xgb.DMatrix(X_test)

    model = xgb.train(params, xgb_train_data, 
                      num_boost_round=2000, 
                      evals= [(xgb_train_data, 'train'), (xgb_val_data, 'valid')],
                      early_stopping_rounds=100, 
                      verbose_eval=500
                     )

    y_pred_train = model.predict(xgb_train_data, ntree_limit=model.best_ntree_limit)
    y_pred_val = model.predict(xgb_val_data, ntree_limit=model.best_ntree_limit)
    y_pred_submit = model.predict(xgb_submit_data, ntree_limit=model.best_ntree_limit)

    print(f"XGB : RMSE val: {rmse(y_val, y_pred_val)}  - RMSE train: {rmse(y_train, y_pred_train)}")
    return y_pred_submit, model

In [9]:
def run_catboost(X_train, y_train, X_val, y_val, X_test):
    model = CatBoostRegressor(iterations=1000,
                             learning_rate=0.05,
                             depth=10,
                             eval_metric='RMSE',
                             random_seed = 42,
                             bagging_temperature = 0.2,
                             od_type='Iter',
                             metric_period = 50, 
                             od_wait=20)
    model.fit(X_train, y_train,
              eval_set=(X_val, y_val),
              use_best_model=True,
              verbose=50)
    
    y_pred_train = model.predict(X_train)
    y_pred_val = model.predict(X_val)
    y_pred_submit = model.predict(X_test)

    print(f"CatB: RMSE val: {rmse(y_val, y_pred_val)}  - RMSE train: {rmse(y_train, y_pred_train)}")
    return y_pred_submit, model

In [10]:
%%time
# Train LGBM and generate predictions
lgb_preds, lgb_model = run_lgb(X_train, y_train, X_val, y_val, X_test)

Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 1.54444	valid_1's rmse: 1.5495
[1000]	training's rmse: 1.5117	valid_1's rmse: 1.52686
[1500]	training's rmse: 1.49546	valid_1's rmse: 1.52026
[2000]	training's rmse: 1.48424	valid_1's rmse: 1.51768
[2500]	training's rmse: 1.47435	valid_1's rmse: 1.51598
[3000]	training's rmse: 1.46397	valid_1's rmse: 1.51342
[3500]	training's rmse: 1.45515	valid_1's rmse: 1.5119
[4000]	training's rmse: 1.44722	valid_1's rmse: 1.51121
[4500]	training's rmse: 1.43982	valid_1's rmse: 1.51063
Early stopping, best iteration is:
[4821]	training's rmse: 1.43559	valid_1's rmse: 1.51038
LGBM: RMSE val: 1.51038  - RMSE train: 1.43559
Wall time: 18min 51s


In [11]:
%%time
# Train XGBoost and generate predictions
xgb_preds, xgb_model = run_xgb(X_train, y_train, X_val, y_val, X_test)

[0]	train-rmse:1.86942	valid-rmse:1.86609
Multiple eval metrics have been passed: 'valid-rmse' will be used for early stopping.

Will train until valid-rmse hasn't improved in 100 rounds.
[500]	train-rmse:1.68952	valid-rmse:1.69781
[1000]	train-rmse:1.59244	valid-rmse:1.61379
[1500]	train-rmse:1.53483	valid-rmse:1.56982
[1999]	train-rmse:1.49826	valid-rmse:1.54687
XGB : RMSE val: 1.54687  - RMSE train: 1.49826
Wall time: 2h 15min 39s


In [12]:
%%time
# Train Catboost and generate predictions
cat_preds, cat_model = run_catboost(X_train, y_train, X_val, y_val,  X_test)



0:	learn: 1.8293702	test: 1.8260435	best: 1.8260435 (0)	total: 1.36s	remaining: 22m 41s
50:	learn: 1.5428422	test: 1.5470487	best: 1.5470487 (50)	total: 55.8s	remaining: 17m 18s
100:	learn: 1.5191854	test: 1.5337330	best: 1.5337330 (100)	total: 1m 49s	remaining: 16m 18s
150:	learn: 1.5034843	test: 1.5286896	best: 1.5286896 (150)	total: 2m 43s	remaining: 15m 18s
200:	learn: 1.4902613	test: 1.5253440	best: 1.5253438 (199)	total: 3m 36s	remaining: 14m 21s
250:	learn: 1.4780556	test: 1.5228015	best: 1.5228015 (250)	total: 4m 30s	remaining: 13m 26s
300:	learn: 1.4673304	test: 1.5217889	best: 1.5217889 (300)	total: 5m 22s	remaining: 12m 29s
350:	learn: 1.4583313	test: 1.5206993	best: 1.5206233 (341)	total: 6m 16s	remaining: 11m 35s
400:	learn: 1.4487933	test: 1.5193488	best: 1.5193447 (398)	total: 7m 9s	remaining: 10m 41s
450:	learn: 1.4393074	test: 1.5186329	best: 1.5184995 (443)	total: 8m 2s	remaining: 9m 47s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 1.518499525
bes

In [15]:
ensemble_preds_70_30_00 = 0.7 * lgb_preds + 0.3 * cat_preds + 0.0 * xgb_preds 
ensemble_preds_70_25_05 = 0.7 * lgb_preds + 0.25 * cat_preds + 0.05 * xgb_preds 

def submission(predictions, filename='submit.csv'):
    # Takes a 1d-array of predictions and generates a submission file
    _, _, X_submit = load_preprocessed_dfs(drop_full_visitor_id=False)
    submission = X_submit[['fullVisitorId']].copy()
    submission.loc[:, 'PredictedLogRevenue'] = predictions
    grouped_test = submission[['fullVisitorId', 'PredictedLogRevenue']].groupby('fullVisitorId').sum().reset_index()
    grouped_test.to_csv(filename,index=False)

In [16]:
submission(lgb_preds, "submit-lgb.csv")
submission(xgb_preds, "submit-xgb.csv")
submission(cat_preds, "submit-cat.csv")
submission(ensemble_preds_70_30_00, "submit-ensemble-70-30-00.csv")
submission(ensemble_preds_70_25_05, "submit-ensemble-70-25-05.csv")

ensemble_preds_70_30_00_pos = np.where(ensemble_preds_70_30_00 < 0, 0, ensemble_preds_70_30_00)
submission(ensemble_preds_70_30_00_pos, "submit-ensemble-70-30-00-positive.csv")

ensemble_preds_70_25_05_pos = np.where(ensemble_preds_70_25_05 < 0, 0, ensemble_preds_70_25_05)
submission(ensemble_preds_70_25_05_pos, "submit-ensemble-70-25-05-positive.csv")