In [11]:
import os
import gc
import json
import time
from datetime import datetime
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings('ignore')

#from custom_functions import process_dfs, preprocess

In [3]:
def process_dfs(train_df, test_df):
    print("Processing dfs...")
    print("Dropping repeated columns...")
    columns = [col for col in train_df.columns if train_df[col].nunique() > 1]

    train_df = train_df[columns]
    test_df = test_df[columns]

    trn_len = train_df.shape[0]

    merged_df = pd.concat([train_df, test_df])

    merged_df['diff_visitId_time'] = merged_df['visitId'] - merged_df['visitStartTime']
    merged_df['diff_visitId_time'] = (merged_df['diff_visitId_time'] != 0).astype(int)
    del merged_df['visitId']

    print("Generating date columns...")
    format_str = '%Y%m%d'
    merged_df['formated_date'] = merged_df['date'].apply(lambda x: datetime.strptime(str(x), format_str))
    merged_df['WoY'] = merged_df['formated_date'].apply(lambda x: x.isocalendar()[1])
    merged_df['month'] = merged_df['formated_date'].apply(lambda x:x.month)
    merged_df['quarter_month'] = merged_df['formated_date'].apply(lambda x:x.day//8)
    merged_df['weekday'] = merged_df['formated_date'].apply(lambda x:x.weekday())

    del merged_df['date']
    del merged_df['formated_date']

    merged_df['formated_visitStartTime'] = merged_df['visitStartTime'].apply(lambda x: time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(x)))
    merged_df['formated_visitStartTime'] = pd.to_datetime(merged_df['formated_visitStartTime'])
    merged_df['visit_hour'] = merged_df['formated_visitStartTime'].apply(lambda x: x.hour)

    del merged_df['visitStartTime']
    del merged_df['formated_visitStartTime']

    print("Encoding columns with pd.factorize()")
    for col in merged_df.columns:
        if col in ['fullVisitorId', 'month', 'quarter_month', 'weekday', 'visit_hour', 'WoY']: continue
        if merged_df[col].dtypes == object or merged_df[col].dtypes == bool: merged_df[col], indexer = pd.factorize(merged_df[col])

    print("Splitting back...")
    train_df = merged_df[:trn_len]
    test_df = merged_df[trn_len:]
    print("Done!")

    return train_df, test_df

In [4]:
def preprocess():
    train_df = pd.read_csv('train-flattened.csv', dtype = {'fullVisitorId' : np.str})
    test_df = pd.read_csv('test-flattened.csv', dtype = {'fullVisitorId' : np.str})

    target = train_df['totals.transactionRevenue'].fillna(0).astype(float)
    target = target.apply(lambda x: np.log1p(x))

    del train_df['totals.transactionRevenue']

    train_df, test_df = process_dfs(train_df, test_df)
    train_df.to_csv('train-flat-clean.csv', index=False)
    test_df.to_csv('test-flat-clean.csv', index=False)
    target.to_csv('target.csv', index=False)

In [5]:
preprocess()

Processing dfs...
Dropping repeated columns...
Generating date columns...
Encoding columns with pd.factorize()
Splitting back...
Done!


In [9]:
def rmse(y_true, y_pred):
    return round(np.sqrt(mean_squared_error(y_true, y_pred)), 5)

def load_preprocessed_dfs(drop_full_visitor_id=True):

    X_train = pd.read_csv('train-flat-clean.csv', converters={'fullVisitorId': str})
    X_test = pd.read_csv('test-flat-clean.csv', converters={'fullVisitorId': str})
    y_train = pd.read_csv('target.csv', names=['LogRevenue']).T.squeeze()
    
    # This is the only `object` column, we drop it for train and evaluation
    if drop_full_visitor_id: 
        X_train = X_train.drop(['fullVisitorId'], axis=1)
        X_test = X_test.drop(['fullVisitorId'], axis=1)
    return X_train, y_train, X_test

In [12]:
X, y, X_test = load_preprocessed_dfs()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, random_state=1)

print(f"Train shape: {X_train.shape}")
print(f"Validation shape: {X_val.shape}")
print(f"Test (submit) shape: {X_test.shape}")

Train shape: (1452086, 28)
Validation shape: (256251, 28)
Test (submit) shape: (401589, 28)


In [13]:
def run_lgb(X_train, y_train, X_val, y_val, X_test):
    
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.005,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgb_train_data = lgb.Dataset(X_train, label=y_train)
    lgb_val_data = lgb.Dataset(X_val, label=y_val)

    model = lgb.train(params, lgb_train_data, 
                      num_boost_round=5000,
                      valid_sets=[lgb_train_data, lgb_val_data],
                      early_stopping_rounds=100,
                      verbose_eval=500)

    y_pred_train = model.predict(X_train, num_iteration=model.best_iteration)
    y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
    y_pred_submit = model.predict(X_test, num_iteration=model.best_iteration)

    print(f"LGBM: RMSE val: {rmse(y_val, y_pred_val)}  - RMSE train: {rmse(y_train, y_pred_train)}")
    return y_pred_submit, model

In [14]:
%%time
# Train LGBM and generate predictions
lgb_preds, lgb_model = run_lgb(X_train, y_train, X_val, y_val, X_test)

Training until validation scores don't improve for 100 rounds.
[500]	training's rmse: 1.54423	valid_1's rmse: 1.54925
[1000]	training's rmse: 1.51136	valid_1's rmse: 1.52649
[1500]	training's rmse: 1.49538	valid_1's rmse: 1.51993
[2000]	training's rmse: 1.48426	valid_1's rmse: 1.51742
[2500]	training's rmse: 1.47444	valid_1's rmse: 1.51563
[3000]	training's rmse: 1.46374	valid_1's rmse: 1.51284
[3500]	training's rmse: 1.45537	valid_1's rmse: 1.51164
[4000]	training's rmse: 1.44742	valid_1's rmse: 1.51102
[4500]	training's rmse: 1.44024	valid_1's rmse: 1.51045
Early stopping, best iteration is:
[4541]	training's rmse: 1.43978	valid_1's rmse: 1.51041
LGBM: RMSE val: 1.51041  - RMSE train: 1.43978
Wall time: 16min 6s


In [15]:
print("LightGBM features importance...")
gain = lgb_model.feature_importance('gain')
featureimp = pd.DataFrame({'feature': lgb_model.feature_name(), 
                   'split': lgb_model.feature_importance('split'), 
                   'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
print(featureimp[:10])

LightGBM features importance...
                       feature  split       gain
14            totals.pageviews  21137  36.398628
13                 totals.hits  22087  16.113187
8           geoNetwork.country   5494   8.035801
1                  visitNumber  13343   7.912991
12     geoNetwork.subContinent   2227   3.410221
23                         WoY  17384   3.330303
19  trafficSource.referralPath   4711   2.745126
9             geoNetwork.metro   8117   2.621198
27                  visit_hour  12882   2.233870
6              geoNetwork.city  10016   1.758559
