In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_columns = 999

from dateutil.relativedelta import relativedelta

In [2]:
new_train = pd.read_csv("../data/new_train.csv")

In [3]:
train = new_train.copy()
train = train.iloc[:10000].melt(id_vars="Page", value_vars=list(train.columns[1:]), var_name="Date", value_name="Visits")

In [4]:
train["Visits"].fillna(0, inplace=True)
train["Visits"] = train["Visits"].astype(int)
train["Date"] = train["Date"].astype('datetime64[ns]')
train['Year'] = train["Date"].dt.year
train['Month'] = train["Date"].dt.month
train['Day'] = train["Date"].dt.day
train['DayOfWeek'] = train["Date"].dt.dayofweek

In [5]:
def create_features_last_year(df):
    for year in [1]:
        for month in [0]:
            for day in [-1, 0, 1]:
                temp = df.copy()
                temp["Date"] = temp["Date"].apply(lambda x: x + relativedelta(years=year, months=month, days=day))
                temp.rename(columns={"Visits": "prev_Visits_{}_{}_{}".format(year, month, day)}, inplace=True)
                df = df.merge(temp[["Page", "Date", "prev_Visits_{}_{}_{}".format(year, month, day)]], on=["Page", "Date"], how='left')
                df["prev_Visits_{}_{}_{}".format(year, month, day)] = df["prev_Visits_{}_{}_{}".format(year, month, day)].fillna(0).astype(int)
    return df

In [6]:
train = create_features_last_year(train)

In [7]:
def create_validation(df, default_month):
    return df.loc[(df["Date"] <= pd.to_datetime(default_month) - relativedelta(months=1)) & \
                  (df["Date"] >= pd.to_datetime(default_month) - relativedelta(months=3))].index, \
           df.loc[(df["Date"] >= pd.to_datetime(default_month)) & \
                  (df["Date"] <= pd.to_datetime(default_month) + relativedelta(months=2))].index

In [8]:
validation_months = ['2016-11-10', '2016-12-10', '2017-01-10', '2017-02-10', '2017-03-10']
validation = []
for month in validation_months:
    validation.append(create_validation(train, month))
    print("Train:", str(train.loc[validation[-1][0], "Date"].min())[:10], "–", str(train.loc[validation[-1][0], "Date"].max())[:10],
          "    Validation:", str(train.loc[validation[-1][1], "Date"].min())[:10], "–", str(train.loc[validation[-1][1], "Date"].max())[:10])

Train: 2016-08-10 – 2016-10-10     Validation: 2016-11-10 – 2017-01-10
Train: 2016-09-10 – 2016-11-10     Validation: 2016-12-10 – 2017-02-10
Train: 2016-10-10 – 2016-12-10     Validation: 2017-01-10 – 2017-03-10
Train: 2016-11-10 – 2017-01-10     Validation: 2017-02-10 – 2017-04-10
Train: 2016-12-10 – 2017-02-10     Validation: 2017-03-10 – 2017-05-10


In [9]:
import lightgbm as lgb
param = {}
param['application'] = 'regression_l2'
param['learning_rate'] = 0.1
param['feature_fraction'] = 0.5
param['bagging_fraction'] = 0.5
param['bagging_freq'] = 1
param['max_depth'] = 5
param['num_threads'] = 4
param["verbose"] = 0

from numba import jit
import math

@jit
def smape_fast(y_true, y_pred):
    out = 0
    for i in range(y_true.shape[0]):
        a = y_true[i]
        b = y_pred[i]
        if b < 1:
            b = 0
        c = a+b
        if c == 0:
            continue
        out += math.fabs(a - b) / c
    out *= (200.0 / y_true.shape[0])
    return out

def lgb_smape(preds, df):
    labels = df.get_label()
    labels, preds = np.expm1(np.array(labels)), np.expm1(np.array(preds))
    return 'smape', smape_fast(labels, preds), False

def smape(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return np.mean(diff)

In [10]:
def create_lag_features(df, target, indexes, lags):
    temp = pd.pivot_table(df.loc[(df["Date"] > df.loc[indexes, "Date"].min() - relativedelta(days=8+lags))&
                                 (df["Date"] < df.loc[indexes, "Date"].min() - relativedelta(days=8))], 
                          index=["Page"], values=["Visits"], columns=["Date"]).reset_index()
    temp.columns = ["lag_{}".format(i)  if j[1] != "" else j[0] for i, j in enumerate(temp.columns)]
    target = target.merge(temp, how='left')
    return target

def create_agg_features(df, indexes, lags, columns, name):
    temp = df.loc[(df["Date"] > df.loc[indexes, "Date"].min() - relativedelta(months=2, days=8+lags))&
                  (df["Date"] < df.loc[indexes, "Date"].min() - relativedelta(days=8))].groupby(columns)["Visits"].agg(["median", "mean", "std", "min", "max"]).reset_index()
    temp.columns = columns + [col + name + str(lags) for col in ["median", "mean", "std", "min", "max"]]
    return temp

def creating_features(df, indexes):
    temp = df.loc[indexes].merge(create_agg_features(df, indexes, 10, ["Page", "DayOfWeek"], "_p_d_"), how='left', on=["Page", "DayOfWeek"])
    temp = temp.merge(create_agg_features(train, indexes, 10, ["Page"], "_p_"), how='left', on=["Page"])
    # create lag features
    temp = create_lag_features(train, temp, indexes, 20)
    # months from 0 to 2
    for i, j in enumerate(temp["Month"].unique()):
        temp["Month"].replace(j, i, inplace=True)
    return temp

In [11]:
results = {"train": [], "val": [], "iteration": [], "baseline": []}
for small_train, small_val in validation:
    # TRAIN & VAL
    # create aggregated features
    fold_train = creating_features(train, small_train)
    fold_val = creating_features(train, small_val)
    
    # apply log to all numeric features
    numeric_features = ["Visits"] + list(fold_train.columns[7:])
    fold_train[numeric_features] = np.log1p(fold_train[numeric_features])
    fold_val[numeric_features] = np.log1p(fold_val[numeric_features])
    
    # preparing data from model
    train_features = list(fold_train.columns[4:])
    lgb_train = lgb.Dataset(fold_train[train_features], label=fold_train["Visits"], free_raw_data=False)
    lgb_val = lgb.Dataset(fold_val[train_features], label=fold_val["Visits"], free_raw_data=False, reference=lgb_train)
    
    # model train
    model = lgb.train(param, lgb_train, 1000, valid_sets=[lgb_train, lgb_val], feval=lgb_smape, early_stopping_rounds=10, verbose_eval=0)
    
    #save results
    results['train'].append(model.best_score['training']['smape'])
    results['val'].append(model.best_score['valid_1']['smape'])
    results['iteration'].append(int(model.best_iteration))
    results["baseline"].append(smape(np.expm1(fold_val["Visits"]), np.expm1(fold_val["median_p_d_10"])))

In [12]:
pd.DataFrame.from_dict(results)

Unnamed: 0,baseline,iteration,train,val
0,46.772513,57,44.398178,44.325226
1,46.78798,31,47.555058,45.232226
2,44.599118,57,43.749827,46.21944
3,43.304028,28,44.628191,41.332748
4,41.378803,39,42.364963,39.876395
