In [1]:
import pandas as pd
import numpy as np

pd.options.display.max_columns = 999

from dateutil.relativedelta import relativedelta

In [2]:
# Raw inputs, read relative to the notebook's directory.
# `train` is reshaped from wide to long format in the next cell;
# `test` holds the prediction keys (its "Id" column is parsed for the date later).
train = pd.read_csv("../data/new_train.csv")
test = pd.read_csv("../data/key_2.csv")

In [3]:
# Keep the first 10,000 rows (pages) and reshape from wide (one column per
# date) to long format: one row per (Page, Date) with the count in "Visits".
train = train.iloc[:10000].melt(id_vars="Page", value_vars=list(train.columns[1:]), var_name="Date", value_name="Visits")
# Missing counts are treated as zero visits. The result is assigned back rather
# than calling fillna(..., inplace=True) on the column selection: that form
# mutates a temporary object under pandas Copy-on-Write and leaves `train`
# unchanged, which would then make the integer cast fail on NaN.
train["Visits"] = train["Visits"].fillna(0).astype(int)
train["Date"] = pd.to_datetime(train["Date"])
# Calendar parts used later as model features and as group-by keys.
train['Year'] = train["Date"].dt.year
train['Month'] = train["Date"].dt.month
train['Day'] = train["Date"].dt.day
train['DayOfWeek'] = train["Date"].dt.dayofweek

In [4]:
# Restrict the prediction keys to the same 10,000 pages kept in `train`.
# .copy() makes the subset an independent frame, so the column assignments
# below do not write into a slice of the original (SettingWithCopyWarning).
test = test.loc[test["Page"] < 10000].copy()
# "Id" has the form "<page>_<date>"; the part after the first underscore is the date.
test["Date"] = pd.to_datetime(test["Id"].str.split("_").str[1])
# Same calendar parts as the training frame.
test['Year'] = test["Date"].dt.year
test['Month'] = test["Date"].dt.month
test['Day'] = test["Date"].dt.day
test['DayOfWeek'] = test["Date"].dt.dayofweek

In [5]:
# Scratch cell: a one-pass preview of shifting the training dates forward.
# The outer two loops each have a single value and the `break` leaves the
# innermost loop after its first pass, so only (year=1, month=0, day=-1) is
# processed. The shifted, de-duplicated copy is left in the global `temp`,
# with "Visits" renamed to "prev_Visits_1_0_-1".
# NOTE(review): nothing below appears to read this `temp` before it is
# reassigned -- looks like leftover exploration; confirm before removing.
for year in [1]:
    for month in [0]:
        for day in [-1, 0, 1]:
            temp = train.copy()
            temp["Date"] = temp["Date"].apply(lambda x: x + relativedelta(years=year, months=month, days=day))
            temp = temp[~temp.duplicated(subset=["Page", "Date"], keep='first')]
            temp.rename(columns={"Visits": "prev_Visits_{}_{}_{}".format(year, month, day)}, inplace=True)
            break

In [6]:
def create_features_last_year(df, target_train, target_test, years=(1,), months=(0,), days=(-1, 0, 1)):
    """Attach visit counts from an earlier period as features.

    For every (year, month, day) combination, the dates in ``df`` are moved
    forward by that offset and the shifted "Visits" values are left-joined
    onto both targets by ("Page", "Date"). Each combination adds one integer
    column named ``prev_Visits_<year>_<month>_<day>``; rows without a match
    get 0.

    Parameters
    ----------
    df : DataFrame with "Page", "Date" (datetime) and "Visits" columns.
    target_train, target_test : DataFrames with "Page" and "Date" columns.
    years, months, days : iterables of int offsets to combine. The defaults
        reproduce the original behaviour: one year back, plus/minus one day.

    Returns
    -------
    (target_train, target_test) : new frames with the extra columns appended.
    """
    # Only these columns are needed; dropping the rest keeps the merges narrow.
    history = df[["Page", "Date", "Visits"]]
    for year in years:
        for month in months:
            for day in days:
                feature = "prev_Visits_{}_{}_{}".format(year, month, day)
                # Vectorized calendar shift (month-end clipping included),
                # instead of a Python-level relativedelta call per row.
                shifted = history.assign(Date=history["Date"] + pd.DateOffset(years=year, months=month, days=day))
                # A shift can map two source days onto one date
                # (e.g. Feb 28 and Feb 29 plus one year); keep the first.
                shifted = shifted.drop_duplicates(subset=["Page", "Date"], keep="first")
                shifted = shifted.rename(columns={"Visits": feature})
                target_train = target_train.merge(shifted, on=["Page", "Date"], how="left")
                target_train[feature] = target_train[feature].fillna(0).astype(int)
                target_test = target_test.merge(shifted, on=["Page", "Date"], how="left")
                target_test[feature] = target_test[feature].fillna(0).astype(int)
    return target_train, target_test

In [7]:
# `train` is both the history source and the first target, so every training
# row receives the prev_Visits_* columns computed from the same frame.
train, test = create_features_last_year(train, train, test)

In [8]:
# Model training window: the two months ending on `default_month`
# (both endpoints included).
default_month = '2017-05-10'
new_train = train.loc[train["Date"].between(pd.to_datetime(default_month) - relativedelta(months=2),
                                            pd.to_datetime(default_month))]

In [9]:
# Sanity check of the date spans; str(timestamp)[:10] keeps only YYYY-MM-DD.
print("Train:", str(new_train["Date"].min())[:10], "–", str(new_train["Date"].max())[:10],
          "    Test:", str(test["Date"].min())[:10], "–", str(test["Date"].max())[:10])

Train: 2017-03-10 – 2017-05-10     Test: 2017-06-10 – 2017-08-10


In [10]:
import lightgbm as lgb
# LightGBM training parameters, passed unchanged to lgb.train().
param = {
    'application': 'regression_l2',
    'learning_rate': 0.1,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'max_depth': 5,
    'num_threads': 4,
    "verbose": 0,
}

from numba import jit
import math

@jit
def smape_fast(y_true, y_pred):
    """SMAPE (0-200 scale) over two equally long 1-D arrays.

    Predictions below 1 are treated as 0 before scoring. Positions where the
    actual and the adjusted prediction sum to zero add nothing to the total,
    but still count in the averaging denominator.
    """
    n_obs = y_true.shape[0]
    total = 0
    for idx in range(n_obs):
        actual = y_true[idx]
        forecast = y_pred[idx]
        # Forecasts of less than one visit are scored as zero.
        if forecast < 1:
            forecast = 0
        denom = actual + forecast
        if denom != 0:
            total += math.fabs(actual - forecast) / denom
    return total * (200.0 / n_obs)

def lgb_smape(preds, df):
    """Custom evaluation callback returning SMAPE on the original scale.

    Labels (read through ``df.get_label()``) and predictions are both mapped
    through expm1 before being scored with ``smape_fast``.

    Returns the tuple (metric name, metric value, is_higher_better=False).
    """
    actual = np.expm1(np.array(df.get_label()))
    forecast = np.expm1(np.array(preds))
    return 'smape', smape_fast(actual, forecast), False

In [11]:
def create_lag_features(df, target, lags):
    temp = pd.pivot_table(df.loc[(df["Date"] > target["Date"].min() - relativedelta(days=8+lags))&
                                 (df["Date"] < target["Date"].min() - relativedelta(days=8))], 
                          index=["Page"], values=["Visits"], columns=["Date"]).reset_index()
    temp.columns = ["lag_{}".format(i)  if j[1] != "" else j[0] for i, j in enumerate(temp.columns)]
    target = target.merge(temp, how='left')
    return target

def create_agg_features(df, target, lags, columns, name):
    """Summarise historical visits per group over a window before ``target``.

    With d0 the earliest date in ``target``, the rows of ``df`` used are those
    with ``d0 - (2 months + 8 + lags days) < Date < d0 - 8 days``. They are
    grouped by ``columns`` and "Visits" is reduced to median, mean, std, min
    and max.

    Parameters
    ----------
    df : DataFrame with "Date", "Visits" and the grouping columns.
    target : DataFrame whose minimum "Date" anchors the window.
    lags : int, extra days of look-back; also appended to the output names.
    columns : list of column names to group by.
    name : str inserted between the statistic and ``lags`` in each output name.

    Returns
    -------
    DataFrame with the grouping columns followed by ``<stat><name><lags>``.
    """
    stats = ["median", "mean", "std", "min", "max"]
    first_target_day = target["Date"].min()
    lower = first_target_day - relativedelta(months=2, days=8+lags)
    upper = first_target_day - relativedelta(days=8)
    history = df.loc[(df["Date"] > lower) & (df["Date"] < upper)]
    summary = history.groupby(columns)["Visits"].agg(stats).reset_index()
    summary.columns = columns + [stat + name + str(lags) for stat in stats]
    return summary

def creating_features(df, target):
    """Build the model feature frame for ``target`` from the history in ``df``.

    Adds, in order: per-(Page, DayOfWeek) aggregates, per-Page aggregates,
    per-Page daily lag columns, and finally recodes "Month" to 0, 1, 2, ...
    in order of first appearance.

    Parameters
    ----------
    df : long-format history with "Page", "Date", "DayOfWeek", "Visits".
    target : frame to enrich; needs "Page", "Date", "DayOfWeek", "Month".

    Returns
    -------
    A new DataFrame; ``target`` itself is not modified.
    """
    temp = target.merge(create_agg_features(df, target, 10, ["Page", "DayOfWeek"], "_p_d_"), how='left', on=["Page", "DayOfWeek"])
    temp = temp.merge(create_agg_features(df, target, 10, ["Page"], "_p_"), how='left', on=["Page"])
    # Lag features come from the `df` argument, not the notebook-level `train`,
    # so the function honours whatever history the caller passes.
    temp = create_lag_features(df, temp, 20)
    # Recode months by order of first appearance in a single pass. Replacing
    # values one at a time can collide (months [11, 12, 1] would end up as
    # 0, 2, 2), and an in-place replace on a column selection is not
    # guaranteed to write back to the frame.
    month_codes = {month: code for code, month in enumerate(temp["Month"].unique())}
    temp["Month"] = temp["Month"].map(month_codes)
    return temp

In [12]:
# TRAIN
# create aggregated features
# Both frames draw their history from the full `train` table; the helper
# anchors its look-back windows on the earliest date of the frame it is given.
new_train = creating_features(train, new_train)
test = creating_features(train, test)

# apply log to all numeric features
# Columns are selected by position: the label plus everything from index 7 on.
# NOTE(review): this relies on the first seven columns being the id/calendar
# columns -- confirm if the column order of new_train ever changes.
numeric_features = ["Visits"] + list(new_train.columns[7:])
new_train[numeric_features] = np.log1p(new_train[numeric_features])
# numeric_features[0] is the label "Visits", which is skipped for the test frame
test[numeric_features[1:]] = np.log1p(test[numeric_features[1:]])

# preparing data for the model
# Features are every column from position 4 on; the label is the log1p'd "Visits".
train_features = list(new_train.columns[4:])
lgb_train = lgb.Dataset(new_train[train_features], label=new_train["Visits"], free_raw_data=False)

In [69]:
# NOTE(review): the execution count of this cell (69) is out of sequence, and
# it repeats inline what creating_features() does, with two differences: the
# look-back is 2 months + 30 days (the helper uses 2 months + 18 days), and the
# aggregate columns keep their bare names (median, mean, ...) rather than
# getting the "_p_d_"/"_p_" suffixes. Presumably a stale earlier draft --
# confirm before running, because it merges into the same new_train/test frames.
# TRAIN
# create aggregated features
new_train = new_train.merge(train.loc[(train["Date"] > new_train["Date"].min() - relativedelta(months=2, days=30))&
                                      (train["Date"] < new_train["Date"].min() - relativedelta(days=8))].groupby(["Page", "DayOfWeek"])["Visits"].agg(["median", "mean", "std", "min", "max"]).reset_index(), how='left', on=["Page", "DayOfWeek"])
new_train = new_train.merge(train.loc[(train["Date"] > new_train["Date"].min() - relativedelta(months=2, days=30))&
                                      (train["Date"] < new_train["Date"].min() - relativedelta(days=8))].groupby(["Page"])["Visits"].agg(["median", "mean", "std", "min", "max"]).reset_index(), how='left', on=["Page"])
# create lag features
new_train = create_lag_features(train, new_train, 20)
# months from 0 to 2
for i, j in enumerate(new_train["Month"].unique()):
    new_train["Month"].replace(j, i, inplace=True)

# VAL
# create aggregated features (same steps as above, anchored on the test dates)
test = test.merge(train.loc[(train["Date"] > test["Date"].min() - relativedelta(months=2, days=30))&
                            (train["Date"] < test["Date"].min() - relativedelta(days=8))].groupby(["Page", "DayOfWeek"])["Visits"].agg(["median", "mean", "std", "min", "max"]).reset_index(), how='left', on=["Page", "DayOfWeek"])
test = test.merge(train.loc[(train["Date"] > test["Date"].min() - relativedelta(months=2, days=30))&
                            (train["Date"] < test["Date"].min() - relativedelta(days=8))].groupby(["Page"])["Visits"].agg(["median", "mean", "std", "min", "max"]).reset_index(), how='left', on=["Page"])
# create lag features
test = create_lag_features(train, test, 20)
# months from 0 to 2
for i, j in enumerate(test["Month"].unique()):
    test["Month"].replace(j, i, inplace=True)

In [71]:
# NOTE(review): this cell (execution count 71) performs the same log-transform
# and Dataset construction as the cell that calls creating_features(). If both
# are run on the same frames, log1p is applied twice -- confirm which one is
# meant to be kept.
# apply log to all numeric features
# Columns from position 7 on are treated as numeric features; the label is added for train only.
numeric_features = list(new_train.columns[7:])
new_train[["Visits"] + numeric_features] = np.log1p(new_train[["Visits"] + numeric_features])
test[numeric_features] = np.log1p(test[numeric_features])

# preparing data for the model
train_features = list(new_train.columns[4:])
lgb_train = lgb.Dataset(new_train[train_features], label=new_train["Visits"], free_raw_data=False)

In [19]:
# Fit the booster on the log-scale label; the third positional argument (30)
# is the number of boosting rounds.
# NOTE(review): no validation set is passed here, so the custom `feval`
# presumably has nothing to be evaluated on -- confirm whether it is needed.
model = lgb.train(param, lgb_train, 30, feval=lgb_smape, verbose_eval=0)

In [20]:
# Predict on the log1p scale, then map back to visit counts.
test["log_Visits"] = model.predict(test[train_features])
test["Visits"] = np.expm1(test["log_Visits"])
# Forecasts of less than one visit are floored to zero, mirroring smape_fast.
test["Visits"] = test["Visits"].mask(test["Visits"] < 1, 0)

In [21]:
# Print each feature name next to its importance score from the fitted model.
for i,j in zip(train_features, model.feature_importance()):
    print(i, j)

Month 24
Day 17
DayOfWeek 11
prev_Visits_1_0_-1 56
prev_Visits_1_0_0 37
prev_Visits_1_0_1 30
median_p_d_10 60
mean_p_d_10 14
std_p_d_10 2
min_p_d_10 43
max_p_d_10 1
median_p_10 47
mean_p_10 34
std_p_10 25
min_p_10 11
max_p_10 23
lag_1 19
lag_2 14
lag_3 7
lag_4 15
lag_5 13
lag_6 32
lag_7 26
lag_8 10
lag_9 14
lag_10 13
lag_11 17
lag_12 32
lag_13 50
lag_14 28
lag_15 19
lag_16 25
lag_17 37
lag_18 55
lag_19 39


In [22]:
def smape(y_true, y_pred):
    """Symmetric mean absolute percentage error on a 0-200 scale.

    Both arguments must support element-wise arithmetic and boolean-mask
    assignment (NumPy arrays or pandas Series). Positions where the actual
    and the prediction are both zero contribute an error of 0.
    """
    abs_error = np.abs(y_true - y_pred)
    scale = (np.abs(y_true) + np.abs(y_pred)) / 200.0
    pct_error = abs_error / scale
    # 0/0 positions come out undefined; define them as a perfect score.
    pct_error[scale == 0] = 0.0
    return np.mean(pct_error)

In [23]:
# Ground-truth visit counts, used below to score the predictions.
true_answers = pd.read_csv("../data/test_new_data.csv")

In [24]:
# Peek at the first rows to check the columns of the answers file.
true_answers.head()

Unnamed: 0,Id,Visits
0,0_2017-06-10,37.0
1,1_2017-06-10,30.0
2,2_2017-06-10,6.0
3,3_2017-06-10,15.0
4,4_2017-06-10,22.0


In [25]:
# Align truth and predictions on "Id". Both frames have a "Visits" column, so
# after the merge "Visits_x" is the true count and "Visits_y" the model forecast.
temp = true_answers.merge(test[["Id", "Visits", "median_p_d_10"]], how='inner', on=["Id"])

In [26]:
# Baseline: score the per-(Page, DayOfWeek) median on its own. The column was
# log1p-transformed earlier, so expm1 brings it back to visit counts.
smape(temp["Visits_x"], np.expm1(temp["median_p_d_10"]))

42.949620114956176

In [29]:
# Model score: true counts against the LightGBM forecast.
smape(temp["Visits_x"], temp["Visits_y"])

38.476878112844801