In [1]:
import pandas as pd
import numpy as np
import gc

In [2]:
train = pd.read_csv("../data/new_train.csv", parse_dates=["date"], dtype={"Page": str, "Visits": 'float32'})
train = train.loc[(train["date"] >= '2016-05-10') & (train["date"] <= '2016-08-31')]
train.fillna(0, inplace=True)
test = pd.read_csv("../data/new_test.csv", parse_dates=["date"], dtype={"Page": str, "Visits": 'float32'})
test.fillna(0, inplace=True)
train["Visits"] = train["Visits"].astype("int32")
test["Visits"] = test["Visits"].astype("int32")

In [3]:
def pandas_smape(df):
    df.fillna(0, inplace=True)
    df["SMAPE"] = 200 * np.abs(df["Visits"] - df["pred_Visits"]) / (df["Visits"] + df["pred_Visits"])
    df["SMAPE"].fillna(0, inplace=True)
    return np.mean(df["SMAPE"])

## Last day baseline

In [4]:
baseline_df = train[train["date"] == '2016-08-31'].copy()
baseline_df.rename(columns={"Visits": "pred_Visits"}, inplace=True)
baseline_df.drop("date", axis=1, inplace=True)

In [5]:
new_test = test.merge(baseline_df, on="Page", how='left').copy()

In [6]:
pandas_smape(new_test)

54.438516144828704

In [7]:
gc.collect()

7

## Median baseline

In [8]:
train["weekend"] = ((train["date"].dt.dayofweek) // 5 == 1).astype(int)
test["weekend"] = ((test["date"].dt.dayofweek) // 5 == 1).astype(int)
new_test = test.copy()
baseline_df = train.loc[train["date"] >= '2016-08-01'].groupby(['Page','weekend']).median().reset_index()
baseline_df.rename(columns={"Visits": "pred_Visits"}, inplace=True)

In [9]:
new_test = new_test.merge(baseline_df, on=["Page", "weekend"], how='left')

In [10]:
pandas_smape(new_test)

50.539351281591962

In [11]:
gc.collect()

63

## Linear baseline

In [48]:
from sklearn.linear_model import LinearRegression

In [49]:
val = train.loc[(train["date"] >= '2016-06-10') & (train["date"] <= '2016-08-10')]
new_train = train.loc[train["date"] < '2016-06-10']

In [50]:
new_train = pd.pivot_table(new_train,index=["Page"], values=["Visits"], columns=["date"]).reset_index()
new_train.columns = ["lag_{}".format(i)  if j[1] != "" else j[0] for i, j in enumerate(new_train.columns)]

In [51]:
new_train = val.merge(new_train, on="Page", how='left')

In [52]:
new_train['Month']      = new_train["date"].dt.month - 5
new_train['Day']        = new_train["date"].dt.day
new_train['DayOfWeek']  = new_train["date"].dt.dayofweek

In [53]:
lr = LinearRegression(n_jobs=-1)
lr.fit(new_train[new_train.columns[3:]], new_train["Visits"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [54]:
new_test = train.loc[(train["date"] >= '2016-07-10') & (train["date"] < '2016-08-10')]
new_test = pd.pivot_table(new_test,index=["Page"], values=["Visits"], columns=["date"]).reset_index()
new_test.columns = ["lag_{}".format(i)  if j[1] != "" else j[0] for i, j in enumerate(new_test.columns)]
new_test = test.merge(new_test, on="Page", how='left')

new_test['Month']     = new_test["date"].dt.month - 8
new_test['Day']       = new_test["date"].dt.day
new_test['DayOfWeek'] = new_test["date"].dt.dayofweek

In [55]:
new_test["pred_Visits"] = lr.predict(new_test[new_test.columns[3:]])
gc.collect()

346

In [56]:
new_test.loc[new_test["pred_Visits"] < 3, "pred_Visits"] = 0
pandas_smape(new_test)

144.75810464169882

## Linear model with OHE date features

In [60]:
lr = LinearRegression(n_jobs=-1)
lr.fit(pd.get_dummies(new_train[new_train.columns[3:]],columns=["Month", "Day", "DayOfWeek"]), new_train["Visits"])

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [61]:
new_test = train.loc[(train["date"] >= '2016-07-10') & (train["date"] < '2016-08-10')]
new_test = pd.pivot_table(new_test,index=["Page"], values=["Visits"], columns=["date"]).reset_index()
new_test.columns = ["lag_{}".format(i)  if j[1] != "" else j[0] for i, j in enumerate(new_test.columns)]
new_test = test.merge(new_test, on="Page", how='left')

new_test['Month']     = new_test["date"].dt.month - 8
new_test['Day']       = new_test["date"].dt.day
new_test['DayOfWeek'] = new_test["date"].dt.dayofweek
new_test["pred_Visits"] = lr.predict(pd.get_dummies(new_test[new_test.columns[3:]],columns=["Month", "Day", "DayOfWeek"]))
gc.collect()

1234

In [62]:
new_test.loc[new_test["pred_Visits"] < 3, "pred_Visits"] = 0
pandas_smape(new_test)

145.8104157943682