# Library

In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import os

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Scoring functions

In [None]:
def smape(y_true, y_pred):
    """
    Symmetric mean absolute percentage error, expressed on a 0-200 scale.

    Each term is ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``; a term
    whose denominator is zero (both values are zero) counts as 0.

    Parameters
    ----------
    y_true, y_pred : scalar or array-like of numbers
        Observed and predicted values. They are converted to float arrays and
        compared element by element (by position, not by pandas index label),
        following NumPy broadcasting rules.

    Returns
    -------
    float
        ``100 * mean(terms)``. A NaN in either input propagates to the result.
    """
    # atleast_1d lets plain scalars through the same array code path.
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the remaining slots keep
    # the 0.0 they were initialised with, so no 0/0 warning is raised.
    ratio = np.divide(
        diff, denominator, out=np.zeros_like(diff), where=denominator != 0)
    return 100 * np.mean(ratio)


def smape_serie(x):
    """
    Score one object carrying both predictions and observations.

    Reads the prediction from ``x.Visits`` and the observation from
    ``x.value`` and returns their SMAPE.
    """
    predicted = x.Visits
    observed = x.value
    return smape(observed, predicted)


# Helper functions

In [None]:
def create_train():
    """
    Load the training data as a date-indexed frame, caching it as a pickle.

    If ``../data/work/train.pickle`` exists it is returned as is. Otherwise
    ``../data/input/train_2.csv`` is read, ``Page`` becomes the index and the
    frame is transposed, so rows are the former column labels (parsed as
    ``%Y-%m-%d`` dates) and columns are pages. The result is then written to
    the cache, creating the cache directory when it is missing.

    Returns
    -------
    pandas.DataFrame
    """
    cache_path = "../data/work/train.pickle"
    # NOTE: unpickling can execute arbitrary code; only load a cache that was
    # produced locally by this function.
    if os.path.isfile(cache_path):
        return pd.read_pickle(cache_path)

    data = pd.read_csv('../data/input/train_2.csv')
    data["Page"] = data["Page"].astype(str)
    data = data.set_index("Page").T
    data.index = pd.to_datetime(data.index, format="%Y-%m-%d")

    # Make sure the work directory exists so the parse is not lost on a
    # fresh checkout.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    data.to_pickle(cache_path)
    return data


def create_test():
    """
    Load the submission key as a frame with ``Page`` and ``date`` split apart.

    If ``../data/work/test.pickle`` exists it is returned as is. Otherwise
    ``../data/input/key_2.csv`` is read and its ``Page`` column is split: the
    last 10 characters are parsed as a ``%Y-%m-%d`` date into ``date``, and
    ``Page`` keeps everything except the last 11 characters (the date plus the
    single separator character before it). The result is then written to the
    cache, creating the cache directory when it is missing.

    Returns
    -------
    pandas.DataFrame
    """
    cache_path = "../data/work/test.pickle"
    # NOTE: unpickling can execute arbitrary code; only load a cache that was
    # produced locally by this function.
    if os.path.isfile(cache_path):
        return pd.read_pickle(cache_path)

    df_test = pd.read_csv("../data/input/key_2.csv")
    # Extract the date before truncating Page, since both read the same column.
    df_test['date'] = pd.to_datetime(
        df_test['Page'].str[-10:], format="%Y-%m-%d")
    df_test['Page'] = df_test['Page'].str[:-11]

    # Make sure the work directory exists so the parse is not lost on a
    # fresh checkout.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    df_test.to_pickle(cache_path)
    return df_test

# Read data

In [None]:
# Load the training frame (served from the pickle cache when it exists).
data = create_train()
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
data.info()
data.head()

# Train / Test

In [None]:
## Hold out the last 60 rows as a test set so the models can be scored
train, test = data.iloc[:-60], data.iloc[-60:]
for frame in (train, test, data):
    print(frame.shape)

# Median model

## Test

Median model combining the medians of several window sizes

In [None]:
def add_is_weekend(df):
    return df.assign(is_weekend=lambda x: x.index.dayofweek.isin([0, 1]))

In [None]:
from functools import reduce

Windows = [6, 12, 18, 30, 48, 78, 126, 203, 329]
train = add_is_weekend(train.iloc[-329:])
median_model = []
for x in reversed(Windows):
    median_model.append(train.iloc[-x:].groupby("is_weekend").median().stack())

median_model = reduce(
    lambda x, y: pd.merge(x, y, on=['is_weekend', "Page"], how="outer"),
    [x.reset_index()
     for x in median_model]).set_index(["is_weekend", "Page"]).median(
         axis=1).reset_index().rename(columns={0: "Visits"})

median_model = pd.merge(
    add_is_weekend(test).reset_index(drop=True).melt(
        id_vars="is_weekend").dropna(),
    median_model,
    on=["Page", "is_weekend"],
    how="outer").fillna(0)

print("SMAPE is : ")
print(smape(y_pred=median_model.Visits, y_true=median_model.value))

## Submission

In [None]:
df = add_is_weekend(data.iloc[-329:])
median_model_submission = []
for x in reversed(Windows):
    median_model_submission.append(
        df.iloc[-x:].groupby("is_weekend").median().stack())

median_model_submission = reduce(
    lambda x, y: pd.merge(x, y, on=['is_weekend', "Page"], how="outer"), [
        x.reset_index() for x in median_model_submission
    ]).set_index(["is_weekend", "Page"]).median(axis=1).reset_index().rename(
        columns={0: "Visits"})

df_test = create_test()
df_test["is_weekend"] = df_test.date.dt.dayofweek.isin([0, 1])

df_submit = pd.merge(
    df_test, median_model_submission, on=["Page", "is_weekend"],
    how="outer")[["Id", "Visits"]].fillna(0)
df_submit.to_csv(
    "../data/submission/submission_median_weekend_multiple_windows.csv", index=False)
df_submit.head()