# Library

In [1]:
import os
import pickle
import re
from functools import reduce

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

# Scoring functions

In [2]:
def smape(y_true, y_pred):
    """
    Symmetric mean absolute percentage error, on a 0-200 scale.

    Each element contributes |y_true - y_pred| / ((|y_true| + |y_pred|) / 2);
    elements where both values are zero contribute 0 instead of 0/0.

    Parameters
    ----------
    y_true, y_pred : scalar, sequence, numpy array or pandas object
        Observed and predicted values. pandas objects are used as-is, so
        their own alignment and mean semantics apply; anything else is
        converted to a numpy array of at least one dimension.

    Returns
    -------
    float
        100 * mean of the per-element terms.
    """
    # Plain lists and scalars support neither the element-wise arithmetic nor
    # the masked assignment below; pandas inputs are deliberately left alone.
    if not isinstance(y_true, (pd.Series, pd.DataFrame)):
        y_true = np.atleast_1d(np.asarray(y_true))
    if not isinstance(y_pred, (pd.Series, pd.DataFrame)):
        y_pred = np.atleast_1d(np.asarray(y_pred))

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    # 0/0 is expected here and patched on the next line, so keep numpy from
    # emitting a RuntimeWarning for it.
    with np.errstate(divide="ignore", invalid="ignore"):
        diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return 100 * np.mean(diff)


def smape_serie(x):
    """
    SMAPE of a single series-like record.

    `x` must expose a `Visits` attribute (used as the prediction) and a
    `value` attribute (used as the ground truth); both are forwarded to
    `smape`.
    """
    predicted, observed = x.Visits, x.value
    return smape(observed, predicted)


# Helper functions

In [3]:
def create_train():
    """
    Load the training matrix, building and caching it on first use.

    If ``../data/work/train.pickle`` exists it is returned as-is. Otherwise
    ``../data/input/train_2.csv`` is read, ``Page`` becomes the column axis,
    the remaining column labels are parsed as ``%Y-%m-%d`` dates to form the
    index, and the result is pickled for next time.

    Returns
    -------
    pandas.DataFrame
        One row per date, one column per page.
    """
    cache_path = "../data/work/train.pickle"
    if os.path.isfile(cache_path):
        # NOTE: unpickling runs arbitrary code; only load caches written by
        # this function.
        return pd.read_pickle(cache_path)

    data = pd.read_csv('../data/input/train_2.csv')
    data["Page"] = data["Page"].astype(str)
    data = data.set_index("Page").T
    data.index = pd.to_datetime(data.index, format="%Y-%m-%d")

    # The work directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    data.to_pickle(cache_path)
    return data


def create_test():
    """
    Load the prediction keys, building and caching them on first use.

    If ``../data/work/test.pickle`` exists it is returned as-is. Otherwise
    ``../data/input/key_2.csv`` is read and its ``Page`` column is split:
    the last 10 characters are parsed as a ``%Y-%m-%d`` date into ``date``
    and those characters plus the one separator before them are stripped
    from ``Page``. The result is pickled for next time.

    Returns
    -------
    pandas.DataFrame
        The key file's columns plus a datetime ``date`` column.
    """
    cache_path = "../data/work/test.pickle"
    if os.path.isfile(cache_path):
        # NOTE: unpickling runs arbitrary code; only load caches written by
        # this function.
        return pd.read_pickle(cache_path)

    df_test = pd.read_csv("../data/input/key_2.csv")
    # Vectorised string slicing instead of a Python-level apply per row.
    df_test["date"] = pd.to_datetime(df_test["Page"].str[-10:], format="%Y-%m-%d")
    df_test["Page"] = df_test["Page"].str[:-11]

    # The work directory may not exist yet on a fresh checkout.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    df_test.to_pickle(cache_path)
    return df_test

# Read data

In [4]:
data = create_train()
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the output.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 793 entries, 2015-07-01 to 2017-08-31
Columns: 145063 entries, 2NE1_zh.wikipedia.org_all-access_spider to Francisco_el_matemÃ¡tico_(serie_de_televisiÃ³n_de_2017)_es.wikipedia.org_all-access_spider
dtypes: float64(145063)
memory usage: 877.7 MB
None


Page,2NE1_zh.wikipedia.org_all-access_spider,2PM_zh.wikipedia.org_all-access_spider,3C_zh.wikipedia.org_all-access_spider,4minute_zh.wikipedia.org_all-access_spider,52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider,5566_zh.wikipedia.org_all-access_spider,91Days_zh.wikipedia.org_all-access_spider,A'N'D_zh.wikipedia.org_all-access_spider,AKB48_zh.wikipedia.org_all-access_spider,ASCII_zh.wikipedia.org_all-access_spider,...,Drake_(mÃºsico)_es.wikipedia.org_all-access_spider,Skam_(serie_de_televisiÃ³n)_es.wikipedia.org_all-access_spider,LegiÃ³n_(serie_de_televisiÃ³n)_es.wikipedia.org_all-access_spider,Doble_tentaciÃ³n_es.wikipedia.org_all-access_spider,Mi_adorable_maldiciÃ³n_es.wikipedia.org_all-access_spider,Underworld_(serie_de_pelÃ­culas)_es.wikipedia.org_all-access_spider,Resident_Evil:_CapÃ­tulo_Final_es.wikipedia.org_all-access_spider,EnamorÃ¡ndome_de_RamÃ³n_es.wikipedia.org_all-access_spider,Hasta_el_Ãºltimo_hombre_es.wikipedia.org_all-access_spider,Francisco_el_matemÃ¡tico_(serie_de_televisiÃ³n_de_2017)_es.wikipedia.org_all-access_spider
2015-07-01,18.0,11.0,1.0,35.0,,12.0,,118.0,5.0,6.0,...,,,,,,,,,,
2015-07-02,11.0,14.0,0.0,13.0,,7.0,,26.0,23.0,3.0,...,,,,,,,,,,
2015-07-03,5.0,15.0,1.0,10.0,,4.0,,30.0,14.0,5.0,...,,,,,,,,,,
2015-07-04,13.0,18.0,1.0,94.0,,5.0,,24.0,12.0,12.0,...,,,,,,,,,,
2015-07-05,14.0,11.0,0.0,4.0,,20.0,,29.0,9.0,6.0,...,,,,,,,,,,


# Train / Test

In [5]:
## Hold out the last 60 rows as the evaluation set; everything before is training data
train, test = data.iloc[:-60], data.iloc[-60:]
print(train.shape, test.shape, data.shape, sep="\n")

(733, 145063)
(60, 145063)
(793, 145063)


# Median model

## Test

Median model combining the medians of several look-back windows

In [6]:
def add_is_weekend(df, weekend_days=(0, 1)):
    """
    Return a copy of `df` with a boolean `is_weekend` column.

    The flag is True for rows whose index falls on one of `weekend_days`,
    compared against `df.index.dayofweek`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index exposes `dayofweek` (e.g. a DatetimeIndex).
    weekend_days : iterable of int, default (0, 1)
        Day-of-week codes to flag.

    NOTE(review): pandas numbers Monday as 0 and Sunday as 6, so the default
    (0, 1) flags Monday and Tuesday, not Saturday and Sunday. The default is
    kept so existing results do not change; pass ``weekend_days=(5, 6)`` for
    a true weekend flag, and make sure any code that builds the same flag
    separately (e.g. for the prediction keys) uses the same codes.
    """
    flagged_days = list(weekend_days)
    return df.assign(is_weekend=lambda x: x.index.dayofweek.isin(flagged_days))

In [7]:
# Look-back window lengths, counted in rows from the end of the training frame.
Windows = [6, 12, 18, 30, 48, 78, 126, 203, 329]

# Nothing below looks further back than the longest window.
train = add_is_weekend(train.iloc[-max(Windows):])

# One Series of medians per window, indexed by (is_weekend, Page).
window_medians = [
    train.iloc[-window:].groupby("is_weekend").median().stack()
    for window in reversed(Windows)
]

# Align the per-window medians on their shared index and take the median
# across windows. Chaining pd.merge over frames that all carry the same value
# column makes the suffixing generate repeated "0_x"/"0_y" labels, which
# recent pandas rejects with a MergeError; concat avoids the problem.
median_model = (
    pd.concat(window_medians, axis=1)
    .median(axis=1)
    .rename_axis(["is_weekend", "Page"])
    .rename("Visits")
    .reset_index()
)

# Long format of the held-out period: one row per observed (Page, day) value.
test_long = (
    add_is_weekend(test)
    .reset_index(drop=True)
    .melt(id_vars="is_weekend", var_name="Page")
    .dropna()
)

# Left join: score only values that were actually observed. An outer join
# would add model-only keys whose missing `value` is then turned into a fake
# observation of 0 by fillna. Keys without a model get a prediction of 0.
median_model = pd.merge(
    test_long,
    median_model,
    on=["Page", "is_weekend"],
    how="left").fillna(0)

print("SMAPE is : ")
print(smape(y_pred=median_model.Visits, y_true=median_model.value))

SMAPE is : 
39.733762183786276


## Submission

In [8]:
# Refit on the most recent rows of the full history (longest window only).
df = add_is_weekend(data.iloc[-max(Windows):])

# One Series of medians per window, indexed by (is_weekend, Page).
submission_window_medians = [
    df.iloc[-window:].groupby("is_weekend").median().stack()
    for window in reversed(Windows)
]

# Median across windows. concat aligns on the shared index; chaining pd.merge
# over frames with the same value column yields repeated "0_x"/"0_y" labels
# that recent pandas rejects with a MergeError.
median_model_submission = (
    pd.concat(submission_window_medians, axis=1)
    .median(axis=1)
    .rename_axis(["is_weekend", "Page"])
    .rename("Visits")
    .reset_index()
)

df_test = create_test()
# Build the flag with the same helper as the training frame so the two sides
# of the join can never disagree on which days are flagged.
df_test["is_weekend"] = add_is_weekend(
    df_test.set_index("date"))["is_weekend"].to_numpy()

# Left join: exactly one output row per requested Id. An outer join could add
# model-only keys whose missing Id would be written out as 0 after fillna.
df_submit = pd.merge(
    df_test, median_model_submission, on=["Page", "is_weekend"],
    how="left")[["Id", "Visits"]].fillna(0)

submission_path = "../data/submission/submission_median_weekend_multiple_windows.csv"
# The submission directory may not exist yet on a fresh checkout.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
df_submit.to_csv(submission_path, index=False)
df_submit.head()

Unnamed: 0,Id,Visits
0,0b293039387a,535.0
1,7114389dd824,535.0
2,057b02ff1f09,535.0
3,bd2aca21caa3,535.0
4,c0effb42cdd5,535.0
