# Library

In [1]:
import pandas as pd
import numpy as np
import re
import pickle
import os

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from fbprophet import Prophet
from joblib import Parallel, delayed
import multiprocessing

In [2]:
def temp_func(func, name, group):
    return func(group), name


def applyParallel(dfGrouped, func):
    retLst, top_index = zip(
        *Parallel(n_jobs=multiprocessing.cpu_count()-1)(delayed(temp_func)(
            func, name, group) for name, group in dfGrouped))
    return pd.concat(retLst, keys=top_index)

# Scoring functions

In [3]:
def smape(y_true, y_pred):
    """
    Scoring function
    """
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0.0
    return 100 * np.mean(diff)


def smape_serie(x):
    """
    Scoring function on serie
    """
    return smape(y_pred=x.Visits, y_true=x.value)


# Helping functions

In [4]:
def create_train():
    if os.path.isfile("../data/work/train.pickle"):
        data = pd.read_pickle("../data/work/train.pickle")
    else:
        data = pd.read_csv('../data/input/train_2.csv')
        cols = data.columns[data.columns.str.contains("-")].tolist()
        data["Page"] = data["Page"].astype(str)
        data = data.set_index("Page").T
        data.index = pd.to_datetime(data.index, format="%Y-%m-%d")
        data.to_pickle("../data/work/train.pickle")
    return data


def create_test():
    if os.path.isfile("../data/work/test.pickle"):
        df_test = pd.read_pickle("../data/work/test.pickle")
    else:
        df_test = pd.read_csv("../data/input/key_2.csv")
        df_test['date'] = df_test.Page.apply(lambda a: a[-10:])
        df_test['Page'] = df_test.Page.apply(lambda a: a[:-11])
        df_test['date'] = pd.to_datetime(df_test['date'], format="%Y-%m-%d")
        df_test.to_pickle("../data/work/test.pickle")
    return df_test

# Read data

In [5]:
data = create_train()
print(data.info())
data.head()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 793 entries, 2015-07-01 to 2017-08-31
Columns: 145063 entries, 2NE1_zh.wikipedia.org_all-access_spider to Francisco_el_matemÃ¡tico_(serie_de_televisiÃ³n_de_2017)_es.wikipedia.org_all-access_spider
dtypes: float64(145063)
memory usage: 877.7 MB
None


Page,2NE1_zh.wikipedia.org_all-access_spider,2PM_zh.wikipedia.org_all-access_spider,3C_zh.wikipedia.org_all-access_spider,4minute_zh.wikipedia.org_all-access_spider,52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider,5566_zh.wikipedia.org_all-access_spider,91Days_zh.wikipedia.org_all-access_spider,A'N'D_zh.wikipedia.org_all-access_spider,AKB48_zh.wikipedia.org_all-access_spider,ASCII_zh.wikipedia.org_all-access_spider,...,Drake_(mÃºsico)_es.wikipedia.org_all-access_spider,Skam_(serie_de_televisiÃ³n)_es.wikipedia.org_all-access_spider,LegiÃ³n_(serie_de_televisiÃ³n)_es.wikipedia.org_all-access_spider,Doble_tentaciÃ³n_es.wikipedia.org_all-access_spider,Mi_adorable_maldiciÃ³n_es.wikipedia.org_all-access_spider,Underworld_(serie_de_pelÃ­culas)_es.wikipedia.org_all-access_spider,Resident_Evil:_CapÃ­tulo_Final_es.wikipedia.org_all-access_spider,EnamorÃ¡ndome_de_RamÃ³n_es.wikipedia.org_all-access_spider,Hasta_el_Ãºltimo_hombre_es.wikipedia.org_all-access_spider,Francisco_el_matemÃ¡tico_(serie_de_televisiÃ³n_de_2017)_es.wikipedia.org_all-access_spider
2015-07-01,18.0,11.0,1.0,35.0,,12.0,,118.0,5.0,6.0,...,,,,,,,,,,
2015-07-02,11.0,14.0,0.0,13.0,,7.0,,26.0,23.0,3.0,...,,,,,,,,,,
2015-07-03,5.0,15.0,1.0,10.0,,4.0,,30.0,14.0,5.0,...,,,,,,,,,,
2015-07-04,13.0,18.0,1.0,94.0,,5.0,,24.0,12.0,12.0,...,,,,,,,,,,
2015-07-05,14.0,11.0,0.0,4.0,,20.0,,29.0,9.0,6.0,...,,,,,,,,,,


# Train / Test

In [6]:
## Split in train / test to evaluate scoring 
train = data.iloc[:-60]
test = data.iloc[-60:]
print(train.shape)
print(test.shape)
print(data.shape)

(733, 145063)
(60, 145063)
(793, 145063)


# Prophet

In [7]:
def prophet_forecast(df):
    return Prophet(
        yearly_seasonality=False,
        daily_seasonality=False,
        weekly_seasonality="auto",
        seasonality_prior_scale=5,
        changepoint_prior_scale=0.5).fit(df.dropna()).predict(df_predict)[[
            "ds", "yhat"
        ]]

## Test

In [8]:
df_predict = pd.DataFrame({"ds": test.index})
df_predict.head()

Unnamed: 0,ds
0,2017-07-03
1,2017-07-04
2,2017-07-05
3,2017-07-06
4,2017-07-07


In [9]:
# page_sample = train.columns[np.random.randint(0, len(train.columns), 10)]

# train_sample = train[page_sample].reset_index().rename(
#     columns={"index": "ds"}).melt(id_vars="ds").rename(columns={"value":
#                                                                 "y"}).dropna()
# test_sample = test[page_sample]

# train_sample.head()

Unnamed: 0,ds,Page,y
0,2015-07-01,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,84.0
1,2015-07-02,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,53.0
2,2015-07-03,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,54.0
3,2015-07-04,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,67.0
4,2015-07-05,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,84.0


In [10]:
forecast = applyParallel(train.groupby("Page"),
                         prophet_forecast).reset_index().rename(
                             columns={"level_0": "Page"}).drop(
                                 "level_1", axis=1)
forecast.head()

Unnamed: 0,Page,ds,yhat
0,ConstituciÃ³n_de_la_NaciÃ³n_Argentina_es.wikip...,2017-07-03,1717.25328
1,ConstituciÃ³n_de_la_NaciÃ³n_Argentina_es.wikip...,2017-07-04,1025.85255
2,ConstituciÃ³n_de_la_NaciÃ³n_Argentina_es.wikip...,2017-07-05,1026.109014
3,ConstituciÃ³n_de_la_NaciÃ³n_Argentina_es.wikip...,2017-07-06,973.858243
4,ConstituciÃ³n_de_la_NaciÃ³n_Argentina_es.wikip...,2017-07-07,714.02224


In [11]:
forecast = pd.merge(
    test_sample.reset_index().rename(columns={"index": "ds"}).melt(
        id_vars="ds"),
    forecast,
    on=["ds", "Page"],
    how="inner")
forecast.head()

Unnamed: 0,ds,Page,value,yhat
0,2017-07-03,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,128.0,113.503208
1,2017-07-04,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,109.0,94.62018
2,2017-07-05,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,136.0,84.884688
3,2017-07-06,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,136.0,84.399169
4,2017-07-07,é»åç«_(é¦æ¸¯)_zh.wikipedia.org_all-access...,162.0,97.387656


In [12]:
print("SMAPE is : ")
print(smape(y_pred=forecast["value"], y_true=forecast["yhat"]))

SMAPE is : 
81.98447484169223
