In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import datetime
from fbprophet import Prophet

Importing plotly failed. Interactive plots will not work.


In [44]:
df = pd.read_pickle('../../data/time_ecom/dfcatparent.pkl', compression='zip')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23838 entries, 1465072 to 1454084
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   timestamp      23838 non-null  datetime64[ns]
 1   visitorid      23838 non-null  int64         
 2   event          23838 non-null  object        
 3   itemid         23838 non-null  int64         
 4   transactionid  23838 non-null  float64       
 5   category       23838 non-null  int32         
 6   parent         23838 non-null  int32         
dtypes: datetime64[ns](1), float64(1), int32(2), int64(2), object(1)
memory usage: 1.3+ MB


In [45]:
# for converting to daily frequency
def parent2day(df, select_int, par=True, fb=False):
    """takes DF 23838 with category and parent columns
    outputs daily purchases DF
    select_int is the desired category or parent to be selected
    par=True filters the parent column  //  par=False filters the category column
    fb option toggles 'ds' & 'y' output format for FBprophet
    """
    copy = df.copy()
    # rename for working with FB prophet
    copy.rename(columns={'event':'y'}, inplace=True)
    copy.rename(columns={'timestamp':'ds'}, inplace=True)
    # filter parent
    if par:
        copy = copy[copy['parent'] == select_int].sort_values('ds')
    # filter category
    else:
        copy = copy[copy['category'] == select_int].sort_values('ds')
    # select columns for easier AGG
    copy = copy[['ds', 'y', 'category', 'parent']]
    # resample to daily frequency and count transactions
    dfday = copy.resample('D', on="ds").count() 
    # make and join 139 df to pad missing head and tail
    dr = pd.date_range(start='2015-05-02', end='2015-09-17', freq='D')
    dfmake = pd.DataFrame(index=dr)
    dfmake = dfmake.join(dfday)
    # reset category/parent to be correct
    if par:
        dfmake['parent'] = select_int
    else:
        dfmake['category'] = select_int
    dfmake = dfmake.fillna(0)
    dfmake = dfmake.astype('int32')  #  the join made values floats
    # make ds, y format for FB
    if fb:  
        dfmake = dfmake[['y']].reset_index().rename(columns={'index': 'ds'})
    # remove partial days ( from 139 rows to 137 )
    dfmake = dfmake.iloc[1:-1]
    return dfmake.copy()

In [46]:
def train_prophet(df):
    m = Prophet(changepoint_prior_scale=0.05)
    return m.fit(df)

def predict_horizon(m, horizon=28):
    # dffuture has 'ds' column only
    dffuture = m.make_future_dataframe(periods=horizon)
    dfforecast = m.predict(dffuture)
    """ fb forecast columns
        ['ds', 'trend', 'yhat_lower', 'yhat_upper', 'trend_lower',
       'trend_upper', 'additive_terms', 'additive_terms_lower',
       'additive_terms_upper', 'weekly', 'weekly_lower', 'weekly_upper',
       'multiplicative_terms', 'multiplicative_terms_lower',
       'multiplicative_terms_upper', 'yhat']"""
    y_hat = dfforecast.iloc[-horizon: , -1].values
    return y_hat

def mape(y_true, y_pred):
    # mean absolute percent error
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100.

In [47]:
def daily_df_dict(df, id_array):
    df_dict = {}
    for parent in id_array:
        df_dict[parent] = parent2day(df, parent, par=True, fb=True)
    return df_dict

def shf_error(train_dict, shf_period=-28):
    error_dict = {}
    for parent, df in train_dict.items():
        y_true = df.iloc[shf_period, -1]
        m = train_prophet(df[shf_period:])
        dftime = df[['ds']]
        forecast = m.predict(dftime)
        y_hat = forecast.iloc[shf_period, -1]
        error_dict[parent] = mape(y_true, y_hat)
    return error_dict

In [34]:
topparent = np.array([561, 955, 105, 500, 1095, 805])
df['parent'].nunique()  # 236 
parent_counts = df['parent'].value_counts()

In [35]:
parent_counts[parent_counts > 200].size

37

In [50]:
train_dict = daily_df_dict(df[:-28], topparent)

In [51]:
print(len(train_dict))
train_dict[500][-28:].shape

6


(28, 2)

In [None]:
first_errors = shf_error(train_dict, shf_period=-28)
for parent, error in first_errors.items():
    print(parent, "parent  :  mape error", round(error, 1))
# 561 parent  :  mape error 711.4302802002786
# 955 parent  :  mape error 70.49148698276977
# 105 parent  :  mape error 17.923334238004994
# 500 parent  :  mape error 19.175125123610925
# 1095 parent  :  mape error 165.30957096914483
# 805 parent  :  mape error 51.27614835005032

In [None]:
# master insight function (daily_df, horizon=28)
# { parent : {model: m, } }

In [None]:
# startup examples
Class PurchaseForecast(object):
    
    def __init__(self, modelFile = None):
        self.model = Word2Vec.load(modelFile)
        
    def fit(self, X):
        self.max_size = int(len(max(X, key=len)) * .25)
        
    def _word2idx(self, word):
        if word in self.model.wv.vocab:
            return self.model.wv.vocab[word].index
        return 0