In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import statsmodels as sm
import pmdarima as pm
from pmdarima.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

### Parameters

In [None]:
# Path of the semicolon-separated CSV export loaded by pd.read_csv below.
filename = './data/ML_IGE_ENTRANTS_MODELE1.csv'

# Column -> required value; filter_df keeps the rows matching every pair.
# Text columns are stripped and upper-cased before filtering, so values must be upper-case
# (the alternative below uses mixed case and would need upper-casing before use).
# myfilters = {'SITE':'VELIZY','FLUX_ACTIVITE':'Flux Prestation','SERVICE_ACTIVITE':'PRESTATION','MEDIA':'Courrier'}
myfilters = {'SITE':'VELIZY','FLUX_ACTIVITE':'FLUX PRESTATION','SERVICE_ACTIVITE':'PRESTATION'}

# Passed as test_size to train_test_split.
testsize = 0.05

# Pandas frequency alias used when resampling the aggregated series.
# NOTE(review): the auto_arima (m=5) and Holt-Winters (seasonal_periods=5) cells hard-code
# the seasonal period — confirm it is still appropriate when scale is not 'B'.
scale='B' # 'B','W','SM'

### Import csv

In [None]:
# Load the raw export. DATEDATA is read as an ordinary column here: the
# index_col/parse_dates variant at the end of the line is disabled.
df = pd.read_csv(filename,sep=';',engine='python') #,index_col='DATEDATA',parse_dates=True)

In [None]:
# Preview the first rows only instead of rendering the whole frame.
df.head()

### Preprocess columns

In [None]:
def _strip_and_upper(column):
    """Trim surrounding whitespace and upper-case an object-dtype column; return other dtypes as-is."""
    if column.dtype != "object":
        return column
    return column.str.strip().str.upper()


# Normalise every text column in a single pass over the frame.
df = df.apply(_strip_and_upper)

### Encode exog

In [None]:
def encod_exogs(df, cols):
    """One-hot encode categorical columns and append the indicator columns to ``df``.

    For every name in ``cols``, missing values are replaced by ``'Unknown'`` and
    one 0/1 integer column is created per distinct value, named after that value
    (in sorted order). The original columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame made of ``df`` followed by the indicator columns of each
        encoded column, in the order of ``cols``.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.
    """
    pieces = [df]
    for col in cols:
        filled = df[col].fillna('Unknown')
        # get_dummies yields one column per category even when there are only
        # two (a label binarizer collapses that case into a single column, which
        # made the column naming fail) and it keeps df's index, so the
        # concatenation below stays row-aligned whatever the index is.
        pieces.append(pd.get_dummies(filled, dtype=int))

    return pd.concat(pieces, axis=1)

# Add one indicator column per MEDIA value to the cleaned frame.
dfnew = encod_exogs(df,['MEDIA'])

### Filter

In [None]:
def filter_df(df, mydict):
    """Return the rows of ``df`` whose columns equal the requested values.

    ``mydict`` maps a column name to the value that column must hold; a row is
    kept only if it satisfies every pair. ``df`` itself is left untouched, and
    an empty ``mydict`` returns a copy of all rows.
    """
    keep = pd.Series(True, index=df.index)
    for column, wanted in mydict.items():
        keep = keep & (df[column] == wanted)
    return df[keep]
    
# Keep only the rows that match every entry of myfilters.
df_filter = filter_df(dfnew, myfilters)

In [None]:
df_filter.shape

### Create Aggregated Df with DateTime Index

In [None]:
# Aggregate to one row per date, then put the series on a business-day calendar.
# DATEDATA comes out of the CSV as text, so it is converted first: asfreq needs a
# DatetimeIndex to build the calendar.
# NOTE(review): assumes pd.to_datetime infers the export's date format correctly —
# pass an explicit format= if the dates are day-first or otherwise ambiguous.
df2 = (
    df_filter
    .assign(DATEDATA=pd.to_datetime(df_filter['DATEDATA']))
    .groupby('DATEDATA')
    .sum(numeric_only=True)  # text columns (SITE, MEDIA, ...) must not be "summed"
    .asfreq(freq='B')
)
# Business days added by asfreq carry no data: count them as zero volume.
# The result is assigned back because a chained inplace fillna on df2['TOTAL']
# does not reliably modify df2.
df2['TOTAL'] = df2['TOTAL'].fillna(0)

In [None]:
# Add 'ANOS BATCHS' into 'ANO BATCH' and 'COURRIEL' into 'EMAIL' (presumably
# alternate labels for the same media — confirm), then keep dates from
# 2018-08-01 onwards and drop the two columns that were folded in.
df2 = df2.assign(**{
    'ANO BATCH': df2['ANO BATCH'] + df2['ANOS BATCHS'],
    'EMAIL': df2['EMAIL'] + df2['COURRIEL'],
})
from_start = df2.index >= '2018-08-01'
df2 = df2[from_start].drop(columns=['ANOS BATCHS', 'COURRIEL'])

In [None]:
df2.describe()

In [None]:
# Re-aggregate to the configured frequency by summing each period.
# This rebinds df2, so re-running the cell resamples the already-resampled frame.
df2=df2.resample(scale).sum()

### Split into train and test sets to compare with predictions

In [None]:
# Hold out the fraction `testsize` of the aggregated series for evaluation.
# presumably the split keeps chronological order and takes the test rows from the end — confirm
train,test = train_test_split(df2, test_size=testsize)

In [None]:
# For each split: TOTAL is the target series, every other column is an exogenous regressor.
train_endog, train_exog = train['TOTAL'], train.drop(columns='TOTAL')
test_endog, test_exog = test['TOTAL'], test.drop(columns='TOTAL')

In [None]:
# Series, autocorrelation and histogram of the training target.
# The title names the plotted series (the previous "Sunspots" label did not describe this data).
pm.tsdisplay(train_endog, lag_max=20, title="TOTAL (train)", show=True)

In [None]:
from pmdarima import preprocessing

# Box-Cox transform the training target and inspect the result.
# fit_transform returns (transformed y, exogenous) — not the lambda — so the
# fitted transformer is kept and the estimated lambda is read from lam1_.
# lmbda2 adds a tiny positive shift: TOTAL may contain zeros (missing days were
# filled with 0) and Box-Cox requires strictly positive values.
boxcox = preprocessing.BoxCoxEndogTransformer(lmbda2=1e-6)
y_bc, _exog = boxcox.fit_transform(train_endog)
l = boxcox.lam1_
pm.tsdisplay(y_bc, lag_max=20, title="TOTAL (BoxCox-transformed)", show=True)
print("lambda %s" % l)

### Model auto.arima

In [None]:
# Search for a seasonal ARIMA on TOTAL, with the remaining columns as exogenous regressors.
# m=5 is the seasonal period and is hard-coded.
# NOTE(review): presumably one business week for scale='B' — confirm for other scales.
# NOTE(review): newer pmdarima releases name the regressor argument X instead of
# exogenous — confirm against the installed version.
modl = pm.auto_arima(train_endog,exogenous=train_exog, error_action='ignore', trace=True,
                      suppress_warnings=True, maxiter=10,
                      seasonal=True, m=5)

In [None]:
# Selected order and seasonal order, one per line.
print(modl.order, modl.seasonal_order, sep='\n')

In [None]:
# Set the global default figure size, then draw the fitted model's diagnostic plots.
# The trailing ';' suppresses the textual repr of the returned object in the cell output.
plt.rcParams['figure.figsize'] = [10, 10]
modl.plot_diagnostics();

In [None]:
# Forecast one period per test row and also return the confidence intervals.
# NOTE(review): test_exog holds the regressors' values observed during the test
# period; confirm these would be known at forecast time, otherwise this
# evaluation is optimistic.
preds, conf_int = modl.predict(n_periods=test.shape[0],exogenous=test_exog, return_conf_int=True)

### Model HoltWinters

In [None]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:
# Holt-Winters with additive trend and additive seasonality over 5 observations.
# The `damped` keyword is no longer passed: newer statsmodels renamed it to
# damped_trend, and an undamped trend is the default in every version.
modl2 = ExponentialSmoothing(train_endog, trend='add', seasonal='add', seasonal_periods=5).fit()

In [None]:
# Holt-Winters forecast from the first to the last date of the test set.
hwpreds = modl2.predict(start=test_endog.index[0], end=test_endog.index[-1])

### VAR Model

In [None]:
# Log-differences of TOTAL. Zeros are turned into NaN before the log (as is done
# for the VAR inputs): log(0) is -inf and would leave infinite values that
# dropna does not remove.
trainnew = np.log(train['TOTAL'].replace(0, np.nan)).diff().dropna() # still to be tested

In [None]:
# pm.tsdisplay(trainnew, lag_max=20, title="Sunspots", show=True)

In [None]:
# VAR inputs: the three series, log-differenced. Zeros become NaN before the
# log, and every row with a NaN in any of the three columns is dropped.
train_new = train[['TOTAL','COURRIER','EMAIL']]
train_new = np.log(train_new.replace(0, np.nan)).diff().dropna()

In [None]:
# Put the series back on a regular frequency. asfreq re-inserts the dates that
# were dropped as all-NaN rows, which VAR cannot be fitted on, so they are
# stored as zeros here rather than only filled for display.
# NOTE(review): a zero log-difference means "no change" on those dates — confirm
# this is the intended treatment.
train_new = train_new.asfreq(scale).fillna(0)

In [None]:
# Summary of the zero-filled frame; the fillna result is used for this display
# only and is not assigned back to train_new.
train_new.fillna(0).info()

In [None]:
from statsmodels.tsa.vector_ar.var_model import VAR
# Fit a vector autoregression on the log-differenced series using fit()'s default lag settings.
modl3 = VAR(train_new).fit(verbose=True)

In [None]:
modl3.summary()

In [None]:
# The fitted VAR results object forecasts with forecast(y, steps) rather than
# predict(start, end): seed it with the last k_ar observations and produce one
# step per test row. Values are on the log-difference scale of train_new.
lag_order = modl3.k_ar
pred3 = pd.DataFrame(
    modl3.forecast(train_new.values[-lag_order:], steps=len(test_endog)),
    index=test_endog.index,
    columns=train_new.columns,
)

### Plot Result

In [None]:
# Assemble a single frame for plotting: on the test rows the actual values
# (TEST), the forecasts and their confidence bounds; on the train rows TOTAL.
graph = pd.DataFrame(test_endog.copy())
graph = graph.rename(columns={'TOTAL':'TEST'})
graph['preds'] = preds
# graph['hwpreds'] = hwpreds
graph['lb'] = conf_int[:,0]
graph['ub'] = conf_int[:,1]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the same way.
graph = pd.concat([graph, pd.DataFrame(train_endog)])

In [None]:
# Restrict the plotted window to dates after 2020-03-01 (hard-coded).
graphz = graph[graph.index>'2020-03-01']

In [None]:
# Train values, held-out test values, ARIMA forecasts and their confidence band on one chart.
plt.rcParams['figure.figsize'] = [15, 10]
fig, ax = plt.subplots()
dates = graphz.index
g1 = ax.plot(dates, graphz.TOTAL, label='Train')
g2 = ax.plot(dates, graphz.TEST, 'c--', label='Test')
g3 = ax.plot(dates, graphz.preds, 'r', label='Pred', linewidth=2, alpha=.5)
# g3b = ax.plot(dates, graphz.hwpreds, 'g', label='Pred', linewidth=2)
g4 = ax.fill_between(dates, graphz.lb, graphz.ub, color='r', alpha=.2, label='C.I.')
ax.legend(loc='upper left')
ax.set_title('Actual test samples vs. forecasts')
plt.show()