In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import statsmodels as sm



### Import csv

In [None]:
# Relative path of the input CSV that the read_csv cell below loads.
filename = './data/ML_IGE_ENTRANTS_MODELE1.csv'

In [None]:
# Load the semicolon-separated file; the DATEDATA column becomes the index and
# pandas is asked to parse it as dates.
df = pd.read_csv(
    filename,
    sep=';',
    engine='python',
    index_col='DATEDATA',
    parse_dates=True,
)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the loaded frame.
df.info()

### preprocess columns

In [None]:
# Trim leading/trailing whitespace in every object-dtype column; columns of any
# other dtype are passed through unchanged.
df = df.apply(
    lambda col: col.str.strip() if col.dtype == "object" else col
)

In [None]:
# Show the first ten rows after the whitespace clean-up.
df.head(10)

### Filter

In [None]:
def filter_df(df, mydict):
    """Return the rows of ``df`` that satisfy every equality filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is copied first, so the caller's object is left
        untouched.
    mydict : dict
        Maps a column name to the value that column must equal. The filters
        are applied one after another, so a row must match all of them.

    Returns
    -------
    pandas.DataFrame
        The matching rows (a plain copy of ``df`` when ``mydict`` is empty).

    Raises
    ------
    KeyError
        If a key of ``mydict`` is not a column of ``df``.
    """
    remaining = df.copy()
    for column, wanted in mydict.items():
        matches = remaining[column].eq(wanted)
        remaining = remaining.loc[matches]
    return remaining
    
    
# Equality filters (column -> required value) selecting the single series to model.
myfilters = {
    'SITE': 'VELIZY',
    'FLUX_ACTIVITE': 'Flux Prestation',
    'SERVICE_ACTIVITE': 'PRESTATION',
    'MEDIA': 'Courrier',
}
df_filter = filter_df(df, myfilters)

In [None]:
# Display the rows that matched every filter.
df_filter

### Create aggregated DataFrame with a DatetimeIndex

In [None]:
# Sum the numeric columns per date, then reindex to a business-day frequency.
# numeric_only=True is spelled out because, since pandas 2.0, groupby
# reductions no longer silently drop non-numeric columns: without it the text
# columns used for filtering (SITE, MEDIA, ...) would be string-concatenated
# into the result and break the model fit further down.
df2 = df_filter.groupby('DATEDATA').sum(numeric_only=True)
# asfreq('B') adds a row (filled with NaN) for every business day that has no data.
df2 = df2.asfreq(freq='B')

In [None]:
# Display the per-date aggregate after reindexing to business days.
df2

### Split into train and test sets to compare with predictions

In [None]:
from pmdarima.model_selection import train_test_split

# Number of observations held out for evaluating the forecasts.
testsize = 90

# NOTE(review): presumably pmdarima's split keeps chronological order (no
# shuffling), so `test` is the last `testsize` rows — confirm against the docs.
train,test = train_test_split(df2, test_size=testsize)

### Look at train dataset

In [None]:
# Set the default figure size (a global matplotlib setting), then plot the
# training data; the trailing ';' suppresses the Axes repr in the cell output.
plt.rcParams['figure.figsize'] = [8, 5]
train.plot();

In [None]:
# from scipy.stats import boxcox

# df3, l = boxcox(train['TOTAL'])

In [None]:
# Check dtypes and non-null counts of the training split before filling gaps.
train.info()

In [None]:
# Treat days without a recorded value as zero volume.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the train['TOTAL'] selection: that chained in-place form acts on a temporary
# object, emits a warning on recent pandas and leaves `train` unchanged once
# Copy-on-Write is active.
train = train.copy()
train['TOTAL'] = train['TOTAL'].fillna(0)

# Re-check the non-null counts after filling.
train.info()

In [None]:
# Same gap handling as for the training split: missing values become zero.
# Assign the result back rather than using fillna(inplace=True) on the
# test['TOTAL'] selection, which acts on a temporary object and does not
# update `test` once pandas Copy-on-Write is active.
test = test.copy()
test['TOTAL'] = test['TOTAL'].fillna(0)

### Decompose

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
plt.rcParams['figure.figsize'] = [15, 10]
# Additive decomposition of the training series with a period of 5
# observations; interpolate() fills any value that is still missing.
res = seasonal_decompose(
    train.TOTAL.interpolate(),
    model='add',
    period=5,
)
resplot = res.plot()

### ACF / PACF

In [None]:
from pmdarima import utils
plt.rcParams['figure.figsize'] = [8, 5]
# Autocorrelation and partial-autocorrelation plots of the training data.
utils.plot_acf(train)
utils.plot_pacf(train)

In [None]:
# Fit a seasonal auto_arima model on the training data.
import pmdarima as pm
modl = pm.auto_arima(
    train,
    seasonal=True,
    m=5,                      # seasonal period, in observations
    maxiter=10,
    trace=True,               # log each candidate model while searching
    error_action='ignore',
    suppress_warnings=True,
)

In [None]:
# Summary of the model selected by auto_arima.
modl.summary()

In [None]:
# Plot the residuals of the fitted model.
plt.plot(modl.resid())

In [None]:
from sklearn.metrics import mean_squared_error
# Forecast one step for every held-out row and keep the confidence bounds.
preds, conf_int = modl.predict(
    n_periods=len(test),
    return_conf_int=True,
)

# Report the root-mean-squared error of the forecasts against the test data.
print("Test RMSE: %.3f" % np.sqrt(mean_squared_error(test, preds)))

In [None]:
# Assemble one frame for plotting: the held-out actuals (renamed to TEST), the
# forecasts with their lower/upper bounds, followed by the training rows.
graph = test.copy()
graph = graph.rename(columns={'TOTAL':'TEST'})
# np.asarray forces positional assignment, so the forecasts line up with the
# test rows even if predict() returns an index-labelled Series.
graph['preds'] = np.asarray(preds)
graph['lb'] = conf_int[:,0]
graph['ub'] = conf_int[:,1]
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement and
# stacks the training rows below the test rows in the same way.
graph = pd.concat([graph, train])

In [None]:
# graph = graph.sort_index()
# Keep only the rows dated after 2018-06-01 for the plot.
graphz = graph.loc[graph.index > '2018-06-01']

In [None]:
plt.rcParams['figure.figsize'] = [15, 10]
# Draw on an explicit Axes and label every series: without a legend the three
# curves and the shaded band cannot be told apart.
fig, ax = plt.subplots()
ax.plot(graphz.index, graphz.TOTAL, label='train')
ax.plot(graphz.index, graphz.TEST, label='test')
ax.plot(graphz.index, graphz.preds, label='forecast')
# Shaded band between the lower and upper forecast bounds.
ax.fill_between(graphz.index, graphz.lb, graphz.ub, alpha=.2,
                label='confidence interval')
ax.set_title('Actual test samples vs. forecasts')
ax.set_xlabel('DATEDATA')
ax.set_ylabel('TOTAL')
ax.legend()
plt.show()