In [None]:
# Record the wall-clock start of the run; the last cell subtracts it from a
# second time.time() reading to report the total run time.
import time
t1 = time.time()

In [None]:
import os
from zipfile import ZipFile

import boto3
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import pmdarima as pm
import statsmodels as sm
from pmdarima.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

### Get the last file from S3

In [None]:
# Parameters for locating the data on S3 and storing it locally
bucket_name = 'machinelearning'  # bucket whose objects are listed and downloaded
path_to_files = 'projets/ml_ige_entrants/data/'  # key prefix used to list candidate objects
zip_file = './data/data.zip'  # local path the selected object is downloaded to

In [None]:
# Read the custom S3 endpoint URL from a config file stored in the user's
# .aws folder (a line of the form `endpoint_url = <url>`).
config_file = '.custom_config'

# USERPROFILE is only defined on Windows; fall back to the generic home
# directory so the notebook also runs on Linux/macOS.
home_dir = os.environ.get('USERPROFILE') or os.path.expanduser('~')
aws_path = os.path.join(home_dir, '.aws', config_file)

with open(aws_path, 'r') as f:
    lines = f.readlines()

# Split on the first '=' only, so a URL that itself contains '=' (for example
# in a query string) is kept whole instead of being truncated.
endpoints = [
    line.split('=', 1)[1].strip()
    for line in lines
    if line.startswith('endpoint_url') and '=' in line
]
if not endpoints:
    # Fail with an explicit message rather than a bare IndexError.
    raise ValueError("No 'endpoint_url=...' entry found in %s" % aws_path)
endpoint = endpoints[0]

In [None]:
# Connect to the S3 service at the configured endpoint
session = boto3.session.Session()
s3 = session.resource(
    service_name='s3',
    endpoint_url=endpoint,
)

# List the objects stored under the data prefix. Keys ending in '/' are
# skipped: presumably these are folder placeholder objects, which cannot be
# downloaded as a data file (NOTE(review): confirm against the bucket layout).
bucket = s3.Bucket(bucket_name)
bucket_objects = bucket.objects.filter(Prefix=path_to_files)
candidates = [obj for obj in bucket_objects if not obj.key.endswith('/')]
if not candidates:
    raise FileNotFoundError(
        "No object found under prefix '%s' in bucket '%s'" % (path_to_files, bucket_name)
    )

# Pick the most recently modified object. The full timestamp is compared:
# ranking on the seconds field alone (strftime("%S"), 0-59) does not tell
# which object is the newest.
last_file = max(candidates, key=lambda obj: obj.last_modified).key

In [None]:
# Download the selected object to the local zip path.
# The target folder is created first so the download also works on a fresh
# checkout where ./data/ does not exist yet.
target_dir = os.path.dirname(zip_file)
if target_dir:
    os.makedirs(target_dir, exist_ok=True)

MyObject = s3.Object(bucket_name, last_file)
MyObject.download_file(zip_file)

In [None]:
# Unpack every member of the downloaded archive into the local data folder
extract_dir = './data/'
with ZipFile(zip_file, mode='r') as archive:
    archive.extractall(path=extract_dir)

In [None]:
# TODO: add error management in case the file does not exist.
# TODO: replace previously extracted files.
# TODO: clean up temporary files (the downloaded zip).

### Parameters

In [None]:
filename = './data/ml_ige_entrants_ds1.csv'  # CSV file read by the import cell

# Column -> value equality filters applied to the rows before aggregation
myfilters = {'SITE':'VELIZY','FLUX_ACTIVITE':'FLUX PRESTATION','SERVICE_ACTIVITE':'PRESTATION'}
# Columns to encode as exogenous indicator variables, e.g. ['MEDIA'];
# an empty list means the model is fitted on TOTAL alone
# exogs = ['MEDIA']
exogs = []

testsize = .05  # passed as test_size to train_test_split

scale='W' # resampling rule applied to the aggregated series: 'B','W','SM'
periods = 4  # passed as the seasonal period m to auto_arima

weighted_total = True  # when True, TOTAL is multiplied by COEFF during preprocessing

max_date='2020-06-29'  # rows whose DATEDATA is after this value are dropped

### Import csv

In [None]:
# Load the ';'-separated CSV, reading ',' as the decimal mark.
# DATEDATA is neither parsed as a date nor used as the index here (those
# options are left commented out at the end of the call).
%time df = pd.read_csv(filename,sep=';',engine='python',decimal=',') #,index_col='DATEDATA',parse_dates=True)

In [None]:
# Display the frame as loaded from the CSV
df

### Preprocess columns

In [None]:
# Optional text normalisation of the string columns (currently disabled):
# df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# df = df.apply(lambda x: x.str.upper() if x.dtype == "object" else x)

# Weight the totals by their coefficient when requested
if weighted_total:
    df['TOTAL'] = df['TOTAL'].mul(df['COEFF'])

# The coefficient column is not needed past this point
df = df.drop(columns='COEFF')

In [None]:
# Display the frame after weighting TOTAL and dropping COEFF
df

### Encode exog

In [None]:
def encod_exogs(df, cols):
    """Append 0/1 indicator columns for each categorical column in ``cols``.

    Missing values in an encoded column are treated as the category
    ``'Unknown'``. The original columns are kept untouched, and the indicator
    columns are appended in the order of ``cols``. Within one source column
    the indicators are ordered by sorted category label and named after the
    category itself (no prefix).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    cols : list of str
        Names of the columns to encode. An empty list returns a copy of ``df``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index as ``df`` and the indicator columns
        appended on the right.

    Raises
    ------
    KeyError
        If a name in ``cols`` is not a column of ``df``.

    Notes
    -----
    ``pd.get_dummies`` is used so that every category gets its own column,
    including when a column has exactly two categories, and so that the
    indicators keep the index of ``df`` and stay row-aligned in the concat.
    Two encoded columns that share a category label yield duplicate column
    names.
    """
    pieces = [df]
    for col in cols:
        labels = df[col].fillna('Unknown')
        pieces.append(pd.get_dummies(labels, dtype=int))

    return pd.concat(pieces, axis=1)

# Encode the configured exogenous columns; with an empty `exogs` list no
# column is added
dfnew = encod_exogs(df,exogs)

### Filter

In [None]:
def filter_df(df, mydict):
    """Return the rows of ``df`` that match every ``column == value`` pair.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is left unchanged.
    mydict : dict
        Maps a column name to the value that column must equal.
        An empty dict selects every row.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the matching rows, with their original index.
    """
    # Accumulate one boolean mask instead of slicing the frame repeatedly
    keep = pd.Series(True, index=df.index)
    for column, wanted in mydict.items():
        keep &= (df[column] == wanted)
    return df[keep].copy()
    
# Keep the rows matching the configured filters, then drop everything dated
# after max_date
df_filter = filter_df(dfnew, myfilters)
df_filter = df_filter.loc[df_filter['DATEDATA'].le(max_date)]

In [None]:
# (rows, columns) remaining after filtering
df_filter.shape

### Create Aggregated Df with DateTime Index

In [None]:
# Aggregate to one row per date on a business-day DatetimeIndex.
#
# DATEDATA is converted to datetime first: asfreq() needs a date index to
# line the existing rows up with the generated business days.
# NOTE(review): assumes DATEDATA is in a format pd.to_datetime parses
# unambiguously (e.g. ISO 'YYYY-MM-DD') - confirm against the CSV.
daily = df_filter.assign(DATEDATA=pd.to_datetime(df_filter['DATEDATA']))

# numeric_only=True keeps the text columns (SITE, ...) out of the sum;
# recent pandas versions would otherwise concatenate their strings.
df2 = daily.groupby('DATEDATA').sum(numeric_only=True)
df2 = df2.asfreq(freq='B')

# Business days without data count as zero volume. Plain assignment is used
# because fillna(inplace=True) on a column selection may not update df2.
df2['TOTAL'] = df2['TOTAL'].fillna(0)

In [None]:
# Fold 'ANOS BATCHS' into 'ANO BATCH' and 'COURRIEL' into 'EMAIL'
# (presumably two spellings of the same category - confirm with the data
# owner), then keep only the rows from 2018-08-01 onwards.
#
# The step runs only when all four columns are present, as before, but the
# presence check is explicit: unrelated errors are no longer swallowed by a
# bare `except`, and the frame can no longer be left half-merged.
merge_pairs = {'ANO BATCH': 'ANOS BATCHS', 'EMAIL': 'COURRIEL'}
needed_columns = set(merge_pairs) | set(merge_pairs.values())

if needed_columns.issubset(df2.columns):
    for kept, folded in merge_pairs.items():
        df2[kept] = df2[kept] + df2[folded]
    df2 = df2[df2.index >= '2018-08-01'].drop(columns=list(merge_pairs.values()))

In [None]:
# Summary statistics of the aggregated business-day series
df2.describe()

In [None]:
# Sum the business-day rows into the configured modelling frequency
df2 = df2.resample(rule=scale).sum()

### Split in train test to compare with predictions

In [None]:
# Split the resampled series with pmdarima's train_test_split, holding out a
# `testsize` share of the rows.
# NOTE(review): presumably the split keeps chronological order with the test
# rows at the end - confirm in the pmdarima documentation.
train,test = train_test_split(df2, test_size=testsize)

In [None]:
# Separate the series to forecast (TOTAL) from the remaining columns, which
# are used as exogenous variables
target_col = 'TOTAL'

train_endog, train_exog = train[target_col], train.drop(columns=[target_col])
test_endog, test_exog = test[target_col], test.drop(columns=[target_col])

In [None]:
# Number of exogenous columns; 0 means the model is fitted without exog
len(train_exog.columns)

In [None]:
# pm.tsdisplay(train_endog, lag_max=20, title="Sunspots", show=True)

In [None]:
# from pmdarima import preprocessing

# y_bc, l = preprocessing.BoxCoxEndogTransformer().fit_transform(train_endog)
# pm.tsdisplay(y_bc, lag_max=20, title="Sunspots (BoxCox-transformed)", show=True)
# print("lambda %s" % l)

### Model auto.arima

In [None]:
# Search settings shared by both cases
arima_options = dict(
    error_action='ignore',
    trace=True,
    suppress_warnings=True,
    maxiter=10,
    seasonal=True,
    m=periods,
)

# The exogenous matrix is only handed over when it has at least one column
if len(train_exog.columns) != 0:
    arima_options['exogenous'] = train_exog

modl = pm.auto_arima(train_endog, **arima_options)

In [None]:
# Show the selected order and seasonal order, one per line
print(modl.order, modl.seasonal_order, sep='\n')

In [None]:
# Set the default figure size, then draw the fitted model's diagnostic plots
# (the trailing ';' keeps the call's return value out of the cell output)
plt.rcParams['figure.figsize'] = [10, 10]
modl.plot_diagnostics();

In [None]:
# Forecast one period per test row, with confidence intervals.
# Mirror the fitting cell: exogenous data is passed only when there is at
# least one exogenous column, so a model fitted without exog is never handed
# an empty (zero-column) matrix.
has_exog = len(test_exog.columns) > 0
preds, conf_int = modl.predict(
    n_periods=test.shape[0],
    exogenous=test_exog if has_exog else None,
    return_conf_int=True,
)

### Model HoltWinters

In [None]:
# from statsmodels.tsa.holtwinters import ExponentialSmoothing

In [None]:
# modl2 = ExponentialSmoothing(train_endog, trend='add', damped=False, seasonal='add', seasonal_periods=periods).fit()

In [None]:
# hwpreds = modl2.predict(start=test_endog.index[0], end=test_endog.index[-1])

### Plot Result

In [None]:
# Build a single frame for plotting: the test rows carry the actual values
# (TEST), the forecasts (preds) and the confidence bounds (lb / ub); the
# training rows are stacked underneath and only carry TOTAL.
graph = pd.DataFrame(test_endog.copy())
graph = graph.rename(columns={'TOTAL':'TEST'})
graph['preds'] = preds
# graph['hwpreds'] = hwpreds
graph['lb'] = conf_int[:,0]
graph['ub'] = conf_int[:,1]
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows the
# same way and also works on older versions.
graph = pd.concat([graph, pd.DataFrame(train_endog)])

In [None]:
# Zoomed view for plotting: only the rows dated after 2020-01-01
graphz = graph[graph.index>'2020-01-01']

In [None]:
# Plot the training data, the held-out actuals, the forecasts and their
# confidence band on a single set of axes
plt.rcParams['figure.figsize'] = [15, 10]
fig, ax = plt.subplots()

ax.plot(graphz.index, graphz['TOTAL'], label='Train')
ax.plot(graphz.index, graphz['TEST'], 'c--', label='Test')
ax.plot(graphz.index, graphz['preds'], 'r', label='Pred', linewidth=2, alpha=.5)
# ax.plot(graphz.index, graphz['hwpreds'], 'g', label='Pred', linewidth=2)
ax.fill_between(graphz.index, graphz['lb'], graphz['ub'], color='r', alpha=.2, label='C.I.')

ax.legend(loc='upper left')
ax.set_title('Actual test samples vs. forecasts')
plt.show()

In [None]:
# Report the total wall-clock duration of the run, in whole seconds
t2 = time.time()
elapsed = t2 - t1

print('Total Run time: %i secs' % elapsed)