**<font size="6"> Prédictions des modèles SARIMA retenus </font>**

# Imports des modules et fonctions nécessaires

In [1]:
import os
import re
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from pylab import rcParams
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import mean_squared_error
import itertools
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning) # Permet d'éviter l'affichage des Warnings

# Chargement et nettoyage des données

In [2]:
# Training data: one sub-frame per counting site, selected on the source file name.
# NOTE(review): df_Sts is filtered on 'convention.csv' and df_convention on
# 'sts.csv' — the variable names and the file names look crossed. Confirm this
# is intended before relying on the variable names downstream.
df = pd.read_pickle("data/df_train.pkl")
df_ACE, df_Sts, df_convention = (
    df[df['filename'] == source_file]
    for source_file in ('champs-elysees.csv', 'convention.csv', 'sts.csv')
)

# Test data, split with the same file-name-to-variable mapping as above.
df_test = pd.read_pickle("data/df_test.pkl")
df_ACE_test, df_Sts_test, df_convention_test = (
    df_test[df_test['filename'] == source_file]
    for source_file in ('champs-elysees.csv', 'convention.csv', 'sts.csv')
)

# Lists holding the very same frame objects, so they can be processed in a loop.
dfs = [df_ACE, df_Sts, df_convention]
dfs_test = [df_ACE_test, df_Sts_test, df_convention_test]

## Traitement des dates

In [3]:
# Training frames: parse the timestamp column into datetimes, then put the
# rows in chronological order (both steps modify the frames in place).
for df in dfs:
    df['Date et heure de comptage'] = pd.to_datetime(
        df["Date et heure de comptage"],
        format='%Y-%m-%d %H:%M:%S',
    )
    df.sort_values(by="Date et heure de comptage", inplace=True)

# Test frames: only sorted chronologically.
# NOTE(review): the pd.to_datetime conversion is not applied to the test frames
# (it was left commented out) — presumably their column is already datetime;
# confirm against data/df_test.pkl.
for df in dfs_test:
    df.sort_values(by="Date et heure de comptage", inplace=True)

def index_and_freq(df):
    """Return a copy of `df` indexed by timestamp on a regular hourly grid.

    The 'Date et heure de comptage' column becomes the index, the index is
    conformed to an hourly frequency, and rows for hours absent from the
    input are forward-filled from the previous available row.
    The input frame itself is not modified.
    """
    return df.set_index('Date et heure de comptage').asfreq('H', method='ffill')

# Display names of the three counting sites, looked up by position.
tagg = ['Champs-Élysées', 'Convention', 'Sts-Pères']

# Re-bind each training frame to its timestamp-indexed, hourly version.
# Note that the `dfs` list still references the earlier, un-indexed objects.
df_ACE, df_convention, df_Sts = (
    index_and_freq(frame) for frame in (df_ACE, df_convention, df_Sts)
)

# Same treatment for the test frames.
df_ACE_test, df_Sts_test, df_convention_test = (
    index_and_freq(frame)
    for frame in (df_ACE_test, df_Sts_test, df_convention_test)
)


In [4]:
# Display the 'Débit horaire' column of the Champs-Élysées training frame.
df_ACE['Débit horaire']

Date et heure de comptage
2019-11-01 04:00:00     746.0
2019-11-01 05:00:00     549.0
2019-11-01 06:00:00     503.0
2019-11-01 07:00:00     500.0
2019-11-01 08:00:00     525.0
                        ...  
2020-12-10 20:00:00    1038.0
2020-12-10 21:00:00     699.0
2020-12-10 22:00:00     474.0
2020-12-10 23:00:00     403.0
2020-12-11 00:00:00     247.0
Freq: H, Name: Débit horaire, Length: 9741, dtype: float64

# Utils

In [5]:
def create_df(original_df):
    """Build a compact two-column hourly frame from a site frame.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Frame holding the 'Débit horaire' and "Taux d'occupation" columns;
        its index supplies the values of the resulting 'date' index.

    Returns
    -------
    pandas.DataFrame
        Columns 'debit' and 'taux', indexed by 'date', sorted by date and
        conformed to an hourly frequency with forward-filling of added rows.
    """
    renamed = original_df[['Débit horaire', "Taux d'occupation"]].rename(
        columns={'Débit horaire': 'debit', "Taux d'occupation": 'taux'}
    )
    return (
        renamed
        .assign(date=original_df.index)
        .sort_values("date")
        .set_index('date')
        .asfreq('H', method='ffill')
    )

def string_to_model(model):
    """Parse a SARIMA specification string into SARIMAX order arguments.

    Parameters
    ----------
    model : str
        Specification formatted as '(p, d, q)(P, D, Q, s)',
        e.g. '(4, 1, 5)(4, 1, 4, 24)'.

    Returns
    -------
    tuple
        (p, d, q, (P, D, Q, s)) as integers, in the order in which they
        appear in the string.

    Raises
    ------
    ValueError
        If the string does not contain exactly seven integers.
    """
    # Extract the integers by pattern rather than by fixed character offsets,
    # so multi-digit orders, a seasonal period that is not two digits long,
    # and different spacing are all parsed correctly.
    values = [int(token) for token in re.findall(r'-?\d+', model)]
    if len(values) != 7:
        raise ValueError(
            "Expected a model string like '(p, d, q)(P, D, Q, s)', got: %r" % (model,)
        )
    p, d, q = values[:3]
    return p, d, q, tuple(values[3:])



# Modèles retenus

In [6]:
# Selected SARIMA specifications, keyed by site index.
# Index legend as originally written: 0 = Champs Elysees, 1 = Sts Peres, 2 = Convention.
# NOTE(review): the site-name list `tagg` defined earlier orders index 1 as
# 'Convention' and index 2 as 'Sts-Pères' — confirm which mapping these keys follow.

# Models for the hourly flow ("Débit horaire").
modeles_debit = {
    0: '(4, 1, 5)(4, 1, 4, 24)',
    2: '(4, 1, 5)(4, 1, 4, 24)',
    1: '(4, 1, 5)(4, 1, 4, 24)',
}

# Models for the occupancy rate ("Taux d'occupation").
modeles_taux = {
    0: '(1, 0, 1)(2, 1, 1, 24)',
    2: '(1, 0, 1)(2, 1, 1, 24)',
    1: '(2, 0, 2)(1, 1, 1, 24)',
}

In [7]:
def plotting_predictions(X_train, pred, title):
    """Draw observed data and predictions on one figure and show it.

    Parameters
    ----------
    X_train : series-like
        Observed values, plotted with the legend label 'Données'.
    pred : series-like
        Predicted values, plotted with the legend label 'Prediction'.
    title : str
        Figure title.
    """
    for series, legend_label in ((X_train, 'Données'), (pred, 'Prediction')):
        plt.plot(series, label=legend_label)
    plt.title(title)
    plt.xticks(rotation=45)  # slanted tick labels on the x axis
    plt.legend()
    plt.show()

# Génération des prédictions

In [None]:
def _fit_and_forecast(series, model_spec, start, end):
    """Fit a SARIMAX model on `series` and return its predictions for [start, end].

    `model_spec` is a string such as '(4, 1, 5)(4, 1, 4, 24)'; it is decoded by
    string_to_model into the non-seasonal order and the seasonal order.
    """
    p, d, q, seasonal_order = string_to_model(model_spec)
    model = sm.tsa.statespace.SARIMAX(
        series,
        order=(p, d, q),
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    return model.fit().predict(start=start, end=end)


# Loop-invariant settings, hoisted out of the loop.
rcParams['figure.figsize'] = 12, 4
start = datetime.strptime('2020-12-11 01:00:00', '%Y-%m-%d %H:%M:%S')
end = datetime.strptime('2020-12-16 23:00:00', '%Y-%m-%d %H:%M:%S')

# NOTE(review): in the loading cell df_Sts is filtered on 'convention.csv' and
# df_convention on 'sts.csv', so position 1 below carries the convention.csv
# rows and is labelled tagg[1]. Verify that modeles_debit / modeles_taux follow
# the same indexing.
station_frames = [df_ACE, df_Sts, df_convention]
prediction_frames = []

# tqdm wraps the list (not the enumerate object) so it knows the total.
for model_index, station_df in enumerate(tqdm(station_frames)):
    station_name = tagg[model_index]

    # Build the hourly series once, then slice the training and plotting windows.
    series = create_df(station_df)
    train_window = series.loc['2020-08-01':'2020-12-11 00:00:00']
    plot_window = series.loc['2020-11-10':'2020-12-11 00:00:00']
    debit_train, taux_train = train_window.debit, train_window.taux
    debit_plot, taux_plot = plot_window.debit, plot_window.taux

    # Fit one model per target and predict over the requested dates.
    debit_pred = _fit_and_forecast(debit_train, modeles_debit[model_index], start, end)
    taux_pred = _fit_and_forecast(taux_train, modeles_taux[model_index], start, end)

    # Predicted rows for this site.
    forecast = pd.DataFrame({
        'Datetime': taux_pred.index,
        'Arc': station_name,
        "Débit horaire": debit_pred.values,
        "Taux d'occupation": taux_pred.values,
    })

    # One extra row: the last observed training values (the midnight row).
    last_observed = pd.DataFrame({
        'Datetime': [taux_train.index[-1]],
        'Arc': [station_name],
        "Débit horaire": [debit_train.values[-1]],
        "Taux d'occupation": [taux_train.values[-1]],
    })

    # pd.concat replaces DataFrame.append, which no longer exists in pandas >= 2.0.
    prediction_frames.append(
        pd.concat([forecast, last_observed]).sort_values("Datetime")
    )

    # Both plots show the recent window; the occupancy plot previously used the
    # whole training series while taux_plot was computed but never used.
    plotting_predictions(debit_plot, debit_pred, "Débit horaire " + station_name)
    plotting_predictions(taux_plot, taux_pred, "Taux d'occupation " + station_name)

# Assemble the final frame once instead of growing it inside the loop.
df_predictions = pd.concat(prediction_frames, ignore_index=True)
    
    
    
    

0it [00:00, ?it/s]

In [None]:
# Write the predictions to 'output.csv', without the row index.
df_predictions.to_csv('output.csv', index=False)