In [None]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from calendar import monthrange
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error
import numpy as np

In [None]:
# Load the weather dataset from its CSV file hosted on S3 into a DataFrame.
# NOTE(review): the file name suggests French weather records covering 1950-2022 — confirm.
df = pd.read_csv("https://jedha-final-project-jrat.s3.amazonaws.com/datameteo_france_1950-2022_clean_02.csv")

In [None]:
# Keep only the rows of the "CENTRE VAL DE LOIRE" region.
# .copy() gives df_sample its own data: later cells add and overwrite columns on it,
# which on a plain filtered slice triggers SettingWithCopyWarning.
df_sample = df[df["region"]=="CENTRE VAL DE LOIRE"].copy()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the regional subset.
df_sample.info()

In [None]:
# Non-null value count of every column for each station,
# ordered by the count of the NOM_USUEL column (ascending).
counts_per_station = df_sample.groupby(["NUM_POSTE"]).count()
counts_per_station.sort_values("NOM_USUEL")

In [None]:
# DATE PREPROCESSING

# Parse the AAAAMM column into datetimes (the column is overwritten in place).
# NOTE(review): no explicit format is passed. If AAAAMM is stored as an integer such as
# 195001, pd.to_datetime interprets it as nanoseconds since the epoch — confirm the
# column's dtype and pass an explicit format (e.g. "%Y%m") if needed.
df_sample["AAAAMM"] = pd.to_datetime(df_sample["AAAAMM"])

def last_day_of_month(aaaamm):
    """
    Return, as a "<year>-MM-DD" string, the last calendar day of the month
    that ``aaaamm`` falls in.

    ``aaaamm`` can be any object exposing integer ``.year`` and ``.month``
    attributes (a date, a datetime, a pandas Timestamp, ...).
    """
    # monthrange returns (weekday of the 1st, number of days in the month);
    # only the day count is needed here.
    _, days_in_month = monthrange(aaaamm.year, aaaamm.month)
    pieces = [str(aaaamm.year), f"{aaaamm.month:02d}", f"{days_in_month:02d}"]
    return "-".join(pieces)

# Build the "date" column: the month-end day of each AAAAMM value, stored as a datetime.
month_end_labels = df_sample["AAAAMM"].apply(last_day_of_month)
df_sample["date"] = pd.to_datetime(month_end_labels)


In [None]:
# Columns retained for the rest of the notebook.
to_keep = [
    "NUM_POSTE",
    "NOM_USUEL",
    "LAT",
    "LON",
    "ALTI",
    "Year",
    "Month",
    "vent_speed_inst_moy_mensu",
    "departement_num",
    "departement_name",
    "region",
    "date",
]
# Every other column of df_sample, in its original order, is scheduled for removal.
to_drop = list(filter(lambda name: name not in to_keep, df_sample.columns))

In [None]:
# Display the list of columns that are about to be dropped.
to_drop

In [None]:
# Remove the unneeded columns.
# errors="ignore" keeps this cell idempotent: re-running it once the columns are
# already gone is a no-op instead of a KeyError.
df_sample = df_sample.drop(columns=to_drop, errors="ignore")

In [None]:
# Number of distinct station identifiers in the regional subset.
len(df_sample.loc[:, "NUM_POSTE"].unique())

In [None]:
# Number of missing values in each remaining column.
df_sample.isna().sum()

In [None]:
def get_wind_forecast(time, data):
    """
    Fit a SARIMAX model on all but the last ``time`` rows of ``data`` and
    forecast the wind speed over those held-out rows.

    Parameters
    ----------
    time : int
        Forecast horizon expressed in rows of ``data`` (months when ``data``
        has a monthly index). Must be >= 1 and strictly smaller than
        ``len(data)``.
    data : pandas.DataFrame
        Regularly spaced observations indexed by date, containing the target
        column "vent_speed_inst_moy_mensu" and the exogenous columns
        "LON", "LAT" and "ALTI".

    Returns
    -------
    tuple
        ``(test, predicted_values, mae, mape)``: ``test`` is an independent copy
        of the held-out rows, ``predicted_values`` is the forecast mean for those
        rows, ``mae`` and ``mape`` are the mean absolute (percentage) errors
        computed on the held-out rows whose target is not NaN (both are NaN
        when no held-out target is observed).

    Raises
    ------
    ValueError
        If ``time`` is smaller than 1 or leaves no row to train on.
    """
    if time < 1:
        raise ValueError(f"time must be >= 1, got {time}")
    if len(data) <= time:
        raise ValueError(
            f"need more than {time} rows to hold out {time} of them, got {len(data)}"
        )

    target_col = "vent_speed_inst_moy_mensu"
    to_keep = ["LON","LAT","ALTI"]

    # Positional split on the chronologically sorted frame. Subtracting
    # DateOffset(months=time) from a month-end date can land before the real
    # month end (e.g. 04-30 minus 1 month is 03-30, Feb 28 vs. a leap-year Feb 29),
    # which puts more than `time` rows in the test set and breaks the forecast call.
    data = data.sort_index()
    train = data.iloc[:-time]
    # Copy so that callers can add columns to the returned frame without warnings.
    test = data.iloc[-time:].copy()

    # Exogenous variables / target.
    # SARIMAX rejects NaN in exog. Rows inserted by a frequency re-index have no
    # values, so propagate the neighbouring ones; this assumes the three columns
    # are static station metadata — TODO confirm.
    # NOTE(review): if they are constant for a given station they bring no
    # information to a per-station model — confirm they are meant as regressors.
    exog = data[to_keep].ffill().bfill()
    var_exog_train = exog.iloc[:-time]
    var_exog_test = exog.iloc[-time:]

    y_train = train[target_col]
    y_test = test[target_col]

    # Diagnostics: remaining NaN and infinite values in the training exog
    print(var_exog_train.isna().sum())
    print(np.isinf(var_exog_train).sum())

    # Training
    model = SARIMAX(y_train, exog=var_exog_train, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
    sarima_model = model.fit(maxiter=1000)

    # Predictions
    forecast = sarima_model.get_forecast(steps = time, exog = var_exog_test)
    predicted_values = forecast.predicted_mean

    # Evaluation, restricted to the held-out months that were actually observed:
    # the sklearn metrics raise on NaN targets.
    observed = y_test.notna().to_numpy()
    if observed.any():
        actual = y_test.to_numpy()[observed]
        predicted = predicted_values.to_numpy()[observed]
        mae = mean_absolute_error(actual, predicted)
        mape = mean_absolute_percentage_error(actual, predicted)
    else:
        mae = mape = float("nan")

    return test, predicted_values, mae, mape


In [None]:
# Build a dictionary mapping each station number to a pair:
# (the station's monthly frame, the frame evaluating the model on its last months).
# Every station frame is re-indexed on the date with a month-end frequency.

stations = df_sample["NUM_POSTE"].unique()

dict_df = {}

FORECAST_HORIZON = 84   # months held out and forecast (7 years)
# The held-out months are taken from the end of the series, so a station needs data
# beyond the horizon to train on. Two full seasonal cycles is the minimum kept here so
# that the seasonal differencing (period 12) of the model still leaves data to fit.
MIN_TRAIN_MONTHS = 24

for station in stations:

    # set_index returns a new frame, so the filtered slice is never mutated in place
    df_station = df_sample[df_sample["NUM_POSTE"]==station].set_index("date")
    df_station = df_station.asfreq("ME")  # 'ME' = month-end frequency

    # keep only the stations long enough to both train and evaluate the model
    if len(df_station) >= FORECAST_HORIZON + MIN_TRAIN_MONTHS:

        df_station_eval, y_pred, mae, mape = get_wind_forecast(FORECAST_HORIZON, df_station)
        # assign builds a new frame (predictions aligned on the date index, metrics
        # broadcast to every row) instead of writing into the frame returned above
        df_station_eval = df_station_eval.assign(pred_7_ans=y_pred, mae=mae, mape=mape)
        dict_df[station] = (df_station, df_station_eval)


    


In [None]:
# Diagnostic cell: count the stations whose re-indexed frame spans more than 84 rows
# and, among them, those containing at least one missing value.
stations = df_sample["NUM_POSTE"].unique()

# This cell only counts. It must not rebind dict_df, which holds the per-station
# forecasts built earlier in the notebook.
count_84 = 0
count_NA = 0

for station in stations:

    # set_index returns a new frame, so the filtered slice is never mutated in place
    df_station = df_sample[df_sample["NUM_POSTE"]==station].set_index("date")
    df_station = df_station.asfreq("ME")  # 'ME' = month-end frequency

    if len(df_station)-1 >= 84:
        print(f"Station: {station}")
        print(f"nb_mois = {len(df_station)-1}")
        print()
        count_84 += 1
        if df_station.isna().any().any():
            count_NA += 1

print(f"total station avec mois > 84: {count_84}")
print(f"total station > 84 avec NA: {count_NA}")

In [None]:
# Total number of rows in the regional subset.
print(len(df_sample))

