# Imports básicos para todas as análises

In [11]:
# import os
# Verificando se isso aqui resolve o CUDAOutOfMemory 
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# import pickle
# import hydroeval as hev
# from permetrics import RegressionMetric

import os, re, json, torch, warnings

import mlforecast as mlf, numpy as np, optuna as opt, pandas as pd, plotly.graph_objects as go

from datetime import datetime
from functools import partial
from sklearn.model_selection import TimeSeriesSplit
from plotly.subplots import make_subplots

from statsforecast import StatsForecast
from statsforecast.models import SeasonalNaive

from mlforecast.utils import PredictionIntervals
from sklearn.tree import DecisionTreeRegressor

from neuralforecast import NeuralForecast
from neuralforecast.models import LSTM #, GRU
from neuralforecast.losses.pytorch import HuberMQLoss #, SMAPE, MQLoss

# A ser usado apenas para a análise de imputação de dados (ao invés de sempre aplicar o valor médio)
from sklearn.impute import KNNImputer

from sktime.param_est.seasonality import SeasonalityACF
from sktime.param_est.stationarity import StationarityADF
from sktime.performance_metrics.forecasting import (
    MeanAbsolutePercentageError,
    MeanSquaredError,
)

# from sktime.split import temporal_train_test_split
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import (
    acf,
    pacf
)

# Silence the 'warning' messages that clutter the output of some code snippets.
warnings.filterwarnings("ignore")

# Tone down Optuna's logging (only WARNING level and above).
opt.logging.set_verbosity(opt.logging.WARNING)

# Wrapper so that calling "[DataFrame|Series].plot" from pandas uses the Plotly engine.
pd.options.plotting.backend = "plotly"

# Sets the internal precision PyTorch uses for float32 matrix multiplications.
# NOTE(review): the original comment stated that this call trades precision for speed and lower
#   GPU memory usage, but "highest" appears to be the full-precision option according to the
#   documentation linked below (the reduced-precision options being "high"/"medium").
#   Confirm which behaviour was intended.
# <https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision>
torch.set_float32_matmul_precision("highest")

# Metrics used throughout the analyses
smape = MeanAbsolutePercentageError(symmetric=True) # Best possible value is 0.0
rmse = MeanSquaredError(square_root=True) # The lower, the better

# Presumably forwarded as the `salvar` flag of the plotting helpers — TODO confirm against the calling cells.
SALVAR_PLOTS = True
# Random seed (passed as `seed` to the XGBRegressor built in `cria_dataframe_futuro`).
SEED = 1984

# Base output folder: the plotting helpers prepend it to the image file names they write.
pasta_resultados = "./resultados/trecho_alto/"

# Utilidades

In [12]:
# Métricas comumente aplicadas à Hidrologia

def kling_gupta_efficiency(y_true, y_pred):
    """
        Compute the Kling-Gupta Efficiency (KGE).
        Higher is better (optimum = 1)
        Range=(-inf, 1]

        The score is 1 minus the Euclidean distance, measured from the ideal
        point (1, 1, 1), of three components:
            - the linear correlation between observations and predictions;
            - the ratio of standard deviations (predicted / observed);
            - the ratio of means (predicted / observed).

        Parameters:
        y_true (array-like): Observed values.
        y_pred (array-like): Predicted values.

        Returns:
        float: KGE value.
    """
    observed = y_true if isinstance(y_true, np.ndarray) else np.array(y_true)
    simulated = y_pred if isinstance(y_pred, np.ndarray) else np.array(y_pred)

    # Off-diagonal entry of the 2x2 correlation matrix = linear correlation
    correlation = np.corrcoef(observed, simulated)[0, 1]

    # Variability component: sample standard deviations (ddof=1), predicted over observed
    variability_ratio = np.std(simulated, ddof=1) / np.std(observed, ddof=1)

    # Bias component: predicted mean over observed mean
    bias_ratio = np.mean(simulated) / np.mean(observed)

    squared_distance = (
        (correlation - 1) ** 2
        + (variability_ratio - 1) ** 2
        + (bias_ratio - 1) ** 2
    )

    return 1 - np.sqrt(squared_distance)
##############################################################################################
def kling_gupta_efficiency_non_parametric(y_true, y_pred):
    """
        Compute the non-parametric Kling-Gupta Efficiency (KGEnp).
        Higher is better (optimum = 1)
        Range=(-inf, 1]

        Code adapted from: <https://github.com/ThibHlln/hydroeval/blob/main/hydroeval/objective_functions.py>

        Three components are combined as 1 minus their Euclidean distance to (1, 1, 1):
            - r: correlation computed on the ranks of the two series;
            - alpha: 1 minus half the summed absolute difference between the sorted,
              mean-normalised series (flow duration curves);
            - beta: ratio of means (predicted / observed).

        Interpretation notes (text taken from <https://rdrr.io/cran/hydroGOF/man/KGEnp.html>):
        traditional Kling-Gupta efficiencies (Gupta et al., 2009; Kling et al., 2012) range
        from -Inf to 1, and KGEnp should do so as well. The closer to 1, the more similar
        'y_pred' and 'y_true' are. Knoben et al. (2019) showed that traditional KGE values
        greater than -0.41 indicate that a model improves upon the mean flow benchmark,
        even if the KGE value is negative.

        Parameters:
        y_true (array-like): Observed values.
        y_pred (array-like): Predicted values.

        Returns:
        KGEnp value.
    """
    obs = y_true if isinstance(y_true, np.ndarray) else np.array(y_true)
    sim = y_pred if isinstance(y_pred, np.ndarray) else np.array(y_pred)

    # --- r: correlation of the ranks (double argsort turns values into 0-based ranks)
    rank_sim = np.argsort(np.argsort(sim, axis=0), axis=0)
    rank_obs = np.argsort(np.argsort(obs, axis=0), axis=0)

    dev_obs = rank_obs - np.mean(rank_obs, axis=0, dtype=np.float64)
    dev_sim = rank_sim - np.mean(rank_sim, axis=0, dtype=np.float64)

    numerator = np.sum(dev_obs * dev_sim, axis=0)
    denominator = np.sqrt(
        np.sum(dev_obs ** 2, axis=0) * np.sum(dev_sim ** 2, axis=0)
    )
    r = numerator / denominator

    # --- alpha: compare the sorted series after scaling each one by (n * mean)
    scale_sim = sim.shape[0] * np.mean(sim, axis=0, dtype=np.float64)
    scale_obs = obs.shape[0] * np.mean(obs, axis=0, dtype=np.float64)
    fdc_sim = np.sort(sim / scale_sim, axis=0)
    fdc_obs = np.sort(obs / scale_obs, axis=0)
    alpha = 1 - 0.5 * np.sum(np.abs(fdc_sim - fdc_obs), axis=0)

    # --- beta: bias of the mean
    beta = np.mean(sim, axis=0) / np.mean(obs, axis=0, dtype=np.float64)

    distance = np.sqrt((r - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2)

    return 1 - distance
##############################################################################################
def nash_sutcliffe_efficiency(y_true, y_pred):
    """
        Compute the Nash-Sutcliffe Efficiency (NSE).
        Higher is better (optimum = 1)
        Range=(-inf, 1]

        NSE = 1 - sum((obs - pred)^2) / sum((obs - mean(obs))^2)

        Parameters:
        y_true (array-like): Observed values.
        y_pred (array-like): Predicted values.

        Returns:
        float: NSE value.
    """
    observed = y_true if isinstance(y_true, np.ndarray) else np.array(y_true)
    simulated = y_pred if isinstance(y_pred, np.ndarray) else np.array(y_pred)

    # Squared error of the predictions
    residual_sum_sq = np.sum((observed - simulated) ** 2)

    # Squared deviation of the observations around their own mean
    total_sum_sq = np.sum((observed - np.mean(observed)) ** 2)

    return 1 - (residual_sum_sq / total_sum_sq)
##############################################################################################
def coefficient_determination(y_true, y_pred):
    """
        Compute the Coefficient of Determination (R²).
        Higher is better (optimum = 1)
        Range=(-inf, 1]

        Defined here as 1 minus the ratio between the residual sum of squares
        and the total sum of squares of the observations.

        Parameters:
        y_true (array-like): Observed values.
        y_pred (array-like): Predicted values.

        Returns:
        float: R² value.
    """
    if isinstance(y_true, np.ndarray):
        obs = y_true
    else:
        obs = np.array(y_true)

    if isinstance(y_pred, np.ndarray):
        pred = y_pred
    else:
        pred = np.array(y_pred)

    errors = obs - pred
    centered = obs - np.mean(obs)

    ss_res = np.sum(errors ** 2)    # residual sum of squares
    ss_tot = np.sum(centered ** 2)  # total sum of squares

    return 1 - (ss_res / ss_tot)
##############################################################################################
def percentage_bias(y_true, y_pred):
    """
        Percent Bias (PBIAS): measures the average tendency of the simulated values
        to be larger or smaller than the observed ones.
        The ideal PBIAS is 0.0; low-magnitude values indicate an accurate simulation.
        Positive values indicate *OVERESTIMATION* bias
        Negative values indicate *UNDERESTIMATION* bias

        Source: <https://search.r-project.org/CRAN/refmans/hydroGOF/html/pbias.html>

        Parameters:
        y_true (array-like): Observed values.
        y_pred (array-like): Predicted values.

        Returns:
        float: PBIAS value, in percent.
    """
    observed = y_true if isinstance(y_true, np.ndarray) else np.array(y_true)
    simulated = y_pred if isinstance(y_pred, np.ndarray) else np.array(y_pred)

    # Accumulated signed error relative to the accumulated observed volume
    total_error = np.sum(simulated - observed)
    total_observed = np.sum(observed)

    return (total_error / total_observed) * 100
##############################################################################################
def deviation_runoff_volume(y_true, y_pred):
    """
        Compute the Deviation of the Runoff Volumes (DRV): the ratio between the
        summed predictions and the summed observations.
        A value close to 1.0 indicates the model is predicting well.

        Parameters:
        y_true (array-like): Observed values.
        y_pred (array-like): Predicted values.

        Returns:
        float: DRV value.
    """
    observed = y_true if isinstance(y_true, np.ndarray) else np.array(y_true)
    simulated = y_pred if isinstance(y_pred, np.ndarray) else np.array(y_pred)

    predicted_volume = np.sum(simulated)
    observed_volume = np.sum(observed)

    return predicted_volume / observed_volume

In [13]:
def plot_serie_temporal(
    dataset: pd.DataFrame,
    coluna : str,
    tp_coluna : bool,
    plot_title: str,
    line_color: str,
    short_name: str,
    salvar: bool = False,
) -> None:
    """
        Draw the line chart of a complete time series.

        Parameters:
            dataset: DataFrame with the time series; its "ds" column is used as the x axis.

            coluna: name of the column plotted on the y axis.

            tp_coluna: boolean that selects the y-axis label
                True -> "Vazão (m³/s)" (streamflow)
                False -> "Precipitação (mm/dia)" (rainfall)

            plot_title: title of the chart.

            line_color: colour of the series line.

            short_name: name given to the trace (legend text).

            salvar: when True the chart is written as a timestamped PNG under
                `pasta_resultados`; when False it is displayed on screen.
    """
    y_axis_label = "Precipitação (mm/dia)"
    if tp_coluna:
        y_axis_label = "Vazão (m³/s)"

    # Frame styling applied to both axes
    axis_frame = dict(
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    series_trace = go.Scatter(
        x=dataset["ds"],
        y=dataset[coluna],
        mode="lines",
        name=short_name,
        line=dict(color=line_color, width=2),
    )

    fig = go.Figure()
    fig.add_trace(series_trace)

    fig.update_yaxes(
        title=dict(text=y_axis_label, font=dict(family="system-ui", size=18)),
        zerolinecolor="black",
        **axis_frame,
    )

    fig.update_xaxes(
        title=dict(text="Período", font=dict(family="system-ui", size=18)),
        **axis_frame,
    )

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode="x unified",
        plot_bgcolor="#c8d4e3",
        title=dict(text=plot_title, font=dict(family="system-ui", size=24)),
    )

    if not salvar:
        fig.show()
        return

    # The timestamp keeps successive exports from overwriting each other
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    file_name = "SérieTemporal_col[{col}]_{dt}.png".format(col=coluna, dt=timestamp)
    fig.write_image(pasta_resultados+file_name)
# ============================================================================================ #
def plot_cv(
        fh : int,
        df_merged : pd.DataFrame,
        df_resultado : pd.DataFrame,
        regressor : str,
        data_inicio : str,
        n_decimais : int,
        titulo_plot : str,
        pasta_dstn : str,
        salvar: bool = False,
) -> None:
    """
        Plot a cross-validation result: observed vs. predicted series on top and a
        table with the error metrics below.

        Parameters:
            fh: forecast horizon; only used to compose the output file name.

            df_merged: DataFrame with the columns "ds", "y" and `regressor`; only rows
                with "ds" >= `data_inicio` are drawn.

            df_resultado: DataFrame with the columns "y" and `regressor` from which
                SMAPE, RMSE, PBIAS and DRV are computed.

            regressor: name of the column that holds the predictions.

            data_inicio: lower bound (inclusive) applied to "ds" for the line chart.

            n_decimais: number of decimal places used to round the metrics.

            titulo_plot: title of the figure.

            pasta_dstn: destination folder for the PNG (created if missing).

            salvar: when True the figure is written to `pasta_dstn`; when False it is
                displayed on screen.
    """
    # Filter the plotting window once instead of re-evaluating the mask for every axis
    janela = df_merged[df_merged["ds"] >= data_inicio]

    observado = df_resultado["y"]
    previsto = df_resultado[regressor]
    metricas = [
        round(smape(observado, previsto), n_decimais),
        round(rmse(observado, previsto), n_decimais),
        round(percentage_bias(observado, previsto), n_decimais),
        round(deviation_runoff_volume(observado, previsto), n_decimais),
    ]

    # Row 1: line chart; row 2: metrics table
    fig = make_subplots(
        rows=2,
        cols=1,
        vertical_spacing=0.2,
        specs=[
            [{"type": "scatter"}],
            [{"type": "table"}]
        ],
    )

    fig.add_trace(
        go.Scatter(
            x=janela["ds"],
            y=janela["y"],
            mode="lines+markers",
            name="Observado",
            line=dict(
                color="#000000",
                width=2
            ),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=janela["ds"],
            y=janela[regressor],
            mode="lines+markers",
            name="Previsão",
            line=dict(color="red"),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Table(
            header=dict(
                values=[
                    "SMAPE",
                    "RMSE",
                    "PBIAS (%)",
                    "DRV",
                ],
                font=dict(size=18),
                align="center"
            ),
            cells=dict(
                values=metricas,
                font=dict(size=18),
                height=30,
                align="left",
            ),
        ),
        row=2,
        col=1,
    )

    fig.update_yaxes(
        title=dict(
            text="Vazão (m³/s)",
            font=dict(
                family="system-ui",
                size=22
            )
        ),
        zerolinecolor="black",
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        title=dict(
            text="Período",
            font=dict(
                family="system-ui",
                size=22
            )
        ),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode="x unified",
        plot_bgcolor="#c8d4e3",
        title=dict(
            text=titulo_plot,
            font=dict(
                family="system-ui",
                size=30
            )
        ),
    )

    if salvar:
        # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
        os.makedirs(pasta_dstn, exist_ok=True)

        now = datetime.now()
        fig.write_image(
            pasta_dstn+"/cv_{reg}_fh{fh}_{dt}.png".format(
                reg=regressor,
                fh=fh,
                dt=now.strftime("%Y-%m-%d_%H-%M-%S")
            )
        )
    else:
        fig.show()
# ============================================================================================ #
def decomp_series(
    df: pd.DataFrame,
    tendencia: bool,
    sazonalidade: bool,
    residuo: bool,
    salvar: bool = False,
) -> None:
    """
        Decompose every series of `df` and plot the selected components.

        Decomposing the time series helps to detect patterns (trend, seasonality) and
        other information useful to interpret what is going on.

        Parameters:
            df: DataFrame with the columns "ds" and "unique_id"; every other column is
                decomposed and plotted on its own figure.

            tendencia: include the trend component (secondary y axis).

            sazonalidade: include the seasonal component (secondary y axis).

            residuo: include the residual component (primary y axis).

            salvar: when True each figure is written as a PNG under
                `pasta_resultados` + "aed/"; when False it is displayed on screen.
    """
    axis_frame = dict(mirror=True, ticks="outside", showline=True, linecolor="black")

    for nome_serie in df.drop(columns=["ds", "unique_id"]).columns.to_list():

        # Additive model because some series contain zeros.
        # Period of 365 days (1 year) because the goal is to capture yearly behaviour.
        resultado = seasonal_decompose(
            df[nome_serie],
            period=365,
            model="add"
        )

        # (enabled?, values, trace name, drawn on the secondary y axis?)
        componentes = (
            (True, resultado.observed, "observado", False),
            (tendencia, resultado.trend, "tendência", True),
            (sazonalidade, resultado.seasonal, "sazonalidade", True),
            (residuo, resultado.resid, "resíduo", False),
        )

        figura = make_subplots(specs=[[{"secondary_y": True}]])

        for habilitado, valores, rotulo, eixo_secundario in componentes:
            if not habilitado:
                continue

            figura.add_trace(
                go.Scatter(
                    x=df.ds,
                    y=valores,
                    name=rotulo,
                    mode="lines",
                    showlegend=True,
                ),
                secondary_y=eixo_secundario,
            )

        figura.update_yaxes(
            title=dict(text="observado/resíduo", font=dict(family="system-ui", size=18)),
            secondary_y=False,
            zerolinecolor="black",
            **axis_frame,
        )

        figura.update_yaxes(
            title=dict(text="tendência/sazonalidade", font=dict(family="system-ui", size=18)),
            secondary_y=True,
            **axis_frame,
        )

        figura.update_xaxes(
            title=dict(text="Período", font=dict(family="system-ui", size=18)),
            **axis_frame,
        )

        figura.update_layout(
            width=1500,
            height=700,
            plot_bgcolor="#c8d4e3",
            hovermode="x unified",
            title=dict(
                text="Decomposição da série temporal: {col}".format(col=nome_serie),
                font=dict(family="system-ui", size=24),
            ),
        )

        if not salvar:
            figura.show()
            continue

        figura.write_image(
            pasta_resultados+"aed/decomposicao_serie_{}.png".format(nome_serie)
        )
# ============================================================================================ #
def estacionariedade(
    df: pd.DataFrame,
    sp: int
) -> None:
    """
        Report, for every series of `df`, whether it is stationary and, when it is,
        which candidate seasonal periods are significant.

        For each column other than "ds"/"unique_id" the function prints
        "<column> <stationary flag>" (StationarityADF) and, only for stationary
        series, "<column> <significant seasonal periods>" (SeasonalityACF).

        Parameters:
            df: DataFrame with the columns "ds" and "unique_id" plus one column per series.

            sp: candidate seasonal period forwarded as `candidate_sp` to
                SeasonalityACF (the original intent was to confirm a yearly,
                365-day, seasonality).
    """
    for c in df.drop(columns=["ds", "unique_id"]).columns.to_list():
        ts = df[c]

        sty_est = StationarityADF()
        sty_est.fit(ts)

        # Read the fitted parameters once and reuse the flag below
        estacionaria = sty_est.get_fitted_params()["stationary"]
        print(c, estacionaria)

        # The seasonality test must be applied to stationary series only.
        # A non-stationary series has to be differenced beforehand.
        if not estacionaria:
            continue

        sp_est = SeasonalityACF(
            candidate_sp=sp,
            nlags=len(ts)
        )
        sp_est.fit(ts)
        print(c, sp_est.get_fitted_params()["sp_significant"])
# ============================================================================================ #
def mapa_correlacao(
    df: pd.DataFrame,
    medida: str = "pearson",
    salvar: bool = False
) -> None:
    """
        Draw a heatmap comparing every pair of series in `df`.

        Parameters:
            df: DataFrame with the columns "ds" and "unique_id" plus one column per series.

            medida: measure shown in the heatmap
                "pearson" -> pairwise correlation from `DataFrame.corr()`
                "dtw"     -> pairwise matrix from `dtaidistance.dtw.distance_matrix_fast`
                             (NOTE(review): this looks like a distance, not a correlation,
                             although the figure title says "correlação" — confirm)

            salvar: when True the figure is written as a PNG under
                `pasta_resultados` + "aed/"; when False it is displayed on screen.

        Raises:
            ValueError: if `medida` is neither "dtw" nor "pearson".
    """
    # Validate up-front so no work is done for an unsupported option
    if medida not in ("dtw", "pearson"):
        raise ValueError("Opção errada. ('dtw' ou 'pearson')")

    series = df.drop(columns=["ds", "unique_id"])

    if medida == "dtw":
        # Imported lazily: only needed for this option
        from dtaidistance import dtw

        nomes = series.columns.to_list()
        matriz = pd.DataFrame(
            data=dtw.distance_matrix_fast(series.T.values),
            index=nomes,
            columns=nomes,
        )
        titulo = "Mapa de correlação (DTW)"
    else:
        matriz = series.corr()
        titulo = "Mapa de correlação (coeficiente de Pearson)"

    # The rendering is identical for both measures; only the matrix and title differ
    fig = go.Figure()

    fig.add_trace(
        go.Heatmap(
            x=matriz.columns,
            y=matriz.columns,
            z=matriz,
            text=matriz.values,
            texttemplate="%{text:.7f}",
            textfont={"size": 14},
            colorscale="rainbow",
            hovertemplate="%{y}<br>%{x}</br><extra></extra>",
        )
    )

    fig.update_yaxes(
        tickfont=dict(family="system-ui", size=14),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        tickfont=dict(family="system-ui", size=14),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_layout(
        width=1500,
        height=700,
        title=dict(
            text=titulo,
            font=dict(family="system-ui", size=24),
        ),
    )

    if salvar:
        fig.write_image(
            pasta_resultados+"aed/mapa_correlacao_{medida}.png".format(medida=medida)
        )
    else:
        fig.show()
# ============================================================================================ #
def plot_linha_tabela(
    df_merged: pd.DataFrame,
    regressor: str,
    plot_title: str,
    line_color: str,
    short_name: str,
    salvar: bool = False,
) -> None:
    """
        Plot observed vs. predicted values (top) with a table of error metrics (bottom).

        Parameters:
            df_merged: DataFrame with the columns "ds", "y" (observed) and `regressor`
                (predicted).

            regressor: name of the column that holds the predictions; also used in the
                output file name.

            plot_title: title of the figure.

            line_color: colour of the prediction line.

            short_name: name given to the prediction trace (legend text).

            salvar: when True the figure is written as a timestamped PNG under
                `pasta_resultados`; when False it is displayed on screen.
    """
    observed = df_merged.y
    predicted = df_merged[regressor]

    # Metrics shown in the table, in header order (not rounded)
    metric_names = ["SMAPE", "RMSE", "PBIAS (%)", "DRV"]
    metric_values = [
        smape(observed, predicted),
        rmse(observed, predicted),
        percentage_bias(observed, predicted),
        deviation_runoff_volume(observed, predicted),
    ]

    axis_frame = dict(mirror=True, ticks="outside", showline=True, linecolor="black")

    fig = make_subplots(
        rows=2,
        cols=1,
        vertical_spacing=0.2,
        specs=[[{"type": "scatter"}], [{"type": "table"}]],
    )

    observed_trace = go.Scatter(
        x=df_merged["ds"],
        y=df_merged["y"],
        mode="lines+markers",
        name="observado",
        line=dict(color="#000000", width=2),
    )
    fig.add_trace(observed_trace, row=1, col=1)

    predicted_trace = go.Scatter(
        x=df_merged["ds"],
        y=df_merged[regressor],
        mode="lines+markers",
        name=short_name,
        line=dict(color=line_color),
    )
    fig.add_trace(predicted_trace, row=1, col=1)

    metrics_table = go.Table(
        header=dict(values=metric_names, font=dict(size=14), align="center"),
        cells=dict(values=metric_values, font=dict(size=12), height=30, align="left"),
    )
    fig.add_trace(metrics_table, row=2, col=1)

    fig.update_yaxes(
        title=dict(text="Vazão (m³/s)", font=dict(family="system-ui", size=18)),
        zerolinecolor="black",
        **axis_frame,
    )

    fig.update_xaxes(
        title=dict(text="Período", font=dict(family="system-ui", size=18)),
        **axis_frame,
    )

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode="x unified",
        plot_bgcolor="#c8d4e3",
        title=dict(text=plot_title, font=dict(family="system-ui", size=24)),
    )

    if not salvar:
        fig.show()
        return

    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    fig.write_image(
        pasta_resultados+"{reg}_{dt}.png".format(reg=regressor, dt=timestamp)
    )
# ============================================================================================ #
def cria_plot_correlacao(
    serie: pd.Series,
    n_lags: int,
    plot_pacf: bool = False,
    salvar: bool = False
) -> None:
    """
        Draw the autocorrelation (ACF) or partial autocorrelation (PACF) plot of a series.

        Parameters:
            serie: the time series; missing values are dropped before the computation.

            n_lags: number of lags to compute and display.

            plot_pacf: True plots the PACF, False plots the ACF.

            salvar: when True the figure is written under `pasta_resultados` + "aed/"
                ("plot_pacf.png" or "plot_acf.png"); when False it is displayed on screen.
    """
    estimador = pacf if plot_pacf else acf

    # With alpha set, the result holds (coefficients, confidence intervals)
    corr_array = estimador(serie.dropna(), nlags=n_lags, alpha=0.05)
    coeficientes = corr_array[0]
    intervalos = corr_array[1]

    # Centre the confidence band around zero by subtracting the coefficients
    lower_y = intervalos[:, 0] - coeficientes
    upper_y = intervalos[:, 1] - coeficientes

    posicoes = np.arange(len(coeficientes))

    fig = go.Figure()

    # Black vertical stems, one per lag
    for lag, valor in enumerate(coeficientes):
        fig.add_scatter(
            x=(lag, lag),
            y=(0, valor),
            mode="lines",
            line_color="black",
            hovertemplate="<extra></extra>",
        )

    # Red markers at the coefficient values
    fig.add_scatter(
        x=posicoes,
        y=coeficientes,
        mode="markers",
        marker_color="red",
        marker_size=12,
        hovertemplate="x = %{x}<br>y = %{y}<extra></extra>",
    )

    # Upper edge of the light confidence "cloud" (invisible line)
    fig.add_scatter(
        x=posicoes,
        y=upper_y,
        mode="lines",
        line_color="rgba(255,255,255,0)",
        hovertemplate="<extra></extra>",
    )

    # Lower edge of the "cloud", filled up to the previous trace
    fig.add_scatter(
        x=posicoes,
        y=lower_y,
        mode="lines",
        fillcolor="rgba(32, 146, 230,0.3)",
        fill="tonexty",
        line_color="rgba(255,255,255,0)",
        hovertemplate="<extra></extra>",
    )

    fig.update_traces(showlegend=False)

    fig.update_xaxes(
        range=[-1, n_lags + 1],
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_yaxes(
        zerolinecolor="black",  # The line at 'y=0' is black
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    if plot_pacf:
        title = "Autocorrelação Parcial (PACF) para n_lags={n}".format(n=n_lags)
    else:
        title = "Autocorrelação (ACF) para n_lags={n}".format(n=n_lags)

    fig.update_layout(
        width=1500,
        height=700,
        plot_bgcolor="#c8d4e3",
        title=dict(text=title, font=dict(family="system-ui", size=24)),
    )

    if not salvar:
        fig.show()
    elif plot_pacf:
        fig.write_image(pasta_resultados+"aed/plot_pacf.png")
    else:
        fig.write_image(pasta_resultados+"aed/plot_acf.png")
# ============================================================================================ #
def cria_dataframe_futuro(
    df_futr: pd.DataFrame,
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    tp_valor: str,
    n_lags: int,
    date_features: list,
    cols: list,
) -> pd.DataFrame:

    """
        Build the "future" DataFrame: fill the auxiliary columns `cols` over the forecast
        horizon and attach the remaining columns of `df_test`.

        Parameters:
            df_futr: DataFrame with (at least) "ds" and "unique_id" for the horizon.
                NOTE: the filled `cols` are also written into this object in place.

            df_train: training data; source of the last value / mean / model fit.

            df_test: test data; every column except `cols` and "y" is merged into the
                result on ["ds", "unique_id"].

            tp_valor: how the future values of `cols` are produced
                "ultimo" -> repeats the last known (training) value
                "media"  -> repeats the training mean of each column
                "ml"     -> forecasts each column with an XGBoost model (MLForecast)

            n_lags: number of lags (1..n_lags) used as features in the "ml" option.

            date_features: date features forwarded to MLForecast in the "ml" option.

            cols: names of the auxiliary columns to fill (any iterable of names).

        Returns:
            The merged DataFrame (left join of `df_futr` with `df_test`).

        Raises:
            ValueError: if `tp_valor` is not one of "ultimo", "media" or "ml".
    """
    # Materialise once so tuples / pandas Index objects work as well as lists
    # (the `cols + ["y"]` concatenation below requires a real list).
    cols = list(cols)

    if tp_valor == "ultimo":  # Use the last known value
        for c in cols:
            df_futr[c] = df_train[c].iat[-1]

    elif tp_valor == "media":  # Use the mean value of each column
        for c in cols:
            df_futr[c] = df_train[c].mean()

    elif tp_valor == "ml":
        from xgboost import XGBRegressor

        for c in cols:
            fcst = mlf.MLForecast(
                models=XGBRegressor(seed=SEED),
                freq="D",
                lags=[i + 1 for i in range(n_lags)],
                date_features=date_features,
            )

            df_temp = df_train[["ds", "unique_id", c]]

            fcst.fit(
                df_temp,
                id_col="unique_id",
                time_col="ds",
                target_col=c,
                static_features=[],
            )

            df_preds = fcst.predict(h=len(df_futr))

            # Assign positionally: a Series assignment aligns on the index and would
            # silently produce NaN whenever df_futr is not indexed 0..h-1.
            # Assumes a single series, i.e. exactly len(df_futr) predictions — TODO confirm.
            df_futr[c] = df_preds["XGBRegressor"].to_numpy()

    else:
        raise ValueError("Opção inválida! (ultimo | media | ml)")

    df_futr = pd.merge(
        left=df_futr,
        right=df_test.drop(columns=cols + ["y"]),
        on=["ds", "unique_id"],
        how="left",
    )

    return df_futr
# ============================================================================================ #
def distribuicao_dados(
    df_original: pd.DataFrame,
    df_media: pd.DataFrame,
    df_knn: pd.DataFrame,
    salvar: bool = False,
) -> None:
    """
        Compare, column by column, the distribution of three versions of the data with box plots.

        Parameters:
            df_original: original data (box named "original"); its columns, other than
                "ds" and "unique_id", define which figures are produced.

            df_media: version shown in the box named "média".

            df_knn: version shown in the box named "kNN".

            salvar: when True each figure is written as a PNG under
                `pasta_resultados` + "aed/"; when False it is displayed on screen.
    """
    # (data, box name, box colour) — drawn in this order
    variantes = (
        (df_original, "original", "darkblue"),
        (df_media, "média", "coral"),
        (df_knn, "kNN", "olive"),
    )

    axis_frame = dict(mirror=True, ticks="outside", showline=True, linecolor="black")

    nomes_colunas = np.asarray(df_original.drop(columns=["ds", "unique_id"]).columns)

    for coluna in nomes_colunas:
        fig = go.Figure()

        for dados, rotulo, cor in variantes:
            fig.add_trace(
                go.Box(
                    y=dados[coluna].values,
                    name=rotulo,
                    marker_color=cor,
                    jitter=0.5,
                    pointpos=-2,
                    boxpoints="all",
                    boxmean="sd",
                )
            )

        fig.update_xaxes(**axis_frame)
        fig.update_yaxes(zerolinecolor="black", **axis_frame)

        fig.update_layout(
            width=1500,
            height=1000,
            plot_bgcolor="#c8d4e3",
            title=dict(
                text="Distribuição {c}".format(c=coluna),
                font=dict(family="system-ui", size=24),
            ),
        )

        if not salvar:
            fig.show()
            continue

        fig.write_image(
            pasta_resultados+"aed/distribuicao_dados_{}.png".format(coluna)
        )
# ============================================================================================ #
def exportar_dict_json(
    v_dict: dict,
    pasta: str,
    nome_arq: str
) -> None:
    """Write ``v_dict`` as indented JSON to the file ``nome_arq`` inside ``pasta``.

    Args:
        v_dict: JSON-serializable dictionary to export.
        pasta: Destination folder, with or without a trailing separator. It is
            created (including parents) when missing. An empty string means
            the current working directory.
        nome_arq: File name (with extension) to create or overwrite.

    Raises:
        TypeError: If ``v_dict`` contains values ``json.dumps`` cannot serialize.
    """
    # Serialize before touching the filesystem so a failure here neither
    # creates the folder nor truncates an existing file.
    json_str = json.dumps(v_dict, indent=4)

    # os.makedirs("") raises, so only create the folder when one was given.
    if pasta:
        os.makedirs(pasta, exist_ok=True)

    # os.path.join keeps the file inside `pasta` even when the caller omits
    # the trailing slash (plain concatenation would write a sibling file).
    with open(os.path.join(pasta, nome_arq), "w", encoding="utf-8") as arq:
        arq.write(json_str)
# ============================================================================================ #
def plot_divisao_treino_teste(
    df_treino: pd.DataFrame,
    df_teste: pd.DataFrame,
    col_data: str = "ds",
    col_plot: str = "y",
    salvar: bool = False,
) -> None:
    """Plot the train and test partitions of one column as two line traces.

    Args:
        df_treino: Training partition.
        df_teste: Test partition.
        col_data: Column used on the x axis.
        col_plot: Column used on the y axis.
        salvar: When True the figure is written as a PNG under
            ``pasta_resultados + "aed/"`` (``pasta_resultados`` is a
            module-level name defined elsewhere in the notebook); otherwise
            it is rendered with ``fig.show()``.
    """
    fig = go.Figure()

    # One line trace per partition, train first so it keeps the first colour.
    for rotulo, particao in (("treino", df_treino), ("teste", df_teste)):
        fig.add_trace(
            go.Scatter(
                x=particao[col_data],
                y=particao[col_plot],
                mode="lines",
                name=rotulo,
            )
        )

    # Frame styling shared by both axes.
    moldura = dict(mirror=True, ticks="outside", showline=True, linecolor="black")

    fig.update_yaxes(
        title=dict(
            text="Vazão (m³/s) / Precipitação (mm/dia)",
            font=dict(family="system-ui", size=18),
        ),
        zerolinecolor="black",
        **moldura,
    )

    fig.update_xaxes(
        title=dict(text="Período", font=dict(family="system-ui", size=18)),
        **moldura,
    )

    fig.update_layout(
        width=1500,
        height=700,
        hovermode="x unified",
        plot_bgcolor="#c8d4e3",
        title=dict(
            text="Vazão 'y' (target)",
            font=dict(family="system-ui", size=24),
        ),
    )

    if not salvar:
        fig.show()
        return

    fig.write_image(
        pasta_resultados+"aed/divisao_treino_teste_{c}.png".format(c=col_plot)
    )
# ============================================================================================ #
def _metricas_previsao(y_true, y_pred, metricas: str) -> dict:
    """Compute the three summary metrics of one prediction column.

    ``metricas == "padrao"`` reports RMSE as the second metric; any other value
    (the entry point only lets "hidrologia" through) reports the non-parametric
    Kling-Gupta efficiency instead.
    """
    # Only the selected branch is evaluated, so the unused metric function is
    # never looked up.
    nome, funcao = (
        ("RMSE", rmse)
        if metricas == "padrao"
        else ("KGEnp", kling_gupta_efficiency_non_parametric)
    )

    return {
        "SMAPE": smape(y_true, y_pred),
        nome: funcao(y_true, y_pred),
        "PBIAS (%)": percentage_bias(y_true, y_pred),
    }


def plot_resultados(
    df_merged: pd.DataFrame,
    modelo: str,
    nome_curto: str,
    fh: int,
    titulo: str,
    pasta_dstn: str,
    niveis: list = None,
    cores: list = None,
    salvar: bool = False,
    n_decimal: int = 5,
    metricas: str = "padrao",
    marcadores: bool = True
) -> None:
    """Plot forecasts against observations with a metrics table underneath.

    Args:
        df_merged: DataFrame holding the columns "ds", "y" (observed),
            ``modelo`` (point forecast) and, when ``niveis`` is given, the
            bound columns ``<modelo>-lo-<nivel>`` / ``<modelo>-hi-<nivel>``.
        modelo: Name of the forecast column in ``df_merged``; also used in the
            output file name.
        nome_curto: Short model name shown in the legend.
        fh: Forecast horizon; only used to build the output file name.
        titulo: Figure title.
        pasta_dstn: Folder where the image is saved (created when missing).
        niveis: Interval levels as strings (they are concatenated into column
            names). The original author asks for them ordered from the
            largest to the smallest level.
        cores: Line colours paired with ``niveis`` by position; bounds are only
            drawn when both ``niveis`` and ``cores`` are given.
        salvar: Write a PNG to ``pasta_dstn`` when True, otherwise render the
            figure with ``fig.show()``.
        n_decimal: Decimal places used to round the metrics table.
        metricas: "padrao" (SMAPE, RMSE, PBIAS) or "hidrologia"
            (SMAPE, KGEnp, PBIAS).
        marcadores: Draw lines with markers when True, plain lines otherwise.

    Raises:
        ValueError: If ``metricas`` is neither "padrao" nor "hidrologia".
    """
    if metricas not in ("padrao", "hidrologia"):
        raise ValueError("Opção inválida para métrica! ('padrao' | 'hidrologia')")

    # Table row order: point forecast first, then lo/hi for each level.
    colunas = [modelo]
    if niveis is not None:
        for n in niveis:
            colunas.append(modelo+"-lo-"+n)
            colunas.append(modelo+"-hi-"+n)

    mtrcs = {
        col: _metricas_previsao(df_merged["y"], df_merged[col], metricas)
        for col in colunas
    }

    # One row per evaluated column, rounded to `n_decimal` decimal places.
    df_tbl = pd.DataFrame(mtrcs).T.reset_index(names="Modelo").round(n_decimal)

    modo = "lines+markers" if marcadores else "lines"

    fig = make_subplots(
        rows=2,
        cols=1,
        vertical_spacing=0.2,
        specs=[
            [{"type": "scatter"}],
            [{"type": "table"}]
        ],
    )

    if niveis is not None and cores is not None:
        for n, c in zip(niveis, cores):
            fig.add_trace(
                go.Scatter(
                    x=df_merged["ds"],
                    y=df_merged[modelo+"-hi-"+n],
                    mode=modo,
                    name=nome_curto+"-hi-"+n,
                    line=dict(color=c),
                ),
                row=1,
                col=1,
            )

            # "tonexty" fills the area between this lower bound and the upper
            # bound added just before it, so the hi trace must come first.
            fig.add_trace(
                go.Scatter(
                    x=df_merged["ds"],
                    y=df_merged[modelo+"-lo-"+n],
                    mode=modo,
                    name=nome_curto+"-lo-"+n,
                    fill="tonexty",
                    line=dict(color=c),
                ),
                row=1,
                col=1,
            )

    fig.add_trace(
        go.Scatter(
            x=df_merged["ds"],
            y=df_merged[modelo],
            mode=modo,
            name=nome_curto,
            line=dict(
                color="magenta",
                width=4
            ),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df_merged["ds"],
            y=df_merged["y"],
            mode=modo,
            name="observado",
            line=dict(
                color="black",
                width=2
            ),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Table(
            header=dict(
                values=df_tbl.columns.to_list(),
                font=dict(size=18),
                align="center"
            ),
            cells=dict(
                values=df_tbl.T,
                font=dict(size=18),
                height=30,
                align="left"
            ),
        ),
        row=2,
        col=1,
    )

    fig.update_yaxes(
        title=dict(
            text="Vazão (m³/s)",
            font=dict(
                family="system-ui",
                size=22
            )
        ),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        title=dict(
            text="Período",
            font=dict(
                family="system-ui",
                size=22
            )
        ),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_traces(
        hovertemplate=None,
        row=1,
        col=1
    )

    fig.update_layout(
        width=1500,
        height=1000,
        plot_bgcolor="#c8d4e3",
        hovermode="x unified",
        title=dict(
            text=titulo,
            font=dict(
                family="system-ui",
                size=30
            ),
        ),
    )

    if salvar:
        os.makedirs(pasta_dstn, exist_ok=True)

        # Timestamp in the file name keeps repeated runs from overwriting each other.
        now = datetime.now()
        fig.write_image(
            pasta_dstn+"/{md}_fh{fh}_{dt}.png".format(
                fh=fh,
                md=modelo,
                dt=now.strftime("%Y-%m-%d_%H-%M-%S")
            )
        )
    else:
        fig.show()
# ============================================================================================ #
def gerar_atributos_data(
    df : pd.DataFrame,
    atributos : list,
    col_data : str = "ds",
) -> pd.DataFrame:
    """Return a copy of ``df`` with one new column per requested date attribute.

    Args:
        df: Input frame; it is not modified.
        atributos: Names of ``Series.dt`` attributes to extract (e.g. "month",
            "dayofyear"). "week" and "weekofyear" both yield the ISO week
            number. ``None`` or an empty list returns a plain copy.
        col_data: Column holding the dates; it is converted with
            ``pd.to_datetime`` before the attributes are read.

    Returns:
        The copied frame with the extra columns, named after the attributes.
    """
    df_result = df.copy()
    if atributos is None or len(atributos) == 0:
        return df_result

    # Parse the date column once instead of once per attribute.
    datas = pd.to_datetime(df_result[col_data])

    for a in atributos:
        if a in ("week", "weekofyear"):
            # isocalendar() only exposes year/week/day, so both aliases must
            # read its "week" column ("weekofyear" does not exist there).
            df_result[a] = datas.dt.isocalendar().week
        else:
            df_result[a] = getattr(datas.dt, a)

    return df_result
# ============================================================================================ #
# def plot_feature_importance(
#     model: str,
#     forecaster: mlf.MLForecast,
#     fch: str = "",
#     salvar: bool = False
# ) -> None:

#     if model in ["LinearRegression", "LinearSVR"]:
#         fig = go.Figure(
#             data=[
#                 go.Bar(
#                     x=forecaster.ts.features_order_,
#                     y=forecaster.models_[model].coef_,
#                     showlegend=False,
#                 )
#             ]
#         )
#     elif model == "LGBMRegressor":
#         fig = go.Figure(
#             data=[
#                 go.Bar(
#                     x=forecaster.ts.features_order_,
#                     y=forecaster.models_[model].feature_importances_,
#                     showlegend=False,
#                 )
#             ]
#         )
#     else:
#         raise Exception("Esta opção não existe.")

#     fig.update_yaxes(
#         title=dict(text="Pesos/Valores", font=dict(family="system-ui", size=18)),
#         zerolinecolor="black",
#         mirror=True,
#         ticks="outside",
#         showline=True,
#         linecolor="black",
#     )

#     fig.update_xaxes(
#         title=dict(text="Feature", font=dict(family="system-ui", size=18)),
#         mirror=True,
#         ticks="outside",
#         showline=True,
#         linecolor="black",
#     )

#     fig.update_layout(
#         width=1500,
#         height=700,
#         plot_bgcolor="#c8d4e3",
#         title=dict(
#             text="Feature importance {m} (fch={fch})".format(m=model, fch=fch),
#             font=dict(family="system-ui", size=24),
#         ),
#     )

#     if salvar:
#         now = datetime.now()
#         fig.write_image(
#             pasta_resultados+"feature_importance/feature_importance_{m}_fch{fch}_{dt}.png".format(
#                 m=model,
#                 fch=fch,
#                 dt=now.strftime("%Y-%m-%d_%H-%M-%S")
#             )
#         )
#     else:
#         fig.show()
# ============================================================================================ #

# Carregando e imputando dados

In [14]:
# Load the first sheet of the final workbook: the first row is the header,
# the first column becomes the index and the "Data" column is parsed as dates.
df = pd.read_excel(
    io="./arquivos_finais/alto_rio_doce_final.xlsx",
    sheet_name=0,
    index_col=0,
    header=0,
    parse_dates=["Data"],
)

In [15]:
# Column order used from here on (purely for readability): the target first,
# then the remaining t_cv_* and t_vz_* series.
_ordem_colunas = [
    "c_vz_56425000", # target y
    "t_cv_56425000",
    "t_cv_56338500",
    "t_cv_56338080",
    "t_cv_56110005",
    "t_cv_56337200",
    "t_cv_56337500",
    "t_vz_56338500",
    "t_vz_56110005",
    "t_vz_56337200",
    "t_vz_56337500",
]

# Shape the frame into the layout the author notes MLForecast requires:
# a constant "unique_id", the date index exposed as column "ds" and the
# target renamed to "y".
df = (
    df[_ordem_colunas]
    .assign(unique_id=1)
    .reset_index()
    .rename(columns={"Data": "ds", "c_vz_56425000": "y"})
)

df

Unnamed: 0,ds,y,t_cv_56425000,t_cv_56338500,t_cv_56338080,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500,unique_id
0,2013-01-01,82.787100,,,,0.0,,,,60.056100,,,1
1,2013-01-02,80.489300,,,,0.0,,,,46.950000,,,1
2,2013-01-03,78.214200,,,,0.0,,,,46.703125,,,1
3,2013-01-04,79.348900,,,,0.0,,,,49.239583,,,1
4,2013-01-05,129.161000,,,,0.0,,,,49.780208,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,88.370937,8.2,0.2,0.0,0.0,2.2,8.6,99.451667,70.568804,0.610000,28.053750,1
4013,2023-12-28,87.519375,2.0,1.2,0.0,0.0,0.0,0.8,93.551667,60.014457,33.753333,21.705417,1
4014,2023-12-29,76.317813,0.0,0.0,0.0,0.0,0.0,0.0,77.745652,49.910909,61.942917,21.022917,1
4015,2023-12-30,67.118542,14.8,3.4,0.0,3.0,4.8,19.8,70.258261,47.746848,58.080000,22.252500,1


In [16]:
# Percentage of missing values per data column ("ds" and "unique_id" excluded).

print(df.drop(columns=["ds", "unique_id"]).isna().sum().mul(100).div(len(df)))

y                 0.000000
t_cv_56425000    40.801593
t_cv_56338500    34.478467
t_cv_56338080    38.411750
t_cv_56110005     0.000000
t_cv_56337200    34.478467
t_cv_56337500    34.478467
t_vz_56338500    35.648494
t_vz_56110005     0.074683
t_vz_56337200    35.822753
t_vz_56337500    48.095594
dtype: float64


## Preenchendo com a média

In [17]:
# Fill every gap with its column mean. numeric_only=True keeps the datetime
# column "ds" out of the reduction, whose handling varies across pandas versions.
df_media = df.fillna(df.mean(numeric_only=True))
df_media

Unnamed: 0,ds,y,t_cv_56425000,t_cv_56338500,t_cv_56338080,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500,unique_id
0,2013-01-01,82.787100,3.193061,2.621505,0.037914,0.0,2.482675,2.246239,114.113770,60.056100,72.465690,29.862489,1
1,2013-01-02,80.489300,3.193061,2.621505,0.037914,0.0,2.482675,2.246239,114.113770,46.950000,72.465690,29.862489,1
2,2013-01-03,78.214200,3.193061,2.621505,0.037914,0.0,2.482675,2.246239,114.113770,46.703125,72.465690,29.862489,1
3,2013-01-04,79.348900,3.193061,2.621505,0.037914,0.0,2.482675,2.246239,114.113770,49.239583,72.465690,29.862489,1
4,2013-01-05,129.161000,3.193061,2.621505,0.037914,0.0,2.482675,2.246239,114.113770,49.780208,72.465690,29.862489,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,88.370937,8.200000,0.200000,0.000000,0.0,2.200000,8.600000,99.451667,70.568804,0.610000,28.053750,1
4013,2023-12-28,87.519375,2.000000,1.200000,0.000000,0.0,0.000000,0.800000,93.551667,60.014457,33.753333,21.705417,1
4014,2023-12-29,76.317813,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,77.745652,49.910909,61.942917,21.022917,1
4015,2023-12-30,67.118542,14.800000,3.400000,0.000000,3.0,4.800000,19.800000,70.258261,47.746848,58.080000,22.252500,1


## Preenchendo com o KNNImputer

In [18]:
# Scaling before KNNImputer is commonly recommended, but the author's tests
# showed no difference in the results, so no scaling is applied to keep the
# pipeline simpler.

imputer = KNNImputer(
    n_neighbors=14,
    weights="distance"
)

# Only the data columns are imputed; "ds" and "unique_id" are re-attached below.
_df_numerico = df.drop(columns=["ds", "unique_id"])

# Reuse df's index so the concat below aligns row by row regardless of how
# the original frame is indexed.
df_knn = pd.DataFrame(
    data=imputer.fit_transform(_df_numerico),
    columns=_df_numerico.columns,
    index=df.index,
)

df_knn = pd.concat(
    [df[["ds", "unique_id"]], df_knn],
    axis=1
)

df_knn

Unnamed: 0,ds,unique_id,y,t_cv_56425000,t_cv_56338500,t_cv_56338080,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
0,2013-01-01,1,82.787100,0.288835,0.429300,0.0,0.0,1.103340,0.529025,80.179006,60.056100,52.139321,22.604185
1,2013-01-02,1,80.489300,0.313813,0.389303,0.0,0.0,0.206381,0.254328,78.548218,46.950000,38.726858,34.659737
2,2013-01-03,1,78.214200,0.387696,0.565639,0.0,0.0,0.210559,0.378253,78.240361,46.703125,38.830093,42.538983
3,2013-01-04,1,79.348900,0.289002,0.324881,0.0,0.0,0.071081,0.220642,77.485572,49.239583,47.964186,23.318587
4,2013-01-05,1,129.161000,4.115313,0.696857,0.0,0.0,1.161379,2.309821,104.090940,49.780208,64.301148,30.217296
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,1,88.370937,8.200000,0.200000,0.0,0.0,2.200000,8.600000,99.451667,70.568804,0.610000,28.053750
4013,2023-12-28,1,87.519375,2.000000,1.200000,0.0,0.0,0.000000,0.800000,93.551667,60.014457,33.753333,21.705417
4014,2023-12-29,1,76.317813,0.000000,0.000000,0.0,0.0,0.000000,0.000000,77.745652,49.910909,61.942917,21.022917
4015,2023-12-30,1,67.118542,14.800000,3.400000,0.0,3.0,4.800000,19.800000,70.258261,47.746848,58.080000,22.252500


In [19]:
# Percentage of exact zeros in each upstream flow column after imputation.

cols = [
    "t_vz_56338500",
    "t_vz_56110005",
    "t_vz_56337200",
    "t_vz_56337500",
]

for c in cols:
    print(c, 100 * df_knn[c].eq(0).sum() / len(df_knn))

# A flow station with 4.5% zeros? Did the river dry up??

t_vz_56338500 0.0
t_vz_56110005 0.0
t_vz_56337200 0.024894199651481205
t_vz_56337500 4.5556385362210605


## Distribuição comparada

In [20]:
# Compare, column by column, the original data with the mean-filled and the
# KNN-imputed versions.
distribuicao_dados(
    df_original=df,
    df_media=df_media,
    df_knn=df_knn,
    salvar=SALVAR_PLOTS
)

In [21]:
# This column is too poor to keep, so it is dropped:
# most of its values sit right at 0.
df_knn = df_knn.drop(columns=["t_cv_56338080"])

## Separando dados para 'X' e 'y'

Não sei se vai ser necessário usá-los, mas já deixo aqui pra caso precise

In [22]:
# Check which columns remain after dropping "t_cv_56338080".
df_knn.columns

Index(['ds', 'unique_id', 'y', 't_cv_56425000', 't_cv_56338500',
       't_cv_56110005', 't_cv_56337200', 't_cv_56337500', 't_vz_56338500',
       't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
      dtype='object')

In [23]:
# Features keep every column except the target; the target frame keeps only
# the identifier columns plus "y".
df_X = df_knn.drop(columns="y")
df_y = df_knn.loc[:, ["ds", "y", "unique_id"]]

In [24]:
# Inspect the feature frame (all columns except the target "y").
df_X

Unnamed: 0,ds,unique_id,t_cv_56425000,t_cv_56338500,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
0,2013-01-01,1,0.288835,0.429300,0.0,1.103340,0.529025,80.179006,60.056100,52.139321,22.604185
1,2013-01-02,1,0.313813,0.389303,0.0,0.206381,0.254328,78.548218,46.950000,38.726858,34.659737
2,2013-01-03,1,0.387696,0.565639,0.0,0.210559,0.378253,78.240361,46.703125,38.830093,42.538983
3,2013-01-04,1,0.289002,0.324881,0.0,0.071081,0.220642,77.485572,49.239583,47.964186,23.318587
4,2013-01-05,1,4.115313,0.696857,0.0,1.161379,2.309821,104.090940,49.780208,64.301148,30.217296
...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,1,8.200000,0.200000,0.0,2.200000,8.600000,99.451667,70.568804,0.610000,28.053750
4013,2023-12-28,1,2.000000,1.200000,0.0,0.000000,0.800000,93.551667,60.014457,33.753333,21.705417
4014,2023-12-29,1,0.000000,0.000000,0.0,0.000000,0.000000,77.745652,49.910909,61.942917,21.022917
4015,2023-12-30,1,14.800000,3.400000,3.0,4.800000,19.800000,70.258261,47.746848,58.080000,22.252500


In [25]:
# Inspect the target frame ("ds", "y", "unique_id").
df_y

Unnamed: 0,ds,y,unique_id
0,2013-01-01,82.787100,1
1,2013-01-02,80.489300,1
2,2013-01-03,78.214200,1
3,2013-01-04,79.348900,1
4,2013-01-05,129.161000,1
...,...,...,...
4012,2023-12-27,88.370937,1
4013,2023-12-28,87.519375,1
4014,2023-12-29,76.317813,1
4015,2023-12-30,67.118542,1


In [26]:
# from tsfresh.feature_extraction.feature_calculators import number_peaks

# number_peaks(df_y.drop(columns=['ds', 'unique_id']), 7)

# Análise exploratória dos dados

## Séries Temporais

In [27]:
# Plot the complete target series "y" taken from the KNN-imputed frame.
plot_serie_temporal(
    dataset=df_knn,
    coluna="y",
    tp_coluna=True,
    plot_title="Série Temporal completa: y",
    line_color="darkred",
    short_name="y",
    salvar=SALVAR_PLOTS
)

## Decomposição das Séries Temporais

A decomposição das séries temporais ajuda a detectar padrões (tendência, sazonalidade) e identificar outras informações que podem ajudar na interpretação do que está acontecendo.

A decomposição é executada sobre o DataFrame "df_knn", ou seja, sobre os dados carregados do arquivo Excel após a imputação dos valores faltantes com o KNNImputer.

In [28]:
# Decompose the KNN-imputed series; only the trend flag is enabled here.
decomp_series(
    df=df_knn,
    tendencia=True,
    sazonalidade=False,
    residuo=False,
    salvar=SALVAR_PLOTS
)

## Estacionariedade

In [29]:
# Run the stationarity check on every series of the KNN-imputed frame.
# NOTE(review): sp=365 is presumably the candidate seasonal period in days —
# confirm against the definition of `estacionariedade`.
estacionariedade(
    df=df_knn,
    sp=365
)

y True
y [365]
t_cv_56425000 True
t_cv_56425000 [365]
t_cv_56338500 True
t_cv_56338500 [365]
t_cv_56110005 True
t_cv_56110005 [365]
t_cv_56337200 True
t_cv_56337200 [365]
t_cv_56337500 True
t_cv_56337500 [365]
t_vz_56338500 True
t_vz_56338500 [365]
t_vz_56110005 True
t_vz_56110005 [365]
t_vz_56337200 True
t_vz_56337200 [365]
t_vz_56337500 True
t_vz_56337500 []


A série 't_vz_56337500' é estacionária, contudo, na lag 365 ela não apresenta sazonalidade.

## Correlação entre as séries

In [30]:
# Correlation map between the series, requesting the "pearson" measure.
mapa_correlacao(
    df=df_knn,
    medida="pearson",
    salvar=SALVAR_PLOTS
)

In [31]:
# Usando o sweetviz para avaliar
# import sweetviz as sv
# analyze_report = sv.analyze(df_knn)
# analyze_report.show_html('analyze.html', open_browser=True)

# Apresentando os resultados (serve apenas para usar no Google Colab)
# import IPython
# IPython.display.HTML('analyze.html')

In [32]:
# Work on a copy from here on: if the initial DataFrame is needed again,
# the file does not have to be reloaded.
df_aux = df_knn.copy()

In [33]:
# mapa_correlacao(
#     df=df_aux,
#     salvar=SALVAR_PLOTS
# )

## Análise de Autocorrelação

In [34]:
# The goal is to find the seasonality of the target variable, the streamflow.
cria_plot_correlacao(
    serie=df_aux.y,
    n_lags=500,
    plot_pacf=False,
    salvar=SALVAR_PLOTS
)

# At lag 365 the curve starts going down again.
# That points to a seasonality of 365 days for the series.

In [35]:
# vazoes = ['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500']
# chuvas = ['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500']
cria_plot_correlacao(
    serie=df_aux["y"],
    n_lags=15,
    plot_pacf=True,
    salvar=SALVAR_PLOTS
)

# After 2 lags there is a clear drop in the plot.
# That hints at how many lags to use in "look_back".
# If an ARIMA-like model were used, it would be AR(2).

## Relação entre as variáveis

In [36]:
salvar = SALVAR_PLOTS

vazoes = [
    "t_vz_56338500",
    "t_vz_56110005",
    "t_vz_56337200",
    "t_vz_56337500"
]

chuvas = [
    "t_cv_56425000",
    "t_cv_56338500",
    "t_cv_56110005",
    "t_cv_56337200",
    "t_cv_56337500",
]

# One scatter plot of the target "y" against each explanatory series:
# flow columns in blue, rainfall columns in green.
_pares_coluna_cor = [(v, "blue") for v in vazoes] + [(c, "green") for c in chuvas]

for coluna, cor in _pares_coluna_cor:
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df_aux[coluna],
            y=df_aux["y"],
            mode="markers",
            line=dict(color=cor),
            hovertemplate="eixo_x: %{x}<br>eixo_y: %{y}</br><extra></extra>",
            showlegend=False,
        )
    )

    fig.update_xaxes(
        title=dict(
            text=df_aux[coluna].name,
            font=dict(family="system-ui", size=18)
        ),
        zerolinecolor="black",
        showspikes=True,
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_yaxes(
        title=dict(
            text=df_aux["y"].name,
            font=dict(family="system-ui", size=18)
        ),
        zerolinecolor="black",
        showspikes=True,
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_layout(
        width=1500,
        height=700,
        hovermode="closest",
        plot_bgcolor="#c8d4e3",
        title=dict(
            text="Relação entre as variáveis 'y' e '{c}'".format(c=coluna),
            font=dict(family="system-ui", size=24),
        ),
    )

    if salvar:
        # Flow and rainfall plots both go under pasta_resultados, like every
        # other saved figure (the flow loop used to hard-code its own path).
        fig.write_image(
            pasta_resultados+"aed/relacao_y_{c}.png".format(c=coluna)
        )
    else:
        fig.show()

## Análise de delay

In [37]:
# import matplotlib.pyplot as plt
# from dtaidistance import dtw
# from dtaidistance import dtw_visualisation as dtw_vis

# dtw_dist = dtw.distance_matrix_fast(df_aux.drop(columns=["ds", "unique_id"]).T.values)

# df_dtw_dist = pd.DataFrame(
#     data=dtw_dist,
#     index=df_aux.drop(columns=["ds", "unique_id"]).columns.to_list(),
#     columns=df_aux.drop(columns=["ds", "unique_id"]).columns.to_list(),
# )

# fig, axs = plt.subplots(
#     nrows=2,
#     ncols=1,
#     figsize=(2560 / 96, 1440 / 96),
# )

# path = dtw.warping_path(
#     from_s=df_aux["t_vz_56338500"].tail(60).T.values,
#     to_s=df_aux["y"].tail(60).T.values,
# )

# dtw_vis.plot_warping(
#     s1=df_aux["t_vz_56338500"].tail(60).T.values,
#     s2=df_aux["y"].tail(60).T.values,
#     path=path,
#     fig=fig,
#     axs=axs,
#     series_line_options={
#         "linewidth": 3.0,
#         "color": "blue",
#         "alpha": 0.5
#     },
#     warping_line_options={
#         "linewidth": 1.0,
#         "color": "red",
#         "alpha": 1.0
#     },
# )

# axs[1].set_xlabel("Lags")
# axs[0].set_ylabel("Vazão ($m^3$/s) - t_vz_56338500")
# axs[1].set_ylabel("Vazão ($m^3$/s) - y")
# fig.show()

## Granger-causality

In [38]:
# List the columns available in the working copy.
df_aux.columns

Index(['ds', 'unique_id', 'y', 't_cv_56425000', 't_cv_56338500',
       't_cv_56110005', 't_cv_56337200', 't_cv_56337500', 't_vz_56338500',
       't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
      dtype='object')

In [39]:
# from statsmodels.tsa.stattools import grangercausalitytests

# # vazões ['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500']
# # chuvas ['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500']

# df_granger = pd.DataFrame()
# df_granger = df_aux.drop(columns=["ds", "unique_id"]).diff(1)  # aplica essa diferenciação pra remover qq efeito de tendência
# df_granger = df_granger.dropna()

# grangercausalitytests(
#     x=df_granger[["y", "t_cv_56337500"]].tail(30),
#     maxlag=7,
#     verbose=True
# )

# Variáveis globais

In [40]:
look_back = 7 # Number of lags to use: one past week.

n_folds = 5

fh_v = [1, 3, 7, 15]  # Forecast horizons (the data is daily, so each value is a number of days)

fh_artigo = [1, 3, 7]  # Forecast horizons inspired by the paper from Germany

intervalos_previsao = [95]

# Only the levels listed here are drawn in the plots.
# They must be given in reverse order, i.e. from the largest to the smallest level.
# intervalos_previsao_plotar = ["95", "80"]
intervalos_previsao_plotar = ["95"]

colunas_chuva = ['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500']
colunas_vazao = ['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500']

atributos_data = ["dayofyear", "month", "quarter"]

# Experiment scenarios
cenario1 = "sem_chuva_sem_vazao"
cenario2 = "com_chuva_sem_vazao"
cenario3 = "com_chuva_com_vazao"

# Separação dos dados

In [41]:
# The rows from the year 2023 are set apart in their own DataFrame.

_eh_2023 = pd.to_datetime(df_aux["ds"]).dt.year.eq(2023)
df_2023 = df_aux.loc[_eh_2023].copy()
df_2023

Unnamed: 0,ds,unique_id,y,t_cv_56425000,t_cv_56338500,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
3652,2023-01-01,1,244.788000,7.8,0.0,17.2,0.2,0.8,184.923333,163.248542,131.050000,39.470000
3653,2023-01-02,1,244.788000,1.8,14.6,3.0,14.4,3.8,208.862955,191.474130,140.426250,62.823750
3654,2023-01-03,1,382.555000,33.8,11.8,8.8,0.2,15.0,354.716458,293.298000,229.080526,79.808750
3655,2023-01-04,1,494.398000,25.6,2.8,0.6,101.2,11.4,389.951875,286.602604,222.368889,76.670625
3656,2023-01-05,1,388.502000,2.4,4.2,2.2,101.8,3.4,295.012292,237.293750,193.085000,66.404167
...,...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,1,88.370937,8.2,0.2,0.0,2.2,8.6,99.451667,70.568804,0.610000,28.053750
4013,2023-12-28,1,87.519375,2.0,1.2,0.0,0.0,0.8,93.551667,60.014457,33.753333,21.705417
4014,2023-12-29,1,76.317813,0.0,0.0,0.0,0.0,0.0,77.745652,49.910909,61.942917,21.022917
4015,2023-12-30,1,67.118542,14.8,3.4,3.0,4.8,19.8,70.258261,47.746848,58.080000,22.252500


In [42]:
# Everything that is not in the 2023 hold-out becomes the modelling data.
df_dados = df_aux.loc[~df_aux.index.isin(df_2023.index)].copy()
df_dados

Unnamed: 0,ds,unique_id,y,t_cv_56425000,t_cv_56338500,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
0,2013-01-01,1,82.7871,0.288835,0.429300,0.0,1.103340,0.529025,80.179006,60.056100,52.139321,22.604185
1,2013-01-02,1,80.4893,0.313813,0.389303,0.0,0.206381,0.254328,78.548218,46.950000,38.726858,34.659737
2,2013-01-03,1,78.2142,0.387696,0.565639,0.0,0.210559,0.378253,78.240361,46.703125,38.830093,42.538983
3,2013-01-04,1,79.3489,0.289002,0.324881,0.0,0.071081,0.220642,77.485572,49.239583,47.964186,23.318587
4,2013-01-05,1,129.1610,4.115313,0.696857,0.0,1.161379,2.309821,104.090940,49.780208,64.301148,30.217296
...,...,...,...,...,...,...,...,...,...,...,...,...
3647,2022-12-27,1,319.3000,2.600000,5.400000,5.8,7.200000,1.200000,239.780417,152.659896,128.750000,88.303958
3648,2022-12-28,1,263.6290,0.000000,0.000000,0.0,0.000000,0.000000,197.480833,156.578043,124.325417,61.413125
3649,2022-12-29,1,290.1000,0.000000,0.600000,10.6,43.200000,6.000000,216.428958,186.059462,146.491429,50.330417
3650,2022-12-30,1,262.7620,0.000000,1.000000,3.4,0.000000,0.600000,211.274792,166.879263,139.068235,46.562917


In [43]:
# df_treino, df_teste = temporal_train_test_split(
#     y=df_dados,
#     test_size=0.15,
#     anchor="start"
# )

In [44]:
# plot_divisao_treino_teste(
#     df_treino=df_treino,
#     df_teste=df_teste,
#     col_data="ds",
#     col_plot="y",
#     salvar=SALVAR_PLOTS
# )

# Séries sem Chuva, sem Vazão

O DataFrame tem apenas as colunas "ds", "unique_id", "y". Foram removidas as colunas de vazão à montante e as colunas de chuva.

## StatsForecast

### Baseline - SeasonalNaive

In [45]:
# SeasonalNaive baseline: for each horizon, hold out the last `f` rows as the
# test set, forecast them from the remaining rows and plot the comparison.
for f in fh_v:
    teste = df_dados.tail(f).copy()
    treino = df_dados.drop(index=teste.index).copy()

    modelo = SeasonalNaive(season_length=365)

    stfc = StatsForecast(
        df=treino[["ds", "unique_id", "y"]],
        models=[modelo],
        freq="D",
        n_jobs=8,
    )

    previsoes = stfc.forecast(h=f, level=intervalos_previsao)

    # Left merge keeps every forecast row and attaches the observed "y".
    df_resultado = previsoes.merge(
        teste[["ds", "unique_id", "y"]],
        how="left",
        on=["ds", "unique_id"],
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo=modelo.alias,
        nome_curto="SN",
        fh=f,
        titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario1),
        pasta_dstn=pasta_resultados+cenario1,
        niveis=intervalos_previsao_plotar,
        cores=["green", "blue"],
        salvar=False, #SALVAR_PLOTS
    )

In [46]:
from tsfresh.feature_extraction.feature_calculators import number_peaks

# Peak count (support n=3) of the SeasonalNaive column currently held in
# df_resultado; the bare expression is the cell's displayed value.
number_peaks(x=df_resultado["SeasonalNaive"], n=3)

2

## MLForecast

### Baseline - Decision Tree

O emprego de Árvore de Decisão deve-se ao fato de o modelo ser agnóstico à escala dos dados.

In [47]:
# Scenario 1 baseline: DecisionTreeRegressor through MLForecast, using only
# lags of the target "y" plus date features, run once per horizon in fh_v.
for f in fh_v:
    # The last f rows are held out for testing; the rest is training data.
    teste = df_dados.tail(f).copy()
    treino = df_dados.drop(index=teste.index).copy()

    dt = DecisionTreeRegressor(random_state=SEED)

    fcst = mlf.MLForecast(
        models=[dt],
        freq="D",
        lags=list(range(1, look_back + 1)),  # lags 1..look_back of the target "y" only
        num_threads=8,
        date_features=atributos_data,
    )

    fcst.fit(
        df=treino[["ds", "unique_id", "y"]],
        id_col="unique_id",
        time_col="ds",
        target_col="y",
        static_features=[],
        prediction_intervals=PredictionIntervals(h=f),
    )

    previsoes = fcst.predict(h=f, level=intervalos_previsao)

    # Join the observed target onto the forecasts, then shorten the model
    # prefix in every column name to "DecisionTree" (the name the plot call uses).
    df_resultado = previsoes.merge(
        teste[["ds", "unique_id", "y"]],
        on=["ds", "unique_id"],
        how="left",
    )
    df_resultado = df_resultado.rename(
        columns=lambda nome: re.sub('DecisionTreeRegressor', 'DecisionTree', nome),
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo="DecisionTree",
        nome_curto="DT",
        fh=f,
        titulo="DecisionTree (fh={fh}) ({c})".format(fh=f, c=cenario1),
        niveis=intervalos_previsao_plotar,
        cores=["green", "blue"],
        pasta_dstn=pasta_resultados + cenario1,
        salvar=SALVAR_PLOTS,
    )

In [48]:
from tsfresh.feature_extraction.feature_calculators import number_peaks

# Print the peak counts (support n=1) of the observed series and of the
# DecisionTree forecast currently held in df_resultado.
print(
    "n. picos de y_true", number_peaks(x=df_resultado["y"], n=1),
    "\n",
    "n. picos de y_pred", number_peaks(x=df_resultado["DecisionTree"], n=1),
)

n. picos de y_true 3 
 n. picos de y_pred 1


## NeuralForecast

### Main model - LSTM

Este é o modelo que se pretende aplicar no trabalho

In [49]:
# Scenario 1 main model: LSTM trained on the target alone (upstream-flow and
# rainfall columns are dropped), run once per forecast horizon in fh_v.
for f in fh_v:
    # The last f rows are held out for testing; the rest is training data.
    teste = df_dados.tail(f).copy()
    treino = df_dados.drop(index=teste.index).copy()

    # Training frame without exogenous columns, passed through the
    # date-attribute helper.
    treino_dl = gerar_atributos_data(
        df=treino.drop(columns=colunas_vazao + colunas_chuva),
        atributos=atributos_data,
        col_data="ds",
    )

    # No exogenous lists are configured, so the network is fed the target only.
    prmtrs_rede = {
        "h": f,
        "random_seed": SEED,
        "context_size": look_back,
        "loss": HuberMQLoss(level=intervalos_previsao),
        "scaler_type": None,
        "logger": False,
        "alias": "LSTM",
        "max_steps": 1000,
        "early_stop_patience_steps": 5,
        "val_check_steps": 25,
        "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
        "enable_progress_bar": False,
    }
    modelo = LSTM(**prmtrs_rede)

    nf = NeuralForecast(models=[modelo], freq="D", local_scaler_type="minmax")
    nf.fit(df=treino_dl, val_size=2 * f)

    # Join the observed target onto the forecasts and expose the median
    # forecast under the plain model name.
    df_resultado = (
        nf.predict()
        .merge(teste[["ds", "unique_id", "y"]], on=["ds", "unique_id"], how="left")
        .rename(columns={modelo.alias + "-median": "LSTM"})
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo=modelo.alias,
        nome_curto="LSTM",
        fh=f,
        titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario1),
        niveis=intervalos_previsao_plotar,
        cores=["green", "blue"],
        pasta_dstn=pasta_resultados + cenario1,
        salvar=SALVAR_PLOTS,
    )

Seed set to 1984
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type          | Params
--------------------------------------------------
0 | loss            | HuberMQLoss   | 3     
1 | padder          | ConstantPad1d | 0     
2 | scaler          | TemporalNorm  | 0     
3 | hist_encoder    | LSTM          | 484 K 
4 | context_adapter | Linear        | 1.4 K 
5 | mlp_decoder     | MLP           | 2.2 K 
--------------------------------------------------
487 K  

In [50]:
# from tsfresh.feature_extraction.feature_calculators import number_peaks

# number_peaks(
#     x=df_resultado.LSTM,
#     n=1
# )

### Main model padrão com CV

In [51]:
# Scenario 1, LSTM with fold-wise fitting: for each horizon, keep the last f
# rows as a final hold-out, fit on every TimeSeriesSplit training window, fit
# once more on all remaining rows, then forecast the hold-out and plot.
for f in fh_v:
    tscv = TimeSeriesSplit(n_splits=n_folds, test_size=f)

    # Final hold-out: the last f rows, reduced to the three core columns.
    dados_validacao = gerar_atributos_data(
        df=df_dados.tail(f).copy(),
        atributos=atributos_data,
        col_data="ds",
    )[["ds", "unique_id", "y"]]

    # Everything else feeds the splitter.
    # NOTE(review): dropping by dados_validacao.index assumes
    # gerar_atributos_data preserves the input index - confirm.
    dados_cv = gerar_atributos_data(
        df=df_dados.drop(index=dados_validacao.index).copy(),
        atributos=atributos_data,
        col_data="ds",
    )[["ds", "unique_id", "y"]]

    prmtrs_rede = {
        "h": f,
        "random_seed": SEED,
        "context_size": look_back,
        "loss": HuberMQLoss(level=intervalos_previsao),
        "scaler_type": None,
        "logger": False,
        "alias": "LSTM",
        "max_steps": 1000,
        "early_stop_patience_steps": 5,
        "val_check_steps": 25,
        "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
        "enable_progress_bar": False,
    }
    modelo = LSTM(**prmtrs_rede)

    nf = NeuralForecast(models=[modelo], freq="D", local_scaler_type="minmax")

    # Fit on each expanding training window.
    # NOTE(review): teste_cv is built but never scored in this cell, and the
    # same nf object is refitted every fold - presumably each fit continues
    # from the previous weights unless use_init_models=True; confirm against
    # the NeuralForecast.fit documentation.
    for fold, (treino_ind, teste_ind) in enumerate(tscv.split(dados_cv)):
        treino_cv = dados_cv.iloc[treino_ind]
        teste_cv = dados_cv.iloc[teste_ind]
        nf.fit(df=treino_cv, val_size=2 * f)

    # Final fit on all rows that precede the hold-out.
    nf.fit(df=dados_cv, val_size=2 * f)

    # Join the held-out target onto the forecasts and expose the median
    # forecast under the plain model name.
    df_resultado = (
        nf.predict()
        .merge(
            dados_validacao[["ds", "unique_id", "y"]],
            on=["ds", "unique_id"],
            how="left",
        )
        .rename(columns={modelo.alias + "-median": "LSTM"})
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo=modelo.alias,
        nome_curto="LSTM",
        fh=f,
        titulo="CV: {md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario1),
        niveis=intervalos_previsao_plotar,
        cores=["green", "blue"],
        pasta_dstn=pasta_resultados + cenario1,
        salvar=SALVAR_PLOTS,
    )

Seed set to 1984
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type          | Params
--------------------------------------------------
0 | loss            | HuberMQLoss   | 3     
1 | padder          | ConstantPad1d | 0     
2 | scaler          | TemporalNorm  | 0     
3 | hist_encoder    | LSTM          | 484 K 
4 | context_adapter | Linear        | 1.4 K 
5 | mlp_decoder     | MLP           | 2.2 K 
--------------------------------------------------
487 K     Trainable params
3         Non-trainable params
487 K     Total params
1.950     Total estimated model params size (MB)
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, usi

In [52]:
# Fazendo plot de cada fold dentro da validação cruzada.

# f = 1
# data_inicio = '2022-10-01' # Isso é apenas pra geração dos gráficos, não interfere no treinamento e teste
# tscv = TimeSeriesSplit(
#     n_splits=n_folds,
#     test_size=f
# )

# dados_cv = gerar_atributos_data(
#     df=df_dados.copy(),
#     atributos=atributos_data,
#     col_data="ds"
# )
# dados_cv = dados_cv[["ds", "unique_id", "y"]]

# prmtrs_rede = {
#     "h": f,
#     "random_seed": SEED,
#     "context_size": look_back,
#     "loss": HuberMQLoss(level=intervalos_previsao),
#     "scaler_type": "minmax",
#     "logger": False,
#     "alias": "LSTM",
#     "max_steps": 1000,
#     "early_stop_patience_steps": 5,
#     "val_check_steps": 25,
#     "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#     "enable_progress_bar": False,
# }

# modelo = LSTM(**prmtrs_rede)

# nf = NeuralForecast(
#     models=[modelo],
#     freq="D"
# )

# # # #
# df_final = pd.DataFrame()
# for fold, (treino_ind, teste_ind) in enumerate(tscv.split(dados_cv)):
#     treino_cv, teste_cv = dados_cv.iloc[treino_ind], dados_cv.iloc[teste_ind]

#     # ------ #
#     nf.fit(df=treino_cv, val_size=2*f)
#     preds = nf.predict()

#     df_resultado = pd.merge(
#         left=preds,
#         right=teste_cv[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     )
#     df_final = pd.concat([df_final, df_resultado])
#     # ------ #

#     # Agregando os dados num único DataFrame
#     df_trn = pd.DataFrame(treino_cv[['ds', 'unique_id', 'y']])
#     df_trn.reset_index(drop=True, inplace=True)

#     df_tst = pd.DataFrame(teste_cv[['ds', 'unique_id', 'y']])
#     df_tst.reset_index(drop=True, inplace=True)
#     df_tst[modelo.alias+"-median"] = preds[[modelo.alias+"-median"]].values
#     df_merged = pd.concat([df_trn, df_tst], axis=0)

#     plot_cv(
#         fh=f,
#         df_merged=df_merged,
#         df_resultado=df_resultado,
#         regressor=modelo.alias+"-median",
#         data_inicio=data_inicio,
#         titulo_plot="LSTM e CV (fold={}) (fh={}) ({})".format(fold+1, f, cenario1),
#         n_decimais=5,
#         pasta_dstn=pasta_resultados+cenario1,
#         salvar=SALVAR_PLOTS,        
#     )
# # # #
# df_final = df_final.reset_index(drop=True)

# tbl = go.Figure()
# tbl.add_trace(
#     go.Table(
#         header=dict(
#             values=[
#                 "SMAPE",
#                 "RMSE",
#                 "PBIAS (%)",
#                 "DRV",
#             ],
#             font=dict(size=18),
#             align="center"
#         ),
#         cells=dict(
#             values=[
#                 round(smape(df_final.y, df_final[modelo.alias+"-median"]), 5),
#                 round(rmse(df_final.y, df_final[modelo.alias+"-median"]), 5),
#                 round(percentage_bias(df_final.y, df_final[modelo.alias+"-median"]), 5),
#                 round(deviation_runoff_volume(df_final.y, df_final[modelo.alias+"-median"]), 5),
#             ],
#             font=dict(size=18),
#             height=24,
#             align="left",
#         ),
#     ),
# )

# tbl.update_layout(
#         width=1500,
#         height=500,
#         plot_bgcolor="#c8d4e3",
#         title=dict(
#             text="Tabela de métricas resumida",
#             font=dict(
#                 family="system-ui",
#                 size=30
#             )
#         ),
#     )
# tbl.show()

# Séries com Chuva, sem Vazão

Este cenário usa apenas as features exógenas de chuva, sem dados de estações de vazão a montante.

## StatsForecast

### Baseline - SeasonalNaive

Isso não é um preditor de fato. Ao menos, não se considera assim. Serve como uma baseline a superar.

In [53]:
# Scenario 2 baseline: SeasonalNaive with the rainfall columns left in the
# frames (upstream-flow columns dropped), run once per horizon in fh_v.
for f in fh_v:
    # The last f rows are held out for testing; the rest is training data.
    teste = df_dados.tail(f).copy()
    treino = df_dados.drop(index=teste.index).copy()

    # Future exogenous frame: the test rows without the target and flow columns.
    df_futr = teste.drop(columns=['y'] + colunas_vazao).copy()

    modelo = SeasonalNaive(season_length=365)
    stfc = StatsForecast(
        df=treino.drop(columns=colunas_vazao),
        models=[modelo],
        freq="D",
        n_jobs=8,
    )
    previsoes = stfc.forecast(h=f, X_df=df_futr, level=intervalos_previsao)

    # Left-join the observed target onto the forecasts by date and series id.
    df_resultado = previsoes.merge(
        teste[["ds", "unique_id", "y"]],
        how="left",
        on=["ds", "unique_id"],
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo=modelo.alias,
        nome_curto="SN",
        fh=f,
        titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario2),
        niveis=intervalos_previsao_plotar,
        cores=["green", "blue"],
        pasta_dstn=pasta_resultados + cenario2,
        salvar=False,  # hard-coded here instead of SALVAR_PLOTS
    )

## MLForecast

### Baseline - Decision Tree

O emprego de Árvore de Decisão deve-se ao fato de o modelo ser agnóstico à escala dos dados.

In [54]:
# Scenario 2 baseline: DecisionTreeRegressor through MLForecast with the
# rainfall columns kept as extra features (upstream-flow columns dropped),
# run once per horizon in fh_v.
for f in fh_v:
    # The last f rows are held out for testing; the rest is training data.
    teste = df_dados.tail(f).copy()
    treino = df_dados.drop(index=teste.index).copy()

    dt = DecisionTreeRegressor(random_state=SEED)

    fcst = mlf.MLForecast(
        models=[dt],
        freq="D",
        lags=list(range(1, look_back + 1)),  # lags 1..look_back of the target "y" only
        num_threads=8,
        date_features=atributos_data,
    )

    fcst.fit(
        df=treino.drop(columns=colunas_vazao),
        id_col="unique_id",
        time_col="ds",
        target_col="y",
        static_features=[],
        prediction_intervals=PredictionIntervals(h=f),
    )

    # Future exogenous frame: the test rows without the target and flow columns.
    df_futr = teste.drop(columns=['y'] + colunas_vazao).copy()

    previsoes = fcst.predict(h=f, X_df=df_futr, level=intervalos_previsao)

    # Join the observed target onto the forecasts, then shorten the model
    # prefix in every column name to "DecisionTree" (the name the plot call uses).
    df_resultado = previsoes.merge(
        teste[["ds", "unique_id", "y"]],
        on=["ds", "unique_id"],
        how="left",
    )
    df_resultado = df_resultado.rename(
        columns=lambda nome: re.sub('DecisionTreeRegressor', 'DecisionTree', nome),
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo="DecisionTree",
        nome_curto="DT",
        fh=f,
        titulo="DecisionTree (fh={fh}) ({c})".format(fh=f, c=cenario2),
        niveis=intervalos_previsao_plotar,
        cores=["green", "blue"],
        pasta_dstn=pasta_resultados + cenario2,
        salvar=SALVAR_PLOTS,
    )

## NeuralForecast

### Main model - LSTM

Este é o modelo que se pretende aplicar no trabalho

In [55]:
# Scenario 2 main model: LSTM with rainfall columns and date attributes as
# future exogenous inputs (upstream-flow columns dropped), run once per
# forecast horizon in fh_v.
for f in fh_v:
    # The last f rows are held out for testing; the rest is training data.
    teste = df_dados.tail(f).copy()
    treino = df_dados.drop(index=teste.index).copy()

    # Both splits lose the flow columns and go through the date-attribute helper.
    treino_dl = gerar_atributos_data(
        df=treino.drop(columns=colunas_vazao),
        atributos=atributos_data,
        col_data="ds",
    )
    teste_dl = gerar_atributos_data(
        df=teste.drop(columns=colunas_vazao),
        atributos=atributos_data,
        col_data="ds",
    )

    prmtrs_rede = {
        "h": f,
        "random_seed": SEED,
        "context_size": look_back,
        "loss": HuberMQLoss(level=intervalos_previsao),
        "futr_exog_list": colunas_chuva + atributos_data,
        "scaler_type": None,
        "logger": False,
        "alias": "LSTM",
        "max_steps": 1000,
        "early_stop_patience_steps": 5,
        "val_check_steps": 25,
        "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
        "enable_progress_bar": False,
    }
    modelo = LSTM(**prmtrs_rede)

    nf = NeuralForecast(models=[modelo], freq="D", local_scaler_type="minmax")
    nf.fit(df=treino_dl, val_size=2 * f)

    # The test frame minus the target supplies the future exogenous values.
    previsoes_dl = nf.predict(futr_df=teste_dl.drop(columns=['y']))

    # Join the observed target onto the forecasts and expose the median
    # forecast under the plain model name.
    df_resultado = previsoes_dl.merge(
        teste_dl[["ds", "unique_id", "y"]],
        on=["ds", "unique_id"],
        how="left",
    ).rename(columns={modelo.alias + "-median": "LSTM"})

    plot_resultados(
        df_merged=df_resultado,
        modelo=modelo.alias,
        nome_curto="LSTM",
        fh=f,
        titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario2),
        niveis=intervalos_previsao_plotar,
        cores=["green", "blue"],
        pasta_dstn=pasta_resultados + cenario2,
        salvar=SALVAR_PLOTS,
    )

Seed set to 1984
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type          | Params
--------------------------------------------------
0 | loss            | HuberMQLoss   | 3     
1 | padder          | ConstantPad1d | 0     
2 | scaler          | TemporalNorm  | 0     
3 | hist_encoder    | LSTM          | 484 K 
4 | context_adapter | Linear        | 1.5 K 
5 | mlp_decoder     | MLP           | 3.8 K 
--------------------------------------------------
489 K     Trainable params
3         Non-trainable params
489 K     Total params
1.957     Total estimated model params size (MB)
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, usi

### Main model padrão com CV

In [56]:
# Scenario 2, LSTM with fold-wise fitting: for each horizon, keep the last f
# rows as a final hold-out, fit on every TimeSeriesSplit training window, fit
# once more on all remaining rows, then forecast the hold-out and plot.
for f in fh_v:
    tscv = TimeSeriesSplit(n_splits=n_folds, test_size=f)

    # This hold-out frame is only used to check whether fitting with CV
    # adjusts the weights properly and how the model behaves on data it has
    # never seen before.
    dados_validacao = gerar_atributos_data(
        df=df_dados.tail(f).copy(),
        atributos=atributos_data,
        col_data="ds",
    ).drop(columns=colunas_vazao)

    # The remaining rows are the ones used for cross-validation.
    # NOTE(review): dropping by dados_validacao.index assumes
    # gerar_atributos_data preserves the input index - confirm.
    dados_cv = gerar_atributos_data(
        df=df_dados.drop(index=dados_validacao.index).copy(),
        atributos=atributos_data,
        col_data="ds",
    ).drop(columns=colunas_vazao)

    prmtrs_rede = {
        "h": f,
        "random_seed": SEED,
        "context_size": look_back,
        "loss": HuberMQLoss(level=intervalos_previsao),
        "futr_exog_list": colunas_chuva + atributos_data,
        "scaler_type": None,
        "logger": False,
        "alias": "LSTM",
        "max_steps": 1000,
        "early_stop_patience_steps": 5,
        "val_check_steps": 25,
        "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
        "enable_progress_bar": False,
    }
    modelo = LSTM(**prmtrs_rede)

    nf = NeuralForecast(models=[modelo], freq="D", local_scaler_type="minmax")

    # Fit on each expanding training window.
    # NOTE(review): teste_cv is built but never scored in this cell, and the
    # same nf object is refitted every fold - presumably each fit continues
    # from the previous weights unless use_init_models=True; confirm against
    # the NeuralForecast.fit documentation.
    for fold, (treino_ind, teste_ind) in enumerate(tscv.split(dados_cv)):
        treino_cv = dados_cv.iloc[treino_ind]
        teste_cv = dados_cv.iloc[teste_ind]
        nf.fit(df=treino_cv, val_size=2 * f)

    # Final fit on all rows that precede the hold-out.
    nf.fit(df=dados_cv, val_size=2 * f)

    # The hold-out frame minus the target supplies the future exogenous
    # values; the held-out target is then joined back and the median forecast
    # is exposed under the plain model name.
    df_resultado = (
        nf.predict(futr_df=dados_validacao.drop(columns=['y']))
        .merge(
            dados_validacao[["ds", "unique_id", "y"]],
            on=["ds", "unique_id"],
            how="left",
        )
        .rename(columns={modelo.alias + "-median": "LSTM"})
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo=modelo.alias,
        nome_curto="LSTM",
        fh=f,
        titulo="CV: {md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario2),
        niveis=intervalos_previsao_plotar,
        cores=["green", "blue"],
        pasta_dstn=pasta_resultados + cenario2,
        salvar=SALVAR_PLOTS,
    )

Seed set to 1984
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type          | Params
--------------------------------------------------
0 | loss            | HuberMQLoss   | 3     
1 | padder          | ConstantPad1d | 0     
2 | scaler          | TemporalNorm  | 0     
3 | hist_encoder    | LSTM          | 484 K 
4 | context_adapter | Linear        | 1.5 K 
5 | mlp_decoder     | MLP           | 3.8 K 
--------------------------------------------------
489 K     Trainable params
3         Non-trainable params
489 K     Total params
1.957     Total estimated model params size (MB)
Trainer already configured with model summary callbacks: [<class 'pytorch_lightning.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, usi

# Séries com Chuva, com Vazão

## StatsForecast

### Baseline - SeasonalNaive

Isso não é um preditor de fato. Ao menos, não se considera assim. Serve como uma baseline a superar.

In [57]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()

#     modelo = SeasonalNaive(season_length=365)

#     stfc = StatsForecast(
#         df=treino,
#         models=[modelo],
#         freq="D",
#         n_jobs=8
#     )

#     df_futr = cria_dataframe_futuro(
#         df_futr=fcst.make_future_dataframe(h=f),
#         df_train=treino,
#         df_test=teste,
#         tp_valor='ml',
#         n_lags=look_back,
#         date_features=atributos_data,
#         cols=colunas_vazao
#     )
    
#     previsoes = stfc.forecast(
#         h=f,
#         X_df=df_futr,
#         level=intervalos_previsao
#     )

#     df_resultado = pd.merge(
#         left=previsoes,
#         right=teste[['ds', 'unique_id', 'y']],
#         how="left",
#         on=['ds', 'unique_id']
#     )

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="SN",
#         fh=f,
#         titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario3),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario3,
#         salvar=False, #SALVAR_PLOTS
#     )

## MLForecast

### Baseline - Decision Tree

O emprego de Árvore de Decisão deve-se ao fato de o modelo ser agnóstico à escala dos dados.

In [58]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()

#     dt = DecisionTreeRegressor(random_state=SEED)

#     fcst = mlf.MLForecast(
#         models=[dt],
#         freq="D",
#         lags=[i + 1 for i in range(look_back)], # lags apenas na coluna target "y"
#         num_threads=8,
#         date_features=atributos_data,
#     )

#     fcst.fit(
#         df=treino,
#         id_col="unique_id",
#         time_col="ds",
#         target_col="y",
#         static_features=[],
#         prediction_intervals=PredictionIntervals(h=f),
#     )

#     df_futr = cria_dataframe_futuro(
#         df_futr=fcst.make_future_dataframe(h=f),
#         df_train=treino,
#         df_test=teste,
#         tp_valor='ml',
#         n_lags=look_back,
#         date_features=atributos_data,
#         cols=colunas_vazao
#     )

#     previsoes = fcst.predict(
#         h=f,
#         X_df=df_futr,
#         level=intervalos_previsao,
#     )

#     df_resultado = pd.merge(
#         left=previsoes,
#         right=teste[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left"
#     )

#     df_resultado.rename(
#         columns=lambda x: re.sub('DecisionTreeRegressor', 'DecisionTree', x),
#         inplace=True,
#     )

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo="DecisionTree",
#         nome_curto="DT",
#         fh=f,
#         titulo="DecisionTree (fh={fh}) ({c})".format(fh=f, c=cenario3),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario3,
#         salvar=SALVAR_PLOTS
#     )

## NeuralForecast

### Main model - LSTM

Este é o modelo que se pretende aplicar no trabalho

In [59]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()
    
#     treino_dl = gerar_atributos_data(
#         df=treino,
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     teste_dl = gerar_atributos_data(
#         df=teste,
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "hist_exog_list": colunas_vazao,
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "scaler_type": "minmax",
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D"
#     )

#     nf.fit(
#         df=treino_dl,
#         val_size=2*f,
#     )

#     # df_futr_dl = cria_dataframe_futuro(
#     #     df_futr=nf.make_future_dataframe(),
#     #     df_train=treino_dl,
#     #     df_test=teste_dl,
#     #     tp_valor='ml',
#     #     n_lags=look_back,
#     #     date_features=atributos_data,
#     #     cols=colunas_vazao
#     # )

#     df_resultado = pd.merge(
#         left=nf.predict(futr_df=teste_dl.drop(columns=['y']+colunas_vazao)),
#         # left=nf.predict(futr_df=df_futr_dl),
#         right=teste_dl[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     ).rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario3),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario3,
#         salvar=SALVAR_PLOTS
#     )

### Main model padrão com CV

In [60]:
# for f in fh_v:
#     tscv = TimeSeriesSplit(
#         n_splits=n_folds,
#         test_size=f
#     )

#     #   Este DataFrame servirá apenas para verificar se usando CV o modelo ajustará os pesos na moral
#     # e como se comportará com dados novos nunca vistos antes.
#     dados_validacao = gerar_atributos_data(
#         df=df_dados.tail(f).copy(),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     # Com estes dados restantes eu realizo a validação cruzada.
#     dados_cv = gerar_atributos_data(
#         df=df_dados.drop(index=dados_validacao.index).copy(),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "hist_exog_list": colunas_vazao,
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "scaler_type": "minmax",
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D"
#     )

#     # # #
#     for fold, (treino_ind, teste_ind) in enumerate(tscv.split(dados_cv)):
#         treino_cv, teste_cv = dados_cv.iloc[treino_ind], dados_cv.iloc[teste_ind]
#         nf.fit(df=treino_cv, val_size=2*f)
#     # # #
#     # Com o modelo "fittado", faço a previsão em cima do conjunto de validação

#     nf.fit(
#         df=dados_cv,
#         val_size=2*f,
#     )

#     # futr_dl = cria_dataframe_futuro(
#     #     df_futr=nf.make_future_dataframe(),
#     #     df_train=dados_cv,
#     #     df_test=dados_validacao,
#     #     tp_valor='ml',
#     #     n_lags=look_back,
#     #     date_features=atributos_data,
#     #     cols=colunas_vazao
#     # )

#     df_resultado = pd.merge(
#         # left=nf.predict(futr_df=futr_dl),
#         left=nf.predict(futr_df=dados_validacao.drop(columns=['y']+colunas_vazao)),
#         right=dados_validacao[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     )

#     df_resultado = df_resultado.rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="CV: {md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario3),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario3,
#         salvar=SALVAR_PLOTS
#     )

# Optuna e LSTM

In [61]:
# # Função-objetivo para o Optuna
# def opt_lstm(trial, fh, treino, teste, dir):
#     fixo_prmtrs = {
#         "h": fh,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "random_seed": SEED,
#         "hist_exog_list": colunas_vazao,
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 250,
#         "context_size": look_back,
#         "early_stop_patience_steps": 3,
#         "val_check_steps": 7,
#         "scaler_type": None,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     opt_prmtrs = {
#         "encoder_n_layers": trial.suggest_int("encoder_n_layers", 1, 5),
#         "encoder_hidden_size": trial.suggest_int("encoder_hidden_size", 8, 256, step=2),
#         "encoder_dropout" : trial.suggest_float("encoder_dropout", 0.05, 0.25),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-1),
#     }

#     modelo = LSTM(
#         **fixo_prmtrs,
#         **opt_prmtrs
#     )

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D",
#         local_scaler_type="minmax",
#     )

#     nf.fit(
#         df=treino,
#         val_size=2*fh,
#     )

#     df_futr_dl = cria_dataframe_futuro(
#         df_futr=nf.make_future_dataframe(),
#         df_train=treino,
#         df_test=teste,
#         tp_valor='ml',
#         n_lags=look_back,
#         date_features=atributos_data,
#         cols=colunas_vazao
#     )

#     df_resultado = pd.merge(
#         left=nf.predict(futr_df=df_futr_dl),
#         right=teste[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     )
   
#     try:
#         loss = smape(df_resultado["y"], df_resultado[modelo.alias+'-median'])
#     except ValueError:
#         loss = 1e+3

#     return loss
# #################################################
# #################################################
# dir_base = pasta_resultados+cenario3+"/lstm_opt/"
# if not os.path.exists(dir_base):
#     os.makedirs(dir_base)

# # for i in range(0, 1):
#     # dir_final = os.path.join(dir_base, 'exec_%s' % str(i+1))
    
#     # # Cria os diretorios dinamicamente
#     # if not os.path.exists(dir_final):
#     #     os.makedirs(dir_final)

# # Guardar informações apenas das melhores trials
# lstm_best_trial = {}

# # Executando e reproduzindo a otimização para o horizonte de previsão
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()

#     treino_dl = gerar_atributos_data(
#         df=treino,
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     teste_dl = gerar_atributos_data(
#         df=teste,
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     study_lstm = opt.create_study(
#         sampler=opt.samplers.TPESampler(seed=SEED),
#         study_name="opt_lstm",
#         direction="minimize",
#     )

#     opt_lstm = partial(
#         opt_lstm,
#         fh=f,
#         treino=treino_dl,
#         teste=teste_dl,
#         dir=dir_base,
#     )

#     study_lstm.optimize(
#         func=opt_lstm,
#         n_trials=100,
#         catch=(FloatingPointError, ValueError, RuntimeError),
#         show_progress_bar=True,
#         n_jobs=1,
#     )

#     lstm_best_trial[fh_v.index(f)] = {
#         'modelo' : 'LSTM',
#         'fh' : f,
#         'best_trial' : study_lstm.best_trial.number,
#         'best_value' : study_lstm.best_value,
#         'best_params' : study_lstm.best_params.copy()
#     }

# # Salvar o dicionário "lstm_best_trial" para analisar mais tarde
# exportar_dict_json(
#     v_dict=lstm_best_trial,
#     pasta=dir_base,
#     nome_arq="lstm_best_trial.json"
# )

# Optuna e Cross-Validation

In [62]:
# def opt_lstm(trial, fh, n_folds, dataset):
#     tscv = TimeSeriesSplit(
#         n_splits=n_folds,
#         test_size=fh
#     )

#     fixo_prmtrs = {
#         "h": fh,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "random_seed": SEED,
#         "hist_exog_list": colunas_vazao,
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "context_size": look_back,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "scaler_type": None,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     opt_prmtrs = {
#         "encoder_n_layers": trial.suggest_int("encoder_n_layers", 1, 6),
#         "encoder_hidden_size": trial.suggest_int("encoder_hidden_size", 8, 256, step=2),
#         "encoder_dropout" : trial.suggest_float("encoder_dropout", 0.05, 0.25),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-1),
#     }

#     losses = []
#     for _, (treino_ind, teste_ind) in enumerate(tscv.split(dataset)):
#         treino, teste = dataset.iloc[treino_ind], dataset.iloc[teste_ind]

#         modelo = LSTM(
#             **fixo_prmtrs,
#             **opt_prmtrs
#         )

#         nf = NeuralForecast(
#             models=[modelo],
#             freq="D",
#             local_scaler_type="minmax",
#         )

#         nf.fit(
#             df=treino,
#             val_size=2*fh,
#         )

#         df_futr_dl = cria_dataframe_futuro(
#             df_futr=nf.make_future_dataframe(),
#             df_train=treino,
#             df_test=teste,
#             tp_valor='ml',
#             n_lags=look_back,
#             date_features=atributos_data,
#             cols=colunas_vazao
#         )

#         df_resultado = pd.merge(
#             left=nf.predict(futr_df=df_futr_dl),
#             right=teste[["ds", "unique_id", "y"]],
#             on=["ds", "unique_id"],
#             how="left",
#         )
        
#         try:
#             losses.append(smape(df_resultado["y"], df_resultado[modelo.alias+'-median']))
#         except ValueError:
#             losses.append(1e+3)

#     # Salvando cada modelo gerado e depois vou carregar o modelo que melhor se saiu
#     # with open(dir+"/lstm_opt_cv_trial-{tnum}.pickle".format(tnum=trial.number), "wb") as mod_out:
#     #     pickle.dump(modelo, mod_out)

#     return np.mean(losses)
# #################################################
# dados_dl = gerar_atributos_data(
#     df=df_dados,
#     atributos=atributos_data,
#     col_data="ds"
# )

# dir_final = pasta_resultados+cenario3+"/lstm_opt_cv/"
# if not os.path.exists(dir_final):
#     os.makedirs(dir_final)

# # Guardar informações apenas das melhores trials
# lstm_best_trial = {}

# for f in fh_v:
#     # Executando e reproduzindo a otimização para o horizonte de previsão
#     study_lstm = opt.create_study(
#         sampler=opt.samplers.TPESampler(seed=SEED),
#         study_name="lstm_opt_cv",
#         direction="minimize",
#     )

#     opt_lstm = partial(
#         opt_lstm,
#         fh=f,
#         n_folds=n_folds,
#         dataset=dados_dl
#     )

#     study_lstm.optimize(
#         func=opt_lstm,
#         n_trials=100,
#         catch=(FloatingPointError, ValueError, RuntimeError),
#         show_progress_bar=False,
#         n_jobs=1,
#     )

#     lstm_best_trial[fh_v.index(f)] = {
#         'modelo' : 'LSTM-CV',
#         'fh': f,
#         'best_trial' : study_lstm.best_trial.number,
#         'best_value' : study_lstm.best_value,
#         'best_params' : study_lstm.best_params.copy()
#     }

# # Salvar o dicionário "lstm_best_trial" para analisar mais tarde
# exportar_dict_json(
#     v_dict=lstm_best_trial,
#     pasta=dir_final,
#     nome_arq="lstm-cv_best_trials.json"
# )

# Simulando o uso

Neste ponto, o que se busca é reproduzir o uso diário que o trabalho (provavelmente) teria no IGAM.

Ao longo de todo o ano de 2023, o modelo otimizado será executado. Será um "fit/predict" diário, dentro de um loop, para observar o comportamento do modelo ao longo do tempo: se ele se degenerará, em quanto tempo isso ocorrerá, além de outras análises que se fizerem pertinentes.

Pois bem. O procedimento será:

- Otimizar a escolha de hiperparâmetros para a rede.
- Usar validação cruzada combinada com Optuna na massa de dados de 2013 a 2022, perfazendo 10 anos. O horizonte de previsão será sempre de 1 dia e o número de 'folds' será fixado em 5.

## Walk-forward Validation com janela expandida

In [63]:
def opt_lstm_cv(trial, fh, n_folds, dataset):
    """Optuna objective: average sMAPE of an LSTM across time-series folds.

    Args:
        trial: Optuna trial used to sample the LSTM hyperparameters.
        fh: Forecast horizon; used as the model's ``h`` and as the
            ``test_size`` of every fold.
        n_folds: Number of splits handed to ``TimeSeriesSplit``.
        dataset: DataFrame holding ``ds``, ``unique_id`` and ``y``; rows are
            selected positionally with ``iloc``. Presumably it also carries
            the columns named in ``futr_exog_list`` -- confirm with caller.

    Returns:
        ``np.mean`` of the per-fold errors. A fold whose ``smape`` call
        raises ``ValueError`` contributes ``1e+3`` instead.
    """
    # Tunable part of the configuration, sampled by Optuna.
    amostrados = {
        "encoder_n_layers": trial.suggest_int("encoder_n_layers", 1, 6),
        "encoder_hidden_size": trial.suggest_int("encoder_hidden_size", 8, 256, step=4),
        "encoder_dropout": trial.suggest_float("encoder_dropout", 0.05, 0.25),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-1),
    }

    # Fixed part of the configuration. The HuberMQLoss loss and the
    # hist_exog_list option that earlier versions passed are disabled here.
    dispositivo = "gpu" if torch.cuda.is_available() else "cpu"
    config_modelo = dict(
        h=fh,
        random_seed=SEED,
        futr_exog_list=colunas_chuva + atributos_data,
        logger=False,
        alias="LSTM",
        max_steps=200,
        context_size=look_back,
        scaler_type=None,
        accelerator=dispositivo,
        enable_progress_bar=False,
        **amostrados,
    )

    divisor = TimeSeriesSplit(n_splits=n_folds, test_size=fh)

    erros_por_fold = []
    for idx_treino, idx_teste in divisor.split(dataset):
        df_treino = dataset.iloc[idx_treino]
        df_teste = dataset.iloc[idx_teste]

        # A fresh model and forecaster are built for every fold.
        rede = LSTM(**config_modelo)
        previsor = NeuralForecast(
            models=[rede],
            freq="D",
            local_scaler_type="minmax",
        )
        previsor.fit(df=df_treino)

        # The fold's test rows (minus the target) are the future exogenous
        # inputs; the observed target is then joined next to the forecast.
        previsto = previsor.predict(futr_df=df_teste.drop(columns=["y"]))
        comparacao = pd.merge(
            left=previsto,
            right=df_teste[["ds", "unique_id", "y"]],
            on=["ds", "unique_id"],
            how="left",
        )

        try:
            erro = smape(comparacao["y"], comparacao[rede.alias])
        except ValueError:
            # Fixed penalty for a fold whose metric cannot be computed.
            erro = 1e+3
        erros_por_fold.append(erro)

    return np.mean(erros_por_fold)
#################################################
# Dataset used for the optimisation: the columns listed in colunas_vazao are
# dropped and the date attributes are generated from the "ds" column.
dados_dl = gerar_atributos_data(
    df=df_dados.drop(columns=colunas_vazao),
    atributos=atributos_data,
    col_data="ds"
)

# Output folder for the exported results; exist_ok avoids the separate
# existence check and makes re-running the cell safe.
dir_final = pasta_resultados+cenario3+"/lstm_opt_cv/"
os.makedirs(dir_final, exist_ok=True)

nome_estudo = "rio_doce_trecho_alto"
local_armaz = "sqlite:///{}.db".format(nome_estudo)
f = 1

# Create the study and persist it in local storage so it can be reopened later.
# NOTE(review): without load_if_exists=True, re-running this cell against an
# existing database presumably fails with a duplicated-study error -- confirm
# whether resuming the stored study would be preferable.
study_lstm_cv = opt.create_study(
    sampler=opt.samplers.TPESampler(seed=SEED),
    study_name=nome_estudo,
    direction="minimize",
    storage=local_armaz,
)

# Bind the fixed arguments under a new name: rebinding ``opt_lstm_cv`` itself
# would shadow the objective function and stack partials on every re-run.
objetivo_lstm_cv = partial(
    opt_lstm_cv,
    fh=f,
    n_folds=n_folds,
    dataset=dados_dl
)

study_lstm_cv.optimize(
    func=objetivo_lstm_cv,
    n_trials=100,
    catch=(FloatingPointError, ValueError, RuntimeError),
    show_progress_bar=True,
    n_jobs=1,
)

# Summary of the best trial only.
lstmcv_best_trial = {
    'modelo' : 'LSTM-CV',
    'fh': f,
    'best_trial' : study_lstm_cv.best_trial.number,
    'best_value' : study_lstm_cv.best_value,
    'best_params' : study_lstm_cv.best_params.copy()
}

# Save the "lstmcv_best_trial" dictionary for later analysis.
exportar_dict_json(
    v_dict=lstmcv_best_trial,
    pasta=dir_final,
    nome_arq="lstm-cv_best_trial_fh{}.json".format(f)
)

  0%|          | 0/100 [00:00<?, ?it/s]

Seed set to 1984
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type          | Params
--------------------------------------------------
0 | loss            | MAE           | 0     
1 | padder          | ConstantPad1d | 0     
2 | scaler          | TemporalNorm  | 0     
3 | hist_encoder    | LSTM          | 26.6 K
4 | context_adapter | Linear        | 623   
5 | mlp_decoder     | MLP           | 3.4 K 
--------------------------------------------------
30.6 K    Trainable params
0         Non-trainable params
30.6 K    Total params
0.122     Total estimated model params size (MB)
`Trainer.fit` stopped: `max_steps=200` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

In [64]:
# Reopen the study persisted earlier as "study_lstm_cv" (same study name and
# storage) so it can be analysed.
# NOTE(review): no direction or sampler is passed here; presumably the stored
# study's direction is used when the study already exists -- confirm.

estudo = opt.create_study(
    study_name=nome_estudo,
    storage=local_armaz,
    load_if_exists=True
)

In [65]:
# Plot the hyperparameter importances of the reloaded study.
opt.visualization.plot_param_importances(estudo)

In [66]:
# Display the hyperparameters of the study's best trial.
estudo.best_params

{'encoder_n_layers': 1,
 'encoder_hidden_size': 220,
 'encoder_dropout': 0.2040758975029051,
 'learning_rate': 0.0008173428080548387}

In [67]:
def _prever_e_comparar(previsor, df_futuro):
    """Forecast with ``previsor`` and left-join the observed ``y`` values.

    Args:
        previsor: Fitted ``NeuralForecast`` object.
        df_futuro: DataFrame with ``ds``, ``unique_id`` and ``y``; every
            column except ``y`` is passed to ``predict`` as ``futr_df``.

    Returns:
        The forecast DataFrame with the matching ``y`` column attached
        (joined on ``ds`` and ``unique_id``).
    """
    return pd.merge(
        left=previsor.predict(futr_df=df_futuro.drop(columns=["y"])),
        right=df_futuro[["ds", "unique_id", "y"]],
        on=["ds", "unique_id"],
        how="left",
    )


# Training data and the 2023 hold-out, both without the columns listed in
# colunas_vazao and with date attributes generated from "ds".
treino_dl = gerar_atributos_data(
    df=df_dados.drop(columns=colunas_vazao),
    atributos=atributos_data,
    col_data="ds"
)

teste_2023 = gerar_atributos_data(
    df=df_2023.drop(columns=colunas_vazao),
    atributos=atributos_data,
    col_data="ds"
)

# Fixed model settings for a one-day horizon. The HuberMQLoss loss and the
# hist_exog_list option used in earlier versions are disabled here.
parametros = {
    "h": 1,
    "random_seed": SEED,
    "futr_exog_list": colunas_chuva + atributos_data,
    "logger": False,
    "alias": "LSTM",
    "max_steps": 200,
    "context_size": look_back,
    "scaler_type": None,
    "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
    "enable_progress_bar": False,
}

# Best hyperparameters found by the study for fh=1.
melhores_parametros = estudo.best_params.copy()

modelo = LSTM(
    **melhores_parametros,
    **parametros,
)

nf = NeuralForecast(
    models=[modelo],
    freq="D",
    local_scaler_type="minmax",
)

nf.fit(df=treino_dl)

# First forecast, produced right after fitting on the training data alone.
df_resultado = _prever_e_comparar(nf, teste_2023)

# Start the expanding window from a plain copy of the training data.
# Concatenating with an empty DataFrame (as done previously) relies on
# deprecated pandas dtype handling for empty entries.
novo_treino = treino_dl.copy()

# Daily results are collected in a list and concatenated once at the end,
# instead of re-copying the accumulated frame on every iteration.
resultados = [df_resultado]

for i in range(len(teste_2023) - 1):
    # Day i becomes known and joins the training data; day i + 1 is forecast.
    novo_treino = pd.concat([novo_treino, teste_2023.iloc[[i]]])
    novo_teste = teste_2023.iloc[[i + 1]]

    # NOTE(review): fit is called again on the same NeuralForecast object;
    # check the `use_init_models` option to confirm whether each day restarts
    # from the initial model or continues training the current one.
    nf.fit(df=novo_treino)

    novo_resultado = _prever_e_comparar(nf, novo_teste)
    resultados.append(novo_resultado)

df_final = pd.concat(resultados)

Seed set to 1984
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name            | Type          | Params
--------------------------------------------------
0 | loss            | MAE           | 0     
1 | padder          | ConstantPad1d | 0     
2 | scaler          | TemporalNorm  | 0     
3 | hist_encoder    | LSTM          | 196 K 
4 | context_adapter | Linear        | 1.6 K 
5 | mlp_decoder     | MLP           | 3.4 K 
--------------------------------------------------
201 K     Trainable params
0         Non-trainable params
201 K     Total params
0.805     Total estimated model params size (MB)
`Trainer.fit` stopped: `max_steps=200` reached.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

In [None]:
# Display the accumulated forecasts and observations of the simulation.
df_final

In [None]:
# from tsfresh.feature_extraction.feature_calculators import number_peaks

# number_peaks(
#     x=df_final.LSTM,
#     n=3
# )

In [None]:
# Idea taken from: <https://plotly.com/python/ml-regression/#simple-actual-vs-predicted-plot>
# Scatter plot of forecast (x) against observation (y), with a dashed
# identity line as the visual reference for a perfect forecast.

salvar = SALVAR_PLOTS

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=df_final["LSTM"],
        y=df_final["y"],
        mode="markers",
        # Markers-only trace: the colour is set on the marker itself.
        marker=dict(color="blue"),
        hovertemplate="previsão: %{x}<br>observado: %{y}</br><extra></extra>",
        showlegend=False,
    )
)

# Identity line spanning the range of the observed values.
y_min = df_final["y"].min()
y_max = df_final["y"].max()
fig.add_shape(
    type="line",
    line=dict(
        color='red',
        dash='dash'
    ),
    x0=y_min, y0=y_min,
    x1=y_max, y1=y_max
)

fig.update_xaxes(
    title=dict(
        text="Previsão (m³/s)",
        font=dict(
            family="system-ui",
            size=18
        )
    ),
    zerolinecolor="black",
    showspikes=True,
    mirror=True,
    ticks="outside",
    showline=True,
    linecolor="black",
)

fig.update_yaxes(
    title=dict(
        text="Observado (m³/s)",
        font=dict(
            family="system-ui",
            size=18
        )
    ),
    zerolinecolor="black",
    showspikes=True,
    mirror=True,
    ticks="outside",
    showline=True,
    linecolor="black",
)

fig.update_layout(
    width=1500,
    height=700,
    hovermode="closest",
    plot_bgcolor="#c8d4e3",
    title=dict(
        text="Erro de previsão",
        font=dict(
            family="system-ui",
            size=24
        ),
    ),
)

if salvar:
    # Join the scenario folder and the file name with a path separator;
    # plain concatenation fused the file name onto the folder name.
    fig.write_image(
        os.path.join(pasta_resultados+cenario3, "relacao_observado_previsao.png")
    )
else:
    fig.show()

In [None]:
# Plot the final use-case results for the "LSTM" column at fh=1; rows of
# df_final containing NaN are dropped before plotting.
# NOTE(review): pasta_dstn receives pasta_resultados without the cenario3
# suffix used for the other outputs of this scenario -- confirm it is intended.
plot_resultados(
    df_merged=df_final.dropna(),
    modelo="LSTM",
    nome_curto="LSTM",
    fh=1,
    titulo="Resultado final do experimento de caso de uso",
    pasta_dstn=pasta_resultados,
    niveis=None,
    cores=None,
    salvar=SALVAR_PLOTS,
    n_decimal=5,
    metricas="hidrologia",
    marcadores=False
)

# FIM