# Imports básicos para todas as análises

In [71]:
# import os
# Verificando se isso aqui resolve o CUDAOutOfMemory 
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# import pickle
# import hydroeval as hev
# from permetrics import RegressionMetric

import os, re, json, torch, warnings

import                  \
    mlforecast as mlf,  \
    numpy as np,        \
    optuna as opt,      \
    pandas as pd,       \
    plotly.graph_objects as go, \
    plotly.express as px

from datetime import datetime
from functools import partial
from sklearn.model_selection import TimeSeriesSplit
from plotly.subplots import make_subplots

from statsforecast import StatsForecast
from statsforecast.models import SeasonalNaive

from mlforecast.utils import PredictionIntervals
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression

from skforecast.model_selection import grid_search_forecaster, bayesian_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.model_selection import select_features
from skforecast.ForecasterAutoreg import ForecasterAutoreg

from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

from neuralforecast import NeuralForecast
from neuralforecast.models import LSTM
from neuralforecast.losses.pytorch import HuberMQLoss

# A ser usado apenas para a análise de imputação de dados (ao invés de sempre aplicar o valor médio)
from sklearn.impute import KNNImputer

from sktime.param_est.seasonality import SeasonalityACF
from sktime.param_est.stationarity import StationarityADF
from sktime.performance_metrics.forecasting import (
    MeanAbsolutePercentageError,
    MeanSquaredError,
)

# from sktime.split import temporal_train_test_split
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import (
    acf,
    pacf
)

# Desativar as mensagens de 'warning' que ficam poluindo o output de alguns trechos de código.
warnings.filterwarnings("ignore")

# Para com a verborragia do log do Optuna
opt.logging.set_verbosity(opt.logging.WARNING)

# Wraper pra usar a engine do Plotly ao invocar a função "[DataFrame|Series].plot" do Pandas
pd.options.plotting.backend = "plotly"

# Reduz a precisão na multiplicação de matrizes, mas aumenta o desempenho e consome menos memória da GPU
# <https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision>
torch.set_float32_matmul_precision("highest")

# Métricas utilizadas
mape = MeanAbsolutePercentageError(symmetric=False) # Melhor valor possível é 0.0
rmse = MeanSquaredError(square_root=True) # Quanto menor, melhor

SALVAR_PLOTS = True
SEED = 1984

pasta_resultados = "./resultados/trecho_baixo/"

# Dicionário com as datas de início e término das estações
estacoes = {
    'verao': [(pd.Timestamp('12-21'), pd.Timestamp('03-20'))],
    'outono': [(pd.Timestamp('03-21'), pd.Timestamp('06-20'))],
    'inverno': [(pd.Timestamp('06-21'), pd.Timestamp('09-22'))],
    'primavera': [(pd.Timestamp('09-23'), pd.Timestamp('12-20'))]
}

# Mapeamento das estações para números
estacao_para_numero = {
    'verao': 1,
    'outono': 2,
    'inverno': 3,
    'primavera': 4
}

# Utilidades

In [2]:
# Métricas comumente aplicadas à Hidrologia

def kling_gupta_efficiency(y_true, y_pred):
    """
        Calcula a métrica Kling-Gupta Efficiency (KGE).
        Maior é melhor (Ótimo = 1)
        Limites=(-inf, 1]

        Parâmetros:
        y_true (array-like): Valores observados.
        y_pred (array-like): Valores previstos.

        Retorna:
        float: Valor da métrica KGE.
    """
    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)

    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred)  
    
    # Correlação linear
    r = np.corrcoef(y_true, y_pred)[0, 1]
    
    # Razão dos desvios padrão (beta)
    std_true = np.std(y_true, ddof=1)
    std_pred = np.std(y_pred, ddof=1)
    beta = std_pred / std_true
    
    # Razão das médias (gamma)
    mean_true = np.mean(y_true)
    mean_pred = np.mean(y_pred)
    gamma = mean_pred / mean_true
    
    # Cálculo do KGE
    kge = 1 - np.sqrt(
        ((r - 1) ** 2) +
        ((beta - 1) ** 2) +
        ((gamma - 1) ** 2)
    )
    
    return kge
##############################################################################################
def kling_gupta_efficiency_non_parametric(y_true, y_pred):
    """
        Código retirado de: <https://github.com/ThibHlln/hydroeval/blob/main/hydroeval/objective_functions.py>

        Calcula a métrica Kling-Gupta Efficiency não-paramétrica (KGEnp).
        Maior é melhor (Ótimo = 1)
        Limites=(-inf, 1]

        Traditional Kling-Gupta efficiencies (Gupta et al., 2009; Kling et al., 2012) range from -Inf to 1, and therefore KGEnp should do so.
        Essentially, the closer to 1, the more similar 'y_pred' and 'y_true' are.
        Knoben et al. (2019) showed that traditional Kling-Gupta (Gupta et al., 2009; Kling et al., 2012)
            values greater than -0.41 indicate that a model improves upon the mean flow benchmark, even if the model's KGE value is negative.

            Texto retirado de: <https://rdrr.io/cran/hydroGOF/man/KGEnp.html>

    """

    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)

    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred) 

    # calculate error in timing and dynamics r
    # (Spearman's correlation coefficient)
    sim_rank = np.argsort(np.argsort(y_pred, axis=0), axis=0)
    obs_rank = np.argsort(np.argsort(y_true, axis=0), axis=0)

    r_num = np.sum(
        (obs_rank - np.mean(obs_rank, axis=0, dtype=np.float64)) *
        (sim_rank - np.mean(sim_rank, axis=0, dtype=np.float64)),
        axis=0
    )

    r_den = np.sqrt(
        np.sum(
            (obs_rank - np.mean(obs_rank, axis=0, dtype=np.float64)) ** 2,
            axis=0
        ) *
        np.sum(
            (sim_rank - np.mean(sim_rank, axis=0, dtype=np.float64)) ** 2,
            axis=0
        )
    )

    r = r_num / r_den

    # calculate error in timing and dynamics alpha (flow duration curve)
    sim_fdc = np.sort(
        y_pred / (y_pred.shape[0] * np.mean(y_pred, axis=0, dtype=np.float64)),
        axis=0
    )

    obs_fdc = np.sort(
        y_true / (y_true.shape[0] * np.mean(y_true, axis=0, dtype=np.float64)),
        axis=0
    )

    alpha = 1 - 0.5 * np.sum(np.abs(sim_fdc - obs_fdc), axis=0)

    # calculate error in volume beta (bias of mean discharge)
    beta = (np.mean(y_pred, axis=0) / np.mean(y_true, axis=0, dtype=np.float64))

    # calculate the non-parametric Kling-Gupta Efficiency KGEnp
    kgenp = 1 - np.sqrt(
        ((r - 1) ** 2) +
        ((alpha - 1) ** 2) +
        ((beta - 1) ** 2)
    )

    return kgenp
##############################################################################################
def nash_sutcliffe_efficiency(y_true, y_pred):
    """
        Calcula a métrica Nash-Sutcliffe Efficiency (NSE).
        Maior é melhor (Ótimo = 1)
        Limites=(-inf, 1]

        Parâmetros:
        y_true (array-like): Valores observados.
        y_pred (array-like): Valores previstos.

        Retorna:
        float: Valor da métrica NSE.
    """
    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)

    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred)  
    
    # Média dos valores observados
    mean_y_true = np.mean(y_true)
    
    # Soma dos quadrados dos resíduos
    ss_res = np.sum((y_true - y_pred) ** 2)
    
    # Soma dos quadrados totais
    ss_tot = np.sum((y_true - mean_y_true) ** 2)
    
    # Cálculo do NSE
    nse = 1 - (ss_res / ss_tot)
    
    return nse
##############################################################################################
def coefficient_determination(y_true, y_pred):
    """
        Calcula o Coeficiente de Determinação (R²).
        Maior é melhor (Ótimo = 1)
        Limites=(-inf, 1]

        Parâmetros:
        y_true (array-like): Valores observados.
        y_pred (array-like): Valores previstos.

        Retorna:
        float: Valor do R².
    """
    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)

    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred)  
    
    # Média dos valores observados
    mean_y_true = np.mean(y_true)
    
    # Soma dos quadrados dos resíduos
    ss_res = np.sum((y_true - y_pred) ** 2)
    
    # Soma dos quadrados totais
    ss_tot = np.sum((y_true - mean_y_true) ** 2)
    
    # Cálculo do R²
    r2 = 1 - (ss_res / ss_tot)
    
    return r2
##############################################################################################
def percentage_bias(y_true, y_pred):
    """
        O Viés Percentual - Percentual Bias (PBIAS) mede a tendência média dos valores simulados de serem maiores ou menores que os observados.
        O valor ideal de PBIAS é 0.0, com valores de baixa magnitude indicando simulação precisa do modelo.
        Valores positivos indicam viés de *SUPERESTIMAÇÃO*
        Valores negativos indicam viés de *SUBESTIMAÇÃO*

        Fonte: <https://search.r-project.org/CRAN/refmans/hydroGOF/html/pbias.html>

        Parâmetros:
        y_true (array-like): Valores observados.
        y_pred (array-like): Valores previstos.

        Retorna:
        float: Valor do PBIAS em porcentagem.
    """
    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)

    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred)  
  
    pbias = (np.sum(y_pred - y_true) / np.sum(y_true)) * 100
    
    return pbias
##############################################################################################
def deviation_runoff_volume(y_true, y_pred):
    """
        Calcula o Deviation of the Runoff Volumes
        Valor próximo de 1.0 indica que o modelo está indo bem nas previsões

        Parâmetros:
        y_true (array-like): Valores observados.
        y_pred (array-like): Valores previstos.

        Retorna:
        float: Valor do DRV.
    """
    if not isinstance(y_true, np.ndarray):
        y_true = np.array(y_true)

    if not isinstance(y_pred, np.ndarray):
        y_pred = np.array(y_pred)  
  
    drv = np.sum(y_pred) / np.sum(y_true)
    
    return drv

In [86]:
def plot_serie_temporal(
    dataset: pd.DataFrame,
    coluna : str,
    tp_coluna : str,
    plot_title: str,
    line_color: str,
    short_name: str,
    pasta_resultados: str = "./",
    salvar: bool = False,
) -> None:
    """
        Método para desenhar o gráfico da Série Temporal completa.

        Parâmetros:
            dataset: o DataFrame com os dados da série temporal para desenhar o gráfico

            coluna: a coluna para a qual se deseja gerar o gráfico

            tp_coluna: é um campo do tipo booleano que faz a distinção entre "vazão" e "chuva" (os dois únicos tipos de séries temporais que temos)
                True -> significa "vazão"
                False -> significa "chuva"

            plot_title: uma string com um título para o gráfico

            line_color: uma cor desejada para a linha desenhada da série temporal

            short_name: uma string que será usada para escrever a legenda do gráfico
            
            salvar: se deverá salvar o gráfico em disco (quando True) ou desenhar na tela (quando False)
    """

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=dataset["ds"],
            y=dataset[coluna],
            mode="lines",
            name=short_name,
            line=dict(
                color=line_color,
                width=2
            ),
        ),
    )

    fig.update_yaxes(
        title=dict(
            text="Vazão (m³/s)" if tp_coluna == "vazao" else "Precipitação (mm/dia)",
            font=dict(
                family="system-ui",
                size=18
            )
        ),
        zerolinecolor="black",
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        title=dict(
            text="Período",
            font=dict(
                family="system-ui",
                size=18
            )
        ),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode="x unified",
        plot_bgcolor="#c8d4e3",
        title=dict(
            text=plot_title,
            font=dict(
                family="system-ui",
                size=24
            )
        ),
    )

    if salvar:
        now = datetime.now()
        fig.write_image(
            pasta_resultados+"SérieTemporal_col[{col}]_{dt}.png".format(
                col=coluna,
                dt=now.strftime("%Y-%m-%d_%H-%M-%S")
            )
        )
    else:
        fig.show()
# ============================================================================================ #
def plot_cv(
        fh : int,
        df_merged : pd.DataFrame,
        df_resultado : pd.DataFrame,
        regressor : str,
        data_inicio : str,
        n_decimais : int,
        titulo_plot : str,
        pasta_dstn : str,
        salvar: bool = False,
) -> None:

    fig = go.Figure()

    fig = make_subplots(
        rows=2,
        cols=1,
        vertical_spacing=0.2,
        specs=[
            [{"type": "scatter"}],
            [{"type": "table"}]
        ],
    )

    fig.add_trace(
        go.Scatter(
            x=df_merged[df_merged['ds'] >= data_inicio]["ds"],
            y=df_merged[df_merged['ds'] >= data_inicio]["y"],
            mode="lines+markers",
            name="Observado",
            line=dict(
                color="#000000",
                width=2
            ),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df_merged[df_merged['ds'] >= data_inicio]["ds"],
            y=df_merged[df_merged['ds'] >= data_inicio][regressor],
            mode="lines+markers",
            name="Previsão",
            line=dict(color="red"),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Table(
            header=dict(
                values=[
                    "MAPE",
                    "RMSE",
                    "PBIAS (%)",
                    "DRV",
                ],
                font=dict(size=18),
                align="center"
            ),
            cells=dict(
                values=[
                    round(mape(df_resultado.y, df_resultado[regressor]), n_decimais),
                    round(rmse(df_resultado.y, df_resultado[regressor]), n_decimais),
                    round(percentage_bias(df_resultado.y, df_resultado[regressor]), n_decimais),
                    round(deviation_runoff_volume(df_resultado.y, df_resultado[regressor]), n_decimais),
                ],
                font=dict(size=18),
                height=30,
                align="left",
            ),
        ),
        row=2,
        col=1,
    )

    fig.update_yaxes(
        title=dict(
            text="Vazão (m³/s)",
            font=dict(
                family="system-ui",
                size=22
            )
        ),
        zerolinecolor="black",
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        title=dict(
            text="Período",
            font=dict(
                family="system-ui",
                size=22
            )
        ),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode="x unified",
        plot_bgcolor="#c8d4e3",
        title=dict(
            text=titulo_plot,
            font=dict(
                family="system-ui",
                size=30
            )
        ),
    )

    if salvar:
        if not os.path.exists(pasta_dstn):
            os.makedirs(pasta_dstn)

        now = datetime.now()
        fig.write_image(
            pasta_dstn+"/cv_{reg}_fh{fh}_{dt}.png".format(
                reg=regressor,
                fh=fh,
                dt=now.strftime("%Y-%m-%d_%H-%M-%S")
            )
        )
    else:
        fig.show()
# ============================================================================================ #
def decomp_series(
    df: pd.DataFrame,
    tendencia: bool,
    sazonalidade: bool,
    residuo: bool,
    salvar: bool = False,
) -> None:
    # A decomposição das séries temporais ajuda a detectar padrões (tendência, sazonalidade)
    #   e identificar outras informações que podem ajudar na interpretação do que está acontecendo.

    # cols = df.drop(columns=["ds", "unique_id"]).columns.to_list()
    cols = df.drop(columns=["ds"]).columns.to_list()
    for c in cols:
        
        # Utilizei modelo do tipo "add" (aditivo) pois tem séries com valores 0 (zero).
        # Período de 365 dias porque o que me interessa é capturar comportamentos anuais.
        decomp = seasonal_decompose(
            df[c],
            period=365, # 365 dias = 1 ano
            model="add"
        )
        fig_decomp = make_subplots(specs=[[{"secondary_y": True}]])

        fig_decomp.add_trace(
            go.Scatter(
                x=df.ds,
                y=decomp.observed,
                name="observado",
                mode="lines",
                showlegend=True,
            ),
            secondary_y=False,
        )

        if tendencia:
            fig_decomp.add_trace(
                go.Scatter(
                    x=df.ds,
                    y=decomp.trend,
                    name="tendência",
                    mode="lines",
                    showlegend=True,
                ),
                secondary_y=True,
            )

        if sazonalidade:
            fig_decomp.add_trace(
                go.Scatter(
                    x=df.ds,
                    y=decomp.seasonal,
                    name="sazonalidade",
                    mode="lines",
                    showlegend=True,
                ),
                secondary_y=True,
            )

        if residuo:
            fig_decomp.add_trace(
                go.Scatter(
                    x=df.ds,
                    y=decomp.resid,
                    name="resíduo",
                    mode="lines",
                    showlegend=True,
                ),
                secondary_y=False,
            )

        fig_decomp.update_yaxes(
            title=dict(
                text="observado/resíduo",
                font=dict(family="system-ui", size=18)
            ),
            secondary_y=False,
            zerolinecolor="black",
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig_decomp.update_yaxes(
            title=dict(
                text="tendência/sazonalidade",
                font=dict(family="system-ui", size=18)
            ),
            secondary_y=True,
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig_decomp.update_xaxes(
            title=dict(
                text="Período",
                font=dict(family="system-ui", size=18)
            ),
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig_decomp.update_layout(
            width=1500,
            height=700,
            plot_bgcolor="#c8d4e3",
            hovermode="x unified",
            title=dict(
                text="Decomposição da série temporal: {col}".format(col=c),
                font=dict(family="system-ui", size=24),
            ),
        )

        if salvar:
            fig_decomp.write_image(
                pasta_resultados+"aed/decomposicao_serie_{}.png".format(c)
            )
        else:
            fig_decomp.show()
# ============================================================================================ #
def estacionariedade(
    df: pd.DataFrame,
    sp: int
) -> None:

    # Avaliar a estacionariedade de cada uma das séries e a sazonalidade (se houver)
    # Existindo sazonalidade, qual a lag (ou quais lags) se encaixam nesta sazonalidade
    # cols = df.drop(columns=["ds", "unique_id"]).columns.to_list()
    cols = df.drop(columns=["ds"]).columns.to_list()
    for c in cols:
        ts = df[c]
        sty_est = StationarityADF()
        sty_est.fit(ts)
        print(c, sty_est.get_fitted_params()["stationary"])

        # Este teste de sazonalidade deve ser aplicado a séries estacionárias.
        # Se precisar tornar uma série em estacionária, tem de aplicar diferenciação antes.
        if sty_est.get_fitted_params()["stationary"]:
            sp_est = SeasonalityACF( # Minha intenção é ter certeza de que existe sazonalidade anual (365 dias)
                candidate_sp=sp,
                nlags=len(df[c])
            )
            sp_est.fit(ts)
            sp_est.get_fitted_params()
            print(c, sp_est.get_fitted_params()["sp_significant"])
# ============================================================================================ #
def mapa_correlacao(
    df: pd.DataFrame,
    medida: str = "pearson",
    salvar: bool = False
) -> None:

    if medida == "dtw":
        from dtaidistance import dtw

        dtw_dist = dtw.distance_matrix_fast(df.drop(columns=["ds"]).T.values)
        # dtw_dist = dtw.distance_matrix_fast(df.drop(columns=["ds", "unique_id"]).T.values)
        
        df_dtw_dist = pd.DataFrame(
            data=dtw_dist,
            # index=df.drop(columns=["ds", "unique_id"]).columns.to_list(),
            # columns=df.drop(columns=["ds", "unique_id"]).columns.to_list(),
            index=df.drop(columns=["ds"]).columns.to_list(),
            columns=df.drop(columns=["ds"]).columns.to_list(),
        )

        fig = go.Figure()

        fig.add_trace(
            go.Heatmap(
                x=df_dtw_dist.columns,
                y=df_dtw_dist.columns,
                z=df_dtw_dist,
                text=df_dtw_dist.values,
                texttemplate="%{text:.7f}",
                textfont={"size": 14},
                colorscale="rainbow",
                hovertemplate="%{y}<br>%{x}</br><extra></extra>",
            )
        )

        fig.update_yaxes(
            tickfont=dict(family="system-ui", size=14),
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig.update_xaxes(
            tickfont=dict(family="system-ui", size=14),
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig.update_layout(
            width=1500,
            height=700,
            title=dict(
                text="Mapa de correlação (DTW)",
                font=dict(family="system-ui", size=24)
            ),
        )

    elif medida == "pearson":

        # corr = df.drop(columns=["ds", "unique_id"]).corr()
        corr = df.drop(columns=["ds"]).corr()

        fig = go.Figure()

        fig.add_trace(
            go.Heatmap(
                x=corr.columns,
                y=corr.columns,
                z=corr,
                text=corr.values,
                texttemplate="%{text:.7f}",
                textfont={"size": 14},
                colorscale="rainbow",
                hovertemplate="%{y}<br>%{x}</br><extra></extra>",
            )
        )

        fig.update_yaxes(
            tickfont=dict(family="system-ui", size=14),
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig.update_xaxes(
            tickfont=dict(family="system-ui", size=14),
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig.update_layout(
            width=1500,
            height=700,
            title=dict(
                text="Mapa de correlação (Pearson)",
                font=dict(family="system-ui", size=24),
            ),
        )

    else:
        raise Exception("Opção errada. ('dtw' ou 'pearson')")

    if salvar:
        fig.write_image(
            pasta_resultados+"aed/mapa_correlacao_{medida}.png".format(medida=medida)
        )
    else:
        fig.show()
# ============================================================================================ #
def plot_linha_tabela(
    df_merged: pd.DataFrame,
    regressor: str,
    plot_title: str,
    line_color: str,
    short_name: str,
    salvar: bool = False,
) -> None:

    fig = make_subplots(
        rows=2,
        cols=1,
        vertical_spacing=0.2,
        specs=[
            [{"type": "scatter"}],
            [{"type": "table"}]
        ],
    )

    fig.add_trace(
        go.Scatter(
            x=df_merged["ds"],
            y=df_merged["y"],
            mode="lines+markers",
            name="observado",
            line=dict(
                color="#000000",
                width=2
            ),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df_merged["ds"],
            y=df_merged[regressor],
            mode="lines+markers",
            name=short_name,
            line=dict(color=line_color),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Table(
            header=dict(
                values=[
                    "MAPE",
                    "RMSE",
                    "PBIAS (%)",
                    "DRV",
                ],
                font=dict(size=14),
                align="center"
            ),
            cells=dict(
                values=[
                    mape(df_merged.y, df_merged[regressor]),
                    rmse(df_merged.y, df_merged[regressor]),
                    percentage_bias(df_merged.y, df_merged[regressor]),
                    deviation_runoff_volume(df_merged.y, df_merged[regressor]),
                ],
                font=dict(size=12),
                height=30,
                align="left",
            ),
        ),
        row=2,
        col=1,
    )

    fig.update_yaxes(
        title=dict(text="Vazão (m³/s)", font=dict(family="system-ui", size=18)),
        zerolinecolor="black",
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        title=dict(text="Período", font=dict(family="system-ui", size=18)),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode="x unified",
        plot_bgcolor="#c8d4e3",
        title=dict(text=plot_title, font=dict(family="system-ui", size=24)),
    )

    if salvar:
        now = datetime.now()
        fig.write_image(
            pasta_resultados+"{reg}_{dt}.png".format(reg=regressor, dt=now.strftime("%Y-%m-%d_%H-%M-%S"))
        )
    else:
        fig.show()
# ============================================================================================ #
def cria_plot_correlacao(
    serie: pd.Series,
    n_lags: int,
    plot_pacf: bool = False,
    salvar: bool = False
) -> None:

    corr_array = (
        pacf(serie.dropna(), nlags=n_lags, alpha=0.05)
        if plot_pacf
        else acf(serie.dropna(), nlags=n_lags, alpha=0.05)
    )

    lower_y = corr_array[1][:, 0] - corr_array[0]
    upper_y = corr_array[1][:, 1] - corr_array[0]

    fig = go.Figure()

    # Desenha as linhas verticais pretas
    [
        fig.add_scatter(
            x=(x, x),
            y=(0, corr_array[0][x]),
            mode="lines",
            line_color="black",
            hovertemplate="<extra></extra>",
        )
        for x in range(len(corr_array[0]))
    ]

    # Desenha as bolinhas vermelhas
    fig.add_scatter(
        x=np.arange(len(corr_array[0])),
        y=corr_array[0],
        mode="markers",
        marker_color="red",
        marker_size=12,
        hovertemplate="x = %{x}<br>y = %{y}<extra></extra>",
    )

    # Desenha a 'nuvem' clarinha acima do eixo x
    fig.add_scatter(
        x=np.arange(len(corr_array[0])),
        y=upper_y,
        mode="lines",
        line_color="rgba(255,255,255,0)",
        hovertemplate="<extra></extra>",
    )

    # Desenha a 'nuvem' clarinha abaixo do eixo x
    fig.add_scatter(
        x=np.arange(len(corr_array[0])),
        y=lower_y,
        mode="lines",
        fillcolor="rgba(32, 146, 230,0.3)",
        fill="tonexty",
        line_color="rgba(255,255,255,0)",
        hovertemplate="<extra></extra>",
    )

    fig.update_traces(showlegend=False)

    fig.update_xaxes(
        range=[-1, n_lags + 1],
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_yaxes(
        zerolinecolor="black",  # Quando 'y=0' a linha é preta
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    title = (
        "Autocorrelação Parcial (PACF) para n_lags={n}".format(n=n_lags)
        if plot_pacf
        else "Autocorrelação (ACF) para n_lags={n}".format(n=n_lags)
    )
    fig.update_layout(
        width=1500,
        height=700,
        plot_bgcolor="#c8d4e3",
        title=dict(text=title, font=dict(family="system-ui", size=24)),
    )

    if salvar:
        (
            fig.write_image(pasta_resultados+"aed/plot_pacf.png")
            if plot_pacf
            else fig.write_image(pasta_resultados+"aed/plot_acf.png")
        )
    else:
        fig.show()
# ============================================================================================ #
def cria_dataframe_futuro(
    df_futr: pd.DataFrame,
    df_train: pd.DataFrame,
    df_test: pd.DataFrame,
    tp_valor: str,
    n_lags: int,
    date_features: list,
    cols: list,
) -> pd.DataFrame:
    
    """
        tp_valor == "ultimo": # Usa o último valor conhecido
        tp_valor == "media":  # Usa o valor médio de cada coluna vazão
        tp_valor == "ml":     # Usa um modelo XGBoost para gerar previsão futuras das vazões auxiliares
    """

    if tp_valor == "ultimo":  # Usa o último valor conhecido
        for c in cols:
            df_futr[c] = df_train[c].iat[-1]

    elif tp_valor == "media":  # Usa o valor médio de cada coluna vazão
        for c in cols:
            df_futr[c] = df_train[c].mean()

    elif tp_valor == "ml":
        from xgboost import XGBRegressor

        for c in cols:
            fcst = mlf.MLForecast(
                models=XGBRegressor(seed=SEED),
                freq="D",
                lags=[i + 1 for i in range(n_lags)],
                date_features=date_features,
            )

            df_temp = df_train[["ds", "unique_id", c]]

            fcst.fit(
                df_temp,
                id_col="unique_id",
                time_col="ds",
                target_col=c,
                static_features=[],
            )

            df_preds = fcst.predict(h=len(df_futr)).reset_index()  # macetasso pra não dar erro de index
            df_futr[c] = df_preds["XGBRegressor"]

    else:
        raise Exception("Opção inválida! (ultimo | media | ml)")

    df_futr = pd.merge(
        left=df_futr,
        right=df_test.drop(columns=cols + ["y"]),
        on=["ds", "unique_id"],
        how="left",
    )

    return df_futr
# ============================================================================================ #
def distribuicao_dados(
    df_original: pd.DataFrame,
    df_1: pd.DataFrame,
    nome_1: str,
    df_2: pd.DataFrame,
    nome_2: str,
    salvar: bool = False,
) -> None:

    # cols = np.asarray(df_original.drop(columns=["ds", "unique_id"]).columns)
    cols = df_original.drop(columns=["ds"]).columns.to_list()

    for c in cols:
        fig = go.Figure()

        fig.add_trace(
            go.Box(
                y=df_original[c].values,
                name="original",
                marker_color="darkblue",
                jitter=0.5,
                pointpos=-2,
                boxpoints="all",
                boxmean="sd",
            )
        )

        fig.add_trace(
            go.Box(
                y=df_1[c].values,
                name=nome_1,
                marker_color="coral",
                jitter=0.5,
                pointpos=-2,
                boxpoints="all",
                boxmean="sd",
            )
        )

        fig.add_trace(
            go.Box(
                y=df_2[c].values,
                name=nome_2,
                marker_color="olive",
                jitter=0.5,
                pointpos=-2,
                boxpoints="all",
                boxmean="sd",
            )
        )

        fig.update_xaxes(
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig.update_yaxes(
            zerolinecolor="black",
            mirror=True,
            ticks="outside",
            showline=True,
            linecolor="black",
        )

        fig.update_layout(
            width=1500,
            height=1000,
            plot_bgcolor="#c8d4e3",
            title=dict(
                text="Distribuição {c}".format(c=c),
                font=dict(family="system-ui", size=24),
            ),
        )

        if salvar:
            fig.write_image(
                pasta_resultados+"aed/distribuicao_dados_{}.png".format(c)
            )
        else:
            fig.show()
# ============================================================================================ #
def exportar_dict_json(
    v_dict: dict,
    pasta: str,
    nome_arq: str
) -> None:
    
    if not os.path.exists(pasta):
        os.makedirs(pasta)

    json_str = json.dumps(v_dict, indent=4)
    with open(pasta + nome_arq, "w") as a:
        a.write(json_str)
# ============================================================================================ #
def plot_divisao_treino_teste(
    df_treino: pd.DataFrame,
    df_teste: pd.DataFrame,
    col_data: str = "ds",
    col_plot: str = "y",
    salvar: bool = False,
) -> None:

    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df_treino[col_data],
            y=df_treino[col_plot],
            mode="lines",
            name="treino"
        )
    )

    fig.add_trace(
        go.Scatter(
            x=df_teste[col_data],
            y=df_teste[col_plot],
            mode="lines",
            name="teste"
        )
    )

    fig.update_yaxes(
        title=dict(
            text="Vazão (m³/s) / Precipitação (mm/dia)",
            font=dict(family="system-ui", size=18)
        ),
        zerolinecolor="black",
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        title=dict(
            text="Período",
            font=dict(family="system-ui", size=18)
        ),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_layout(
        width=1500,
        height=700,
        hovermode="x unified",
        plot_bgcolor="#c8d4e3",
        title=dict(
            text="Vazão 'y' (target)",
            font=dict(family="system-ui", size=24)
        ),
    )

    if salvar:
        fig.write_image(
            pasta_resultados+"aed/divisao_treino_teste_{c}.png".format(c=col_plot)
        )
    else:
        fig.show()
# ============================================================================================ #
def plot_resultados(
    df_merged : pd.DataFrame,
    modelo : str,
    nome_curto : str,
    fh : int,
    titulo : str,
    pasta_dstn : str,
    niveis : list = None,
    cores : list = None,
    salvar : bool = False,
    n_decimal : int = 5,
    metricas : str = "padrao",
    marcadores : bool = True,
    indice_ds : bool = False
) -> None:
    """
        df_merged:
            O DataFrame com todas as previsões, os valores observados e os quartis (se houver)

        modelo:
            String com o nome do modelo que será referenciado no DataFrame (df_merged[modelo])

        nome_curto:
            Nome curto do modelo para colocar na legenda do gráfico

        fh:
            Horizonte de previsão para colocar no título do gráfico

        titulo:
            O título do gráfico

        pasta_dstn:
            Pasta onde salvar a imagem

        niveis:
            Array, em ordem do maior para o menor, com os níveis dos quartis

        cores:
            Cores para aplicar ao plotar os quartis no gráfico

        salvar:
            Se vai salvar diretamente para um arquivo ou se vai renderizar em tela o gráfico

        n_decimal:
            Para arredondar as casas decimais dos números na tabela com as métricas

        metricas:
            Se vai gerar a tabela com as métricas resumidas com as métricas padrão ou as mais comumente usadas em Hidrologia

        marcadores:
            Se as linhas dos gráficos serão lisas ou com marcadores
    """

    mtrcs = {}
    if metricas == "padrao":
        mtrcs[modelo] = {
            "MAPE": mape(df_merged["y"], df_merged[modelo]),
            "RMSE": rmse(df_merged["y"], df_merged[modelo]),
            "PBIAS (%)": percentage_bias(df_merged["y"], df_merged[modelo]),
            # "DRV": deviation_runoff_volume(df_merged["y"], df_merged[modelo]),
        }

        if niveis is not None:
            for n in niveis:
                mtrcs[modelo+"-lo-"+n] = {
                    "MAPE": mape(df_merged["y"], df_merged[modelo+"-lo-"+n]),
                    "RMSE": rmse(df_merged["y"], df_merged[modelo+"-lo-"+n]),
                    "PBIAS (%)": percentage_bias(df_merged["y"], df_merged[modelo+"-lo-"+n]),
                    # "DRV": deviation_runoff_volume(df_merged["y"], df_merged[modelo+"-lo-"+n]),
                }

                mtrcs[modelo+"-hi-"+n] = {
                    "MAPE": mape(df_merged["y"], df_merged[modelo+"-hi-"+n]),
                    "RMSE": rmse(df_merged["y"], df_merged[modelo+"-hi-"+n]),
                    "PBIAS (%)": percentage_bias(df_merged["y"], df_merged[modelo+"-hi-"+n]),
                    # "DRV": deviation_runoff_volume(df_merged["y"], df_merged[modelo+"-hi-"+n]),
                }
    elif metricas == "hidrologia":
        mtrcs[modelo] = {
            "MAPE": mape(df_merged["y"], df_merged[modelo]),
            "KGEnp": kling_gupta_efficiency_non_parametric(df_merged["y"], df_merged[modelo]),
            "PBIAS (%)": percentage_bias(df_merged["y"], df_merged[modelo]),
            # "DRV": deviation_runoff_volume(df_merged["y"], df_merged[modelo]),
        }

        if niveis is not None:
            for n in niveis:
                mtrcs[modelo+"-lo-"+n] = {
                    "MAPE": mape(df_merged["y"], df_merged[modelo+"-lo-"+n]),
                    "KGEnp": kling_gupta_efficiency_non_parametric(df_merged["y"], df_merged[modelo+"-lo-"+n]),
                    "PBIAS (%)": percentage_bias(df_merged["y"], df_merged[modelo+"-lo-"+n]),
                    # "DRV": deviation_runoff_volume(df_merged["y"], df_merged[modelo+"-lo-"+n]),
                }

                mtrcs[modelo+"-hi-"+n] = {
                    "MAPE": mape(df_merged["y"], df_merged[modelo+"-hi-"+n]),
                    "KGEnp": kling_gupta_efficiency_non_parametric(df_merged["y"], df_merged[modelo+"-hi-"+n]),
                    "PBIAS (%)": percentage_bias(df_merged["y"], df_merged[modelo+"-hi-"+n]),
                    # "DRV": deviation_runoff_volume(df_merged["y"], df_merged[modelo+"-hi-"+n]),
                }
    else:
        raise Exception("Opção inválida para métrica! ('padrao' | 'hidrologia')")

    df_tbl = pd.DataFrame(mtrcs).T.reset_index(names="Modelo").round(n_decimal) # Arredondando para "n_decimal" casas decimais

    fig = make_subplots(
        rows=2,
        cols=1,
        vertical_spacing=0.2,
        specs=[
            [{"type": "scatter"}],
            [{"type": "table"}]
        ],
    )

    if niveis is not None and cores is not None:
        for n, c in zip(niveis, cores):
            fig.add_trace(
                go.Scatter(
                    x=df_merged.index if indice_ds else df_merged["ds"],
                    y=df_merged[modelo+"-hi-"+n],
                    mode="lines+markers" if marcadores else "lines",
                    name=nome_curto+"-hi-"+n,
                    line=dict(color=c),
                ),
                row=1,
                col=1,
            )

            fig.add_trace(
                go.Scatter(
                    x=df_merged.index if indice_ds else df_merged["ds"],
                    y=df_merged[modelo+"-lo-"+n],
                    mode="lines+markers" if marcadores else "lines",
                    name=nome_curto+"-lo-"+n,
                    fill="tonexty",
                    line=dict(color=c),
                ),
                row=1,
                col=1,
            )

    fig.add_trace(
        go.Scatter(
            x=df_merged.index if indice_ds else df_merged["ds"],
            y=df_merged[modelo],
            mode="lines+markers" if marcadores else "lines",
            name=nome_curto,
            line=dict(
                color="magenta",
                width=4
            ),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Scatter(
            x=df_merged.index if indice_ds else df_merged["ds"],
            y=df_merged["y"],
            mode="lines+markers" if marcadores else "lines",
            name="observado",
            line=dict(
                color="black",
                width=2
            ),
        ),
        row=1,
        col=1,
    )

    fig.add_trace(
        go.Table(
            header=dict(
                values=df_tbl.columns.to_list(),
                font=dict(size=18),
                align="center"
            ),
            cells=dict(
                values=df_tbl.T,
                font=dict(size=18),
                height=30,
                align="left"
            ),
        ),
        row=2,
        col=1,
    )

    fig.update_yaxes(
        title=dict(
            text="Vazão (m³/s)",
            font=dict(
                family="system-ui",
                size=22
            )
        ),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        title=dict(
            text="Período",
            font=dict(
                family="system-ui",
                size=22
            )
        ),
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_traces(
        hovertemplate=None,
        row=1,
        col=1
    )

    fig.update_layout(
        width=1500,
        height=1000,
        plot_bgcolor="#c8d4e3",
        hovermode="x unified",
        title=dict(
            text=titulo,
            font=dict(
                family="system-ui",
                size=30
            ),
        ),
    )

    if salvar:
        if not os.path.exists(pasta_dstn):
            os.makedirs(pasta_dstn)

        now = datetime.now()
        fig.write_image(
            pasta_dstn+"/{md}_fh{fh}_{dt}.png".format(
                fh=fh,
                md=modelo,
                dt=now.strftime("%Y-%m-%d_%H-%M-%S")
            )
        )
    else:
        fig.show()
# ============================================================================================ #
def determinar_estacao(data, estacao):
    # Substituir 29 de fevereiro por 28 de fevereiro
    if data.month == 2 and data.day == 29:
        data = data.replace(day=28)

    data = pd.Timestamp(data.strftime('%m-%d'))  # Considerar apenas o mês e o dia
    for estacao, periodos in estacoes.items():
        for inicio, fim in periodos:
            if inicio <= data <= fim or (inicio > fim and (data >= inicio or data <= fim)):
                return estacao_para_numero[estacao]
    return 'desconhecida'
# ============================================================================================ #
# Função principal para adicionar colunas ao DataFrame
def adicionar_estacao(
        df : pd.DataFrame,
        coluna_data : str
    ) -> pd.DataFrame:
    
    df['estacao'] = df[coluna_data].apply(lambda x: determinar_estacao(x, estacoes))
    return df
# ============================================================================================ #
def gerar_atributos_data(
    df : pd.DataFrame,
    atributos : list[str],
    col_data : str = "ds",
) -> pd.DataFrame:

  df_result = df.copy()
  if atributos is not None:
    for a in atributos:
      if a in ("week", "weekofyear"):
        w = pd.to_datetime(df_result[col_data]).dt.isocalendar()
        df_result[a] = getattr(w, a)
      elif a == "estacao":
        df_result = adicionar_estacao(df_result, col_data)
      else:
        df_result[a] = getattr(pd.to_datetime(df_result[col_data]).dt, a)

  return df_result
# ============================================================================================ #
# Preencher com dados do ano anterior (max_years=1) ou média dos anos anteriores (max_years>1)
def fill_with_previous_years_average(
        df : pd.DataFrame,
        columns : list[str],
        date_col : str = 'ds',
        max_years : int = 3
    ) -> pd.DataFrame :

    df_filled = df.copy()
    for col in columns:
        for i in range(len(df)):
            if pd.isnull(df_filled.loc[i, col]):
                sum_values = 0
                count_values = 0
                for year in range(1, max_years + 1):
                    previous_year_date = df_filled.loc[i, date_col] - pd.DateOffset(years=year)
                    previous_year_value = df_filled.loc[df_filled[date_col] == previous_year_date, col]
                    if not previous_year_value.empty:
                        sum_values += previous_year_value.values[0]
                        count_values += 1
                if count_values > 0:
                    df_filled.loc[i, col] = sum_values / count_values
    return df_filled

# Carregando e imputando dados

In [4]:
df = pd.read_excel(
    io="./arquivos_finais/baixo_rio_jequitinhonha_final.xlsx",
    sheet_name=0,
    index_col=0,
    header=0,
    parse_dates=["Data"],
)

In [5]:
df

Unnamed: 0_level_0,c_cv_01640000,t_cv_54790000,y,c_vz_54780000
Data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-01,0.6,,,187.7730
2013-01-02,0.0,,,105.3130
2013-01-03,13.2,,,101.8800
2013-01-04,0.0,,,98.5206
2013-01-05,0.0,,,98.5206
...,...,...,...,...
2023-12-27,0.0,0.0,211.450000,442.6400
2023-12-28,0.0,0.0,167.420833,131.6670
2023-12-29,0.0,0.6,117.235417,121.8540
2023-12-30,0.0,0.0,94.239583,93.7937


In [6]:
# df = df.asfreq("D")
# df.index.name = "ds"
# df

In [7]:
# df["unique_id"] = 1
df = df.reset_index()
df = df.rename(columns={
    "Data": "ds",
    # "c_vz_56994500": "y"
})

df

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y,c_vz_54780000
0,2013-01-01,0.6,,,187.7730
1,2013-01-02,0.0,,,105.3130
2,2013-01-03,13.2,,,101.8800
3,2013-01-04,0.0,,,98.5206
4,2013-01-05,0.0,,,98.5206
...,...,...,...,...,...
4012,2023-12-27,0.0,0.0,211.450000,442.6400
4013,2023-12-28,0.0,0.0,167.420833,131.6670
4014,2023-12-29,0.0,0.6,117.235417,121.8540
4015,2023-12-30,0.0,0.0,94.239583,93.7937


In [8]:
# df.columns

In [9]:
# Rearranjando a posição das colunas pra facilitar a leitura
# O escopo do trabalho é aplicar dados de chuva e atributos categóricos na previsão de vazão.

df = df[[
    'ds',
    'c_cv_01640000',
    't_cv_54790000',
    'y'
]]

df

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y
0,2013-01-01,0.6,,
1,2013-01-02,0.0,,
2,2013-01-03,13.2,,
3,2013-01-04,0.0,,
4,2013-01-05,0.0,,
...,...,...,...,...
4012,2023-12-27,0.0,0.0,211.450000
4013,2023-12-28,0.0,0.0,167.420833
4014,2023-12-29,0.0,0.6,117.235417
4015,2023-12-30,0.0,0.0,94.239583


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4017 entries, 0 to 4016
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   ds             4017 non-null   datetime64[ns]
 1   c_cv_01640000  4017 non-null   float64       
 2   t_cv_54790000  3744 non-null   float64       
 3   y              3485 non-null   float64       
dtypes: datetime64[ns](1), float64(3)
memory usage: 125.7 KB


In [11]:
# Verificando dados faltantes

print("Quantidade de dados faltantes:\n{}".format(df[df.select_dtypes(include=['float64', 'int64']).columns].isna().sum()))
print("---")
print("Percentual de dados faltantes:\n{}".format((100 * df[df.select_dtypes(include=['float64', 'int64']).columns].isna().sum()) / len(df)))

Quantidade de dados faltantes:
c_cv_01640000      0
t_cv_54790000    273
y                532
dtype: int64
---
Percentual de dados faltantes:
c_cv_01640000     0.000000
t_cv_54790000     6.796117
y                13.243714
dtype: float64


## Preenchendo com valores observados anteriores

In [12]:
# Preenchendo com dados médios dos últimos 3 anos
num_cols = df.select_dtypes(include=['float64', 'int64']).columns
df_sazonal = fill_with_previous_years_average(df, num_cols)
df_sazonal

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y
0,2013-01-01,0.6,,
1,2013-01-02,0.0,,
2,2013-01-03,13.2,,
3,2013-01-04,0.0,,
4,2013-01-05,0.0,,
...,...,...,...,...
4012,2023-12-27,0.0,0.0,211.450000
4013,2023-12-28,0.0,0.0,167.420833
4014,2023-12-29,0.0,0.6,117.235417
4015,2023-12-30,0.0,0.0,94.239583


In [13]:
# Verificando dados faltantes

print("Quantidade de dados faltantes:\n{}".format(
    df_sazonal[df_sazonal.select_dtypes(include=['float64', 'int64']).columns].isna().sum()
))
print("---")
print("Percentual de dados faltantes:\n{}".format(
    (100 * df_sazonal[df_sazonal.select_dtypes(include=['float64', 'int64']).columns].isna().sum()) / len(df_sazonal)
))

Quantidade de dados faltantes:
c_cv_01640000      0
t_cv_54790000    273
y                426
dtype: int64
---
Percentual de dados faltantes:
c_cv_01640000     0.000000
t_cv_54790000     6.796117
y                10.604929
dtype: float64


## Preenchendo com o KNNImputer

In [14]:
# Ainda precisa preencher dados faltates. Vou aplicar o KNNImputer pra finalizar isso.

imputer = KNNImputer(
    n_neighbors=7,
    weights="distance" # vizinhos mais próximos têm mais influência
)

df_knn = pd.DataFrame(
    data=imputer.fit_transform(df_sazonal.drop(columns=["ds"])),
    columns=df_sazonal.drop(columns=["ds"]).columns,
)

df_knn = pd.DataFrame(
    data=df_knn,
    columns=df_sazonal.drop(columns=["ds"]).columns
)

df_knn = pd.concat(
    [df_sazonal[["ds"]], df_knn],
    axis=1
)

df_knn

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y
0,2013-01-01,0.6,5.850000,206.873810
1,2013-01-02,0.0,0.142857,158.487202
2,2013-01-03,13.2,0.000000,0.000000
3,2013-01-04,0.0,0.142857,158.487202
4,2013-01-05,0.0,0.142857,158.487202
...,...,...,...,...
4012,2023-12-27,0.0,0.000000,211.450000
4013,2023-12-28,0.0,0.000000,167.420833
4014,2023-12-29,0.0,0.600000,117.235417
4015,2023-12-30,0.0,0.000000,94.239583


In [15]:
# Quantos zeros existem nas colunas
target = ['y']

# vazoes = [
#     't_vz_56990850',
#     't_vz_56990005',
#     'c_vz_56989400',
#     'c_vz_56989900',
#     'c_vz_56990000'
# ]

chuvas = [
    'c_cv_01640000',
    't_cv_54790000',
]

print("Target")
print(target[0], (df_knn[target[0]] == 0).sum())

# print("Vazões")
# for v in vazoes:
#     print(v, (df_knn[v] == 0).sum())
# Uma coluna de vazão tem 52 dias com 0's (zeros). O rio secou???? O.o

print("Chuvas")
for c in chuvas:
    print(c, (df_knn[c] == 0).sum())
# Tem zero pra caramba nas colunas de chuva. =/

Target
y 326
Chuvas
c_cv_01640000 3151
t_cv_54790000 2660


In [16]:
# distribuicao_dados(
#     df_original=df,
#     df_1=df_sazonal,
#     nome_1="sazonal",
#     df_2=df_knn,
#     nome_2="KNN",
#     salvar=SALVAR_PLOTS
# )

# Análise Exploratória dos Dados

In [17]:
# df_knn.columns

In [18]:
# Adicionando atributos categóricos de Data

atributos_categoricos = ['estacao', 'month', 'dayofyear', 'week', 'quarter']
df_knn = gerar_atributos_data(
    df=df_knn,
    atributos=atributos_categoricos,
    col_data="ds"
)

df_knn

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y,estacao,month,dayofyear,week,quarter
0,2013-01-01,0.6,5.850000,206.873810,1,1,1,1,1
1,2013-01-02,0.0,0.142857,158.487202,1,1,2,1,1
2,2013-01-03,13.2,0.000000,0.000000,1,1,3,1,1
3,2013-01-04,0.0,0.142857,158.487202,1,1,4,1,1
4,2013-01-05,0.0,0.142857,158.487202,1,1,5,1,1
...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,0.0,0.000000,211.450000,1,12,361,52,4
4013,2023-12-28,0.0,0.000000,167.420833,1,12,362,52,4
4014,2023-12-29,0.0,0.600000,117.235417,1,12,363,52,4
4015,2023-12-30,0.0,0.000000,94.239583,1,12,364,52,4


In [19]:
# df_X = df_knn.drop(columns=['y'], axis=1)
# df_y = df_knn[['ds', 'y', 'estacao', 'month', 'dayofyear', 'week', 'quarter']]

In [20]:
# df_X

In [21]:
# df_y

In [22]:
# Dados de chuva têm característica "cauda longa"

fig = px.histogram(
    df_knn[chuvas],
    marginal="box",
    barmode='overlay',
    opacity=0.75,
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.show()

In [23]:
# Tem uma coluna de chuva estranhamente elevada. Vou remover.
# df_knn = df_knn.drop(columns=["t_cv_56990850"]).copy()

In [24]:
# Atualiza a lista de colunas de chuva

# chuvas = [
#     'c_cv_01941010',
#     'c_cv_01941004',
#     'c_cv_01941006',
#     't_cv_56994500',
#     't_cv_56990005'
# ]

In [25]:
# fig = px.histogram(
#     df_knn[chuvas],
#     marginal="box",
#     barmode='overlay',
#     opacity=0.75,
#     color_discrete_sequence=px.colors.qualitative.T10
# )
# fig.show()

In [26]:
# Ajustando a cauda longa aplicando a Transformação Logarítmica

fig = px.histogram(
    np.log1p(df_knn[chuvas]),
    marginal="box",
    barmode='overlay',
    opacity=0.75,
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.show()

# Parece menos pior...

In [27]:
# # Fazendo a mesma avaliação para as colunas de vazão

# fig = px.histogram(
#     df_knn[vazoes],
#     marginal="box",
#     barmode='overlay',
#     opacity=0.75,
#     color_discrete_sequence=px.colors.qualitative.T10
# )
# fig.show()

In [28]:
# # Tem uma coluna de vazão com valor elevadíssimo. É da "UHE Aimores Barramento". Vou remover.
# df_knn = df_knn.drop(columns=["t_vz_56990850"]).copy()

In [29]:
# # Atualiza a lista de colunas de vazão

# vazoes = [
#     't_vz_56990005',
#     'c_vz_56989400',
#     'c_vz_56989900',
#     'c_vz_56990000'
# ]

In [30]:
# # Pelo que parece, os dados de vazão também têm cauda longa

# fig = px.histogram(
#     df_knn[vazoes],
#     marginal="box",
#     barmode='overlay',
#     opacity=0.75,
#     color_discrete_sequence=px.colors.qualitative.T10
# )
# fig.show()

In [31]:
# # Ajustando a cauda longa aplicando a Transformação Logarítmica

# fig = px.histogram(
#     np.log1p(df_knn[vazoes]),
#     marginal="box",
#     barmode='overlay',
#     opacity=0.75,
#     color_discrete_sequence=px.colors.qualitative.T10
# )
# fig.show()

# # Parece menos pior também...

In [32]:
# Fazendo a análise para a coluna "target"
fig = px.histogram(
    df_knn[target],
    marginal="box",
    barmode='overlay',
    opacity=0.75,
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.show()

In [33]:
# Também tem cauda longa.

fig = px.histogram(
    np.log1p(df_knn[target]),
    marginal="box",
    barmode='overlay',
    opacity=0.75,
    color_discrete_sequence=px.colors.qualitative.T10
)
fig.show()

# Melhorou...

Com os dados visualizados e o DataFrame contendo apenas as colunas que serão empregadas no trabalho, partimos para analisar o comportamento das séries temporais.

Sobre as transformações logarítmicas, vou usá-las quando for trabalhar os modelos. Mesmo que se aplique um scaling, a característica de cauda longa se mantém.

## Séries Temporais

In [34]:
df_knn.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4017 entries, 0 to 4016
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   ds             4017 non-null   datetime64[ns]
 1   c_cv_01640000  4017 non-null   float64       
 2   t_cv_54790000  4017 non-null   float64       
 3   y              4017 non-null   float64       
 4   estacao        4017 non-null   int64         
 5   month          4017 non-null   int32         
 6   dayofyear      4017 non-null   int32         
 7   week           4017 non-null   UInt32        
 8   quarter        4017 non-null   int32         
dtypes: UInt32(1), datetime64[ns](1), float64(3), int32(3), int64(1)
memory usage: 223.7 KB


In [35]:
for c in chuvas:
    plot_serie_temporal(
        dataset=df_knn,
        coluna=c,
        tp_coluna="chuva",
        plot_title="Série Temporal completa: {}".format(c),
        line_color="darkgreen",
        short_name=c,
        pasta_resultados=pasta_resultados+"aed/",
        salvar=SALVAR_PLOTS
    )

In [36]:
# Remover a coluna de chuva "t_cv_56994500". Tem um hiato longo de dados zerados.
# Tem uma coluna de chuva estranhamente elevada. Vou remover.
# df_knn = df_knn.drop(columns=["t_cv_56994500"]).copy()

In [37]:
# chuvas = [
#     'c_cv_01941010',
#     'c_cv_01941004',
#     'c_cv_01941006',
#     't_cv_56990005'
# ]

In [38]:
# for v in vazoes:
#     plot_serie_temporal(
#         dataset=df_knn,
#         coluna=v,
#         tp_coluna="vazao",
#         plot_title="Série Temporal completa: {}".format(v),
#         line_color="darkblue",
#         short_name=v,
#         salvar=SALVAR_PLOTS
#     )

In [39]:
for t in target:
    plot_serie_temporal(
        dataset=df_knn,
        coluna=t,
        tp_coluna="vazao",
        plot_title="Série Temporal completa: {}".format(t),
        line_color="darkred",
        pasta_resultados=pasta_resultados+"aed/",
        short_name=t,
        salvar=SALVAR_PLOTS
    )

O bom é que os valores zero ficaram no início da série temporal. Vai impactar pouco no estudo.

## Decomposição das Séries Temporais

A decomposição das séries temporais ajuda a detectar padrões (tendência, sazonalidade) e identificar outras informações que podem ajudar na interpretação do que está acontecendo.

In [40]:
decomp_series(
    df=df_knn.drop(columns=atributos_categoricos),
    tendencia=True,
    sazonalidade=False,
    residuo=False,
    salvar=SALVAR_PLOTS
)

## Estacionariedade

In [41]:
estacionariedade(
    df=df_knn.drop(columns=atributos_categoricos),
    sp=365
)

c_cv_01640000 True
c_cv_01640000 []
t_cv_54790000 True
t_cv_54790000 [365]
y True
y [365]


Excelente, todas as séries são estacionárias.

A coluna "y" e "t_cv_54790000" apresentaram sazonalidade no dia 365.

## Correlação entre as séries

In [42]:
mapa_correlacao(
    df=df_knn,
    medida="pearson",
    salvar=SALVAR_PLOTS
)

In [43]:
# Usando o sweetviz para avaliar
# import sweetviz as sv
# analyze_report = sv.analyze(df_knn)
# analyze_report.show_html('analyze.html', open_browser=True)

# Apresentando os resultados (serve apenas para usar no Google Colab)
# import IPython
# IPython.display.HTML('analyze.html')

In [44]:
# Preferi jogar os dados alterados para um novo DataFrame porque se precisar voltar no DataFrame inicial, não precisará regarregar o arquivo
df_aux = df_knn.copy()

## Análise de Autocorrelação

In [45]:
# Me interessa saber a sazonalidade da variável-alvo, a vazão
cria_plot_correlacao(
    serie=df_aux[target],
    n_lags=500,
    plot_pacf=False,
    salvar=SALVAR_PLOTS
)

# Na lag 365 o gráfico volta a descer.
# Isso nos dá uma visão da sazonalidade da série, que é de 365 dias

In [46]:
cria_plot_correlacao(
    serie=df_aux[target],
    n_lags=15,
    plot_pacf=True,
    salvar=SALVAR_PLOTS
)

## Relação entre as variáveis

In [47]:
# for v in vazoes:
#     fig = go.Figure()

#     fig.add_trace(
#         go.Scatter(
#             x=df_aux[v],
#             y=df_aux["y"],
#             mode="markers",
#             line=dict(color="blue"),
#             hovertemplate="eixo_x: %{x}<br>eixo_y: %{y}</br><extra></extra>",
#             showlegend=False,
#         )
#     )

#     fig.update_xaxes(
#         title=dict(
#             text=df_aux[v].name, 
#             font=dict(family="system-ui", size=18)
#         ),
#         zerolinecolor="black",
#         showspikes=True,
#         mirror=True,
#         ticks="outside",
#         showline=True,
#         linecolor="black",
#     )

#     fig.update_yaxes(
#         title=dict(
#             text=df_aux["y"].name,
#             font=dict(family="system-ui", size=18)
#         ),
#         zerolinecolor="black",
#         showspikes=True,
#         mirror=True,
#         ticks="outside",
#         showline=True,
#         linecolor="black",
#     )

#     fig.update_layout(
#         width=1500,
#         height=700,
#         hovermode="closest",
#         plot_bgcolor="#c8d4e3",
#         title=dict(
#             text="Relação entre as variáveis 'y' e '{v}'".format(v=v),
#             font=dict(family="system-ui", size=24),
#         ),
#     )

#     if SALVAR_PLOTS:
#         fig.write_image(
#             pasta_resultados+"aed/relacao_y_{v}.png".format(v=v)
#         )
#     else:
#         fig.show()

# ============================================================================ #

for c in chuvas:
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=df_aux[c],
            y=df_aux[target[0]],
            mode="markers",
            line=dict(color="green"),
            hovertemplate="eixo_x: %{x}<br>eixo_y: %{y}</br><extra></extra>",
            showlegend=False,
        )
    )

    fig.update_yaxes(
        title=dict(
            text=df_aux[target[0]].name,
            font=dict(family="system-ui", size=18)
        ),
        zerolinecolor="black",
        showspikes=True,
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_xaxes(
        title=dict(
            text=df_aux[c].name,
            font=dict(family="system-ui", size=18)
        ),
        zerolinecolor="black",
        showspikes=True,
        mirror=True,
        ticks="outside",
        showline=True,
        linecolor="black",
    )

    fig.update_layout(
        width=1500,
        height=700,
        hovermode="closest",
        plot_bgcolor="#c8d4e3",
        title=dict(
            text="Relação entre as variáveis {t} e '{c}'".format(t=target[0], c=c),
            font=dict(family="system-ui", size=24),
        ),
    )

    if SALVAR_PLOTS:
        fig.write_image(
            pasta_resultados+"aed/relacao_{t}_{c}.png".format(t=target[0], c=c)
        )
    else:
        fig.show()

## Análise de delay

In [48]:
# import matplotlib.pyplot as plt
# from dtaidistance import dtw
# from dtaidistance import dtw_visualisation as dtw_vis

# dtw_dist = dtw.distance_matrix_fast(df_aux.drop(columns=["ds", "unique_id"]).T.values)

# df_dtw_dist = pd.DataFrame(
#     data=dtw_dist,
#     index=df_aux.drop(columns=["ds", "unique_id"]).columns.to_list(),
#     columns=df_aux.drop(columns=["ds", "unique_id"]).columns.to_list(),
# )

# fig, axs = plt.subplots(
#     nrows=2,
#     ncols=1,
#     figsize=(2560 / 96, 1440 / 96),
# )

# path = dtw.warping_path(
#     from_s=df_aux["t_vz_56990850"].tail(60).T.values,
#     to_s=df_aux["y"].tail(60).T.values,
# )

# dtw_vis.plot_warping(
#     s1=df_aux["t_vz_56990850"].tail(60).T.values,
#     s2=df_aux["y"].tail(60).T.values,
#     path=path,
#     fig=fig,
#     axs=axs,
#     series_line_options={
#         "linewidth": 3.0,
#         "color": "blue",
#         "alpha": 0.5
#     },
#     warping_line_options={
#         "linewidth": 1.0,
#         "color": "red",
#         "alpha": 1.0
#     },
# )

# axs[1].set_xlabel("Lags")
# axs[0].set_ylabel("Vazão ($m^3$/s) - t_vz_56990850")
# axs[1].set_ylabel("Vazão ($m^3$/s) - y")
# fig.show()

## Granger-causality

In [49]:
# from statsmodels.tsa.stattools import grangercausalitytests

# # vazoes = ["t_vz_56990850", "t_vz_56990005", "c_vz_56989400", "c_vz_56989900", "c_vz_56990000"]
# # chuvas = ["c_cv_01941010", "c_cv_01941004", "c_cv_01941006", "t_cv_56990005"]

# df_granger = pd.DataFrame()
# df_granger = df_aux.drop(columns=["ds", "unique_id"]).diff(1)  # aplica essa diferenciação pra remover qq efeito de tendência
# df_granger = df_granger.dropna()

# grangercausalitytests(
#     x=df_granger[["y", "t_vz_56990850"]].tail(30),
#     maxlag=7,
#     verbose=True
# )

# Variáveis globais

In [50]:
look_back = 7 # Lags a serem utilizadas. Uma semana passada.

n_folds = 5

fh_v = [1, 3, 7, 15]  # Horizonte de Previsão (como a frequência dos dados é diária, isso significa "fch" dias)

fh_artigo = [1, 3, 7]  # Horizonte de Previsão inspirado no artigo da Alemanha

intervalos_previsao = [90]
# No gráfico será mostrado apenas os níveis que estiverem aqui.
# Deve ser posto na ordem inversa, ou seja, do maior pro menor nível.
# intervalos_previsao_plotar = ["95", "80"]
intervalos_previsao_plotar = ["90"]

colunas_chuva = chuvas
# colunas_vazao = vazoes

# Cenários de experimentação
cenario1 = "sem_chuva"
cenario2 = "com_chuva"

In [91]:
df_aux_log = df_aux.copy()
for c in chuvas:
    df_aux_log[c] = np.log1p(df_aux_log[c])

# for v in vazoes:
#     df_aux_log[v] = np.log1p(df_aux_log[v])

df_aux_log[target] = np.log1p(df_aux_log[target])

df_aux_log = df_aux_log.set_index(pd.to_datetime(df_aux_log['ds'])).drop(columns=['ds']).copy()
df_aux_log = df_aux_log.asfreq("D")
df_aux_log

Unnamed: 0_level_0,c_cv_01640000,t_cv_54790000,y,estacao,month,dayofyear,week,quarter
ds,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2013-01-01,0.470004,1.924249,5.336931,1,1,1,1,1
2013-01-02,0.000000,0.133531,5.071964,1,1,2,1,1
2013-01-03,2.653242,0.000000,0.000000,1,1,3,1,1
2013-01-04,0.000000,0.133531,5.071964,1,1,4,1,1
2013-01-05,0.000000,0.133531,5.071964,1,1,5,1,1
...,...,...,...,...,...,...,...,...
2023-12-27,0.000000,0.000000,5.358707,1,12,361,52,4
2023-12-28,0.000000,0.000000,5.126466,1,12,362,52,4
2023-12-29,0.000000,0.470004,4.772678,1,12,363,52,4
2023-12-30,0.000000,0.000000,4.556396,1,12,364,52,4


# Separação dos dados

In [51]:
# Os dados do ano de 2023 serão um DataFrame especial

df_2023 = df_aux[pd.to_datetime(df_aux['ds']).dt.year == 2023].copy()
df_2023

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y,estacao,month,dayofyear,week,quarter
3652,2023-01-01,0.0,0.4,872.347917,1,1,1,52,1
3653,2023-01-02,30.0,21.4,653.010417,1,1,2,1,1
3654,2023-01-03,0.0,0.2,505.116667,1,1,3,1,1
3655,2023-01-04,30.0,6.0,462.472917,1,1,4,1,1
3656,2023-01-05,0.0,1.8,464.568750,1,1,5,1,1
...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,0.0,0.0,211.450000,1,12,361,52,4
4013,2023-12-28,0.0,0.0,167.420833,1,12,362,52,4
4014,2023-12-29,0.0,0.6,117.235417,1,12,363,52,4
4015,2023-12-30,0.0,0.0,94.239583,1,12,364,52,4


In [52]:
# Vou deixar um DataFrame já com a Transformação Logarítmica

df_2023_log = df_2023.copy()
for c in chuvas:
    df_2023_log[c] = np.log1p(df_2023_log[c])

# for v in vazoes:
#     df_2023_log[v] = np.log1p(df_2023_log[v])

df_2023_log[target] = np.log1p(df_2023_log[target])

df_2023_log

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y,estacao,month,dayofyear,week,quarter
3652,2023-01-01,0.000000,0.336472,6.772334,1,1,1,52,1
3653,2023-01-02,3.433987,3.109061,6.483123,1,1,2,1,1
3654,2023-01-03,0.000000,0.182322,6.226767,1,1,3,1,1
3655,2023-01-04,3.433987,1.945910,6.138748,1,1,4,1,1
3656,2023-01-05,0.000000,1.029619,6.143260,1,1,5,1,1
...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,0.000000,0.000000,5.358707,1,12,361,52,4
4013,2023-12-28,0.000000,0.000000,5.126466,1,12,362,52,4
4014,2023-12-29,0.000000,0.470004,4.772678,1,12,363,52,4
4015,2023-12-30,0.000000,0.000000,4.556396,1,12,364,52,4


In [53]:
df_dados = df_aux.drop(index=df_2023.index).copy()
df_dados

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y,estacao,month,dayofyear,week,quarter
0,2013-01-01,0.6,5.850000,206.873810,1,1,1,1,1
1,2013-01-02,0.0,0.142857,158.487202,1,1,2,1,1
2,2013-01-03,13.2,0.000000,0.000000,1,1,3,1,1
3,2013-01-04,0.0,0.142857,158.487202,1,1,4,1,1
4,2013-01-05,0.0,0.142857,158.487202,1,1,5,1,1
...,...,...,...,...,...,...,...,...,...
3647,2022-12-27,0.0,0.000000,1666.347917,1,12,361,52,4
3648,2022-12-28,0.0,0.000000,1307.579167,1,12,362,52,4
3649,2022-12-29,0.0,0.000000,1098.075000,1,12,363,52,4
3650,2022-12-30,0.0,0.000000,1057.764583,1,12,364,52,4


In [54]:
# Vou deixar um DataFrame já com a Transformação Logarítmica

df_dados_log = df_dados.copy()
for c in chuvas:
    df_dados_log[c] = np.log1p(df_dados_log[c])

# for v in vazoes:
#     df_dados_log[v] = np.log1p(df_dados_log[v])

df_dados_log[target] = np.log1p(df_dados_log[target])

df_dados_log

Unnamed: 0,ds,c_cv_01640000,t_cv_54790000,y,estacao,month,dayofyear,week,quarter
0,2013-01-01,0.470004,1.924249,5.336931,1,1,1,1,1
1,2013-01-02,0.000000,0.133531,5.071964,1,1,2,1,1
2,2013-01-03,2.653242,0.000000,0.000000,1,1,3,1,1
3,2013-01-04,0.000000,0.133531,5.071964,1,1,4,1,1
4,2013-01-05,0.000000,0.133531,5.071964,1,1,5,1,1
...,...,...,...,...,...,...,...,...,...
3647,2022-12-27,0.000000,0.000000,7.418990,1,12,361,52,4
3648,2022-12-28,0.000000,0.000000,7.176697,1,12,362,52,4
3649,2022-12-29,0.000000,0.000000,7.002224,1,12,363,52,4
3650,2022-12-30,0.000000,0.000000,6.964858,1,12,364,52,4


In [55]:
# Ajustando os índices dos DataFrames pois a lib principal usada é a SKForecast
df_dados = df_dados.set_index(pd.to_datetime(df_dados['ds'])).drop(columns=['ds']).copy()
df_dados = df_dados.asfreq("D")
df_dados_log = df_dados_log.set_index(pd.to_datetime(df_dados_log['ds'])).drop(columns=['ds']).copy()
df_dados_log = df_dados_log.asfreq("D")

df_2023 = df_2023.set_index(pd.to_datetime(df_2023['ds'])).drop(columns=['ds']).copy()
df_2023 = df_2023.asfreq("D")
df_2023_log = df_2023_log.set_index(pd.to_datetime(df_2023_log['ds'])).drop(columns=['ds']).copy()
df_2023_log = df_2023_log.asfreq("D")

In [56]:
# df_train, df_test = temporal_train_test_split(
#     df_aux_crpd,
#     test_size=0.2,
#     anchor="start"
# )

In [57]:
# plot_divisao_treino_teste(
#     df_treino=df_train,
#     df_teste=df_test,
#     col_data="ds",
#     col_plot="y",
#     salvar=SALVAR_PLOTS
# )

# sem chuva, com dados categóricos

O DataFrame tem apenas as colunas "ds", "unique_id", "y". Foram removidas as colunas de vazão à montante e as colunas de chuva.

## StatsForecast

### Baseline - SeasonalNaive

In [58]:
for f in fh_v:
    # Realiza a separação dos dados neste ponto para cada horizonte de previsão desejado
    treino = df_dados.iloc[:-f, :].copy()
    teste = df_dados.iloc[-f:, :].copy()

    # Exigência da lib MLForecast
    treino = treino.reset_index()
    teste = teste.reset_index()
    treino["unique_id"] = 1
    teste["unique_id"] = 1

    modelo = SeasonalNaive(season_length=365) # Sazonalidade de 365 dias

    stfc = StatsForecast(
        df=treino.drop(columns=colunas_chuva),
        models=[modelo],
        freq="D",
        n_jobs=8
    )

    df_futr = teste.drop(columns=target+colunas_chuva).copy()
    y_pred = stfc.forecast(
        h=f,
        X_df=df_futr, # passa apenas os atributos categóricos
        level=intervalos_previsao
    )

    df_resultado = pd.merge(
        left=y_pred,
        right=teste[['ds', 'unique_id', 'y']],
        how="left",
        on=['ds', 'unique_id']
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo=modelo.alias,
        nome_curto="SN",
        fh=f,
        titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario1),
        niveis=intervalos_previsao_plotar,
        cores=["green"],
        pasta_dstn=pasta_resultados+cenario1,
        salvar=SALVAR_PLOTS
    )

## SKForecast

In [59]:
for f in fh_v:
    treino = df_dados_log.iloc[:-f, :].copy() 
    teste = df_dados_log.iloc[-f:, :].copy()

    modelos = {
        "LinearRegression"  : [LinearRegression(), "LR"],
        "CatBoostRegressor" : [CatBoostRegressor(random_seed=SEED, cat_features=atributos_categoricos, verbose=False), "CB"],
        "LGBMRegressor"     : [LGBMRegressor(random_state=SEED, objective='regression', verbose=-1), "LGBM"]
    }

    # Isso é exclusivo para o LightGBM, passado no momento do 'fit'.
    lgbm_fit_arg = {"categorical_feature": atributos_categoricos}

    for chave, item in modelos.items():
        forecaster = ForecasterAutoreg(
            regressor=item[0],
            lags=look_back,
            fit_kwargs=lgbm_fit_arg if chave == "LGBMRegressor" else None
        )

        # Backtesting quando passa "step=f" dá o mesmo que executar apenas o "predict"
        metrics_baseline, y_pred = backtesting_forecaster(
            forecaster=forecaster,
            y=df_dados_log[target].y,
            exog=df_dados_log[atributos_categoricos],
            steps=f,
            metric='mean_absolute_percentage_error',
            initial_train_size=len(treino),
            refit=True,
            n_jobs='auto',
            interval=[5, 95],
            verbose=False,
            show_progress=False
        )

        df_resultado = pd.merge(
            left=teste.y,
            right=y_pred,
            left_index=True,
            right_index=True
        )
      
        # Precisa ajustar os nomes das colunas para plotar
        df_resultado = df_resultado.rename(columns={
            "pred": "{}".format(chave),
            "lower_bound": "{c}-lo-{i}".format(c=chave, i=intervalos_previsao_plotar[0]),
            "upper_bound": "{c}-hi-{i}".format(c=chave, i=intervalos_previsao_plotar[0])
        })

        # Antes de plotar, vou retornar da transformação logarítmica
        for c in df_resultado.select_dtypes(include=['float64', 'int64']).columns.to_list():
            df_resultado[c] = np.expm1(df_resultado[c])
      
        plot_resultados(
            df_merged=df_resultado,
            modelo="{}".format(chave),
            nome_curto=item[1],
            fh=f,
            titulo="{md} (fh={fh}) ({c})".format(md=chave, fh=f, c=cenario1),
            niveis=intervalos_previsao_plotar,
            cores=["green"],
            pasta_dstn=pasta_resultados+cenario1,
            indice_ds=True,
            salvar=SALVAR_PLOTS
        )

Tirando alguns horizontes, parece que o modelo a ser superado é o LinearRegression

## MLForecast

### Baseline - Decision Tree

O emprego de Árvore de Decisão deve-se à característica do modelo em ser agnóstico à escala dos dados.

In [60]:
# for f in fh_v:
#     # Realiza a separação dos dados neste ponto para cada horizonte de previsão desejado
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()

#     dt = DecisionTreeRegressor(random_state=SEED)

#     fcst = mlf.MLForecast(
#         models=[dt],
#         freq="D",
#         lags=[i + 1 for i in range(look_back)], # lags apenas na coluna target "y"
#         num_threads=8,
#         date_features=atributos_data,
#     )

#     fcst.fit(
#         df=treino.drop(columns=colunas_chuva+colunas_vazao),
#         id_col="unique_id",
#         time_col="ds",
#         target_col="y",
#         static_features=[],
#         prediction_intervals=PredictionIntervals(h=f, n_windows=n_folds),
#     )

#     df_futr = teste.drop(columns=['y']+colunas_chuva+colunas_vazao).copy()
#     previsoes = fcst.predict(
#         h=f,
#         X_df=df_futr,
#         level=intervalos_previsao,
#     )

#     df_resultado = pd.merge(
#         left=previsoes,
#         right=teste[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left"
#     )
    
#     df_resultado.rename(
#         columns=lambda x: re.sub('DecisionTreeRegressor', 'DecisionTree', x),
#         inplace=True,
#     )

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo="DecisionTree",
#         nome_curto="DT",
#         fh=f,
#         titulo="DecisionTree (fh={fh}) ({c})".format(fh=f, c=cenario1),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario1,
#         salvar=SALVAR_PLOTS
#     )

## NeuralForecast

### Main model - LSTM

Este é o modelo que se pretende aplicar no trabalho

In [61]:
# # Usando atributos OHE e LB (Label Encoded -> "dayofyear")
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()
 
#     treino_dl = gerar_atributos_data(
#         df=treino.drop(columns=colunas_vazao+colunas_chuva),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     teste_dl = gerar_atributos_data(
#         df=teste.drop(columns=colunas_vazao+colunas_chuva),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     # print(treino_dl)
#     # print(teste.drop(columns=['y', 'unique_id', 'ds']+colunas_chuva+colunas_vazao).columns.to_list())

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "scaler_type": None,
#         "futr_exog_list": atributos_ohe + atributos_data,
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D",
#         local_scaler_type="minmax"
#     )

#     nf.fit(
#         df=treino_dl,
#         val_size=2*f,
#     )

#     df_futr = teste_dl.drop(columns=['y']).copy()
#     df_resultado = pd.merge(
#         left=nf.predict(futr_df=df_futr),
#         right=teste_dl[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     ).rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario1),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario1,
#         salvar=SALVAR_PLOTS
#     )

In [62]:
# # Sem usar atributos OHE
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()
 
#     treino_dl = gerar_atributos_data(
#         df=treino.drop(columns=colunas_vazao+colunas_chuva+atributos_ohe),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     teste_dl = gerar_atributos_data(
#         df=teste.drop(columns=colunas_vazao+colunas_chuva+atributos_ohe),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     # print(teste_dl)

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "scaler_type": None,
#         "futr_exog_list": atributos_data,
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D",
#         local_scaler_type='minmax'
#     )

#     nf.fit(
#         df=treino_dl,
#         val_size=2*f,
#     )

#     df_resultado = pd.merge(
#         left=nf.predict(futr_df=teste_dl.drop(columns=['y'])),
#         right=teste_dl[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     ).rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario1),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario1,
#         salvar=SALVAR_PLOTS
#     )

### Main model padrão com CV

In [63]:
# # Usando OHE

# for f in fh_v:
#     tscv = TimeSeriesSplit(
#         n_splits=n_folds,
#         test_size=f
#     )

#     dados_validacao = gerar_atributos_data(
#         df=df_dados.tail(f).copy(),
#         atributos=atributos_data,
#         col_data="ds"
#     )
#     dados_validacao = dados_validacao.drop(columns=colunas_chuva+colunas_vazao)#[["ds", "unique_id", "y"]]
#     # print(dados_validacao)

#     dados_cv = gerar_atributos_data(
#         df=df_dados.drop(index=dados_validacao.index).copy(),
#         atributos=atributos_data,
#         col_data="ds"
#     )
#     dados_cv = dados_cv.drop(columns=colunas_chuva+colunas_vazao)#[["ds", "unique_id", "y"]]
#     # print(dados_cv)

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "scaler_type": None,
#         "futr_exog_list": atributos_ohe + atributos_data,
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D",
#         local_scaler_type="minmax"
#     )

#     # # #
#     for fold, (treino_ind, teste_ind) in enumerate(tscv.split(dados_cv)):
#         treino_cv, teste_cv = dados_cv.iloc[treino_ind], dados_cv.iloc[teste_ind]
#         nf.fit(df=treino_cv, val_size=2*f)
#     # # #

#     nf.fit(
#         df=dados_cv,
#         val_size=2*f
#     )

#     df_futr = dados_validacao.drop(columns=['y']).copy()
#     df_resultado = pd.merge(
#         left=nf.predict(futr_df=df_futr),
#         right=dados_validacao[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     ).rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="CV: {md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario1),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario1,
#         salvar=SALVAR_PLOTS
#     )

# com chuva, com dados categóricos

Tem as features exógenas apenas de chuva, sem dados de vazão à montante.

## StatsForecast

### Baseline - SeasonalNaive

Isso não é um preditor de fato. Ao menos, não se considera assim. Serve como uma baseline a superar.

In [64]:
for f in fh_v:
    # Realiza a separação dos dados neste ponto para cada horizonte de previsão desejado
    treino = df_dados.iloc[:-f, :].copy()
    teste = df_dados.iloc[-f:, :].copy()

    # Exigência da lib MLForecast
    treino = treino.reset_index()
    teste = teste.reset_index()
    treino["unique_id"] = 1
    teste["unique_id"] = 1

    modelo = SeasonalNaive(season_length=365) # Sazonalidade de 365 dias

    stfc = StatsForecast(
        df=treino,
        models=[modelo],
        freq="D",
        n_jobs=8
    )

    df_futr = teste.drop(columns=target).copy()
    previsoes = stfc.forecast(
        h=f,
        X_df=df_futr, # passa apenas os atributos categóricos
        level=intervalos_previsao
    )

    df_resultado = pd.merge(
        left=previsoes,
        right=teste[['ds', 'unique_id', 'y']],
        how="left",
        on=['ds', 'unique_id']
    )

    plot_resultados(
        df_merged=df_resultado,
        modelo=modelo.alias,
        nome_curto="SN",
        fh=f,
        titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario2),
        niveis=intervalos_previsao_plotar,
        cores=["green"],
        pasta_dstn=pasta_resultados+cenario2,
        salvar=SALVAR_PLOTS
    )

## SKForecast

In [115]:
for f in fh_v:
    treino = df_dados_log.iloc[:-f, :].copy() 
    teste = df_dados_log.iloc[-f:, :].copy()

    modelos = {
        "LinearRegression": [LinearRegression(), "LR"],
        "CatBoostRegressor": [CatBoostRegressor(random_seed=SEED, cat_features=atributos_categoricos, verbose=False), "CB"],
        "LGBMRegressor": [LGBMRegressor(random_state=SEED, objective='regression', verbose=-1), "LGBM"]
    }

    # Isso é exclusivo para o LightGBM, passado no momento do 'fit'.
    lgbm_fit_arg = {"categorical_feature": atributos_categoricos}

    for chave, item in modelos.items():
        forecaster = ForecasterAutoreg(
            regressor=item[0],
            lags=look_back,
            fit_kwargs=lgbm_fit_arg if chave == "LGBMRegressor" else None
        )

        # Backtesting quando passa "step=f" dá o mesmo que executar apenas o "predict"
        metrics_baseline, y_pred = backtesting_forecaster(
            forecaster=forecaster,
            y=df_dados_log[target].y,
            exog=df_dados_log.drop(columns=target),
            steps=f,
            metric='mean_absolute_percentage_error',
            initial_train_size=len(treino),
            refit=True,
            n_jobs='auto',
            interval=[5, 95],
            verbose=False,
            show_progress=False
        )

        df_resultado = pd.merge(
            left=teste.y,
            right=y_pred,
            left_index=True,
            right_index=True
        )
      
        # Precisa ajustar os nomes das colunas para plotar
        df_resultado = df_resultado.rename(columns={
            "pred": "{}".format(chave),
            "lower_bound": "{c}-lo-{i}".format(c=chave, i=intervalos_previsao_plotar[0]),
            "upper_bound": "{c}-hi-{i}".format(c=chave, i=intervalos_previsao_plotar[0])
        })

        # Antes de plotar, vou retornar da transformação logarítmica
        for c in df_resultado.select_dtypes(include=['float64', 'int64']).columns.to_list():
            df_resultado[c] = np.expm1(df_resultado[c])
      
        plot_resultados(
            df_merged=df_resultado,
            modelo="{}".format(chave),
            nome_curto=item[1],
            fh=f,
            titulo="{md} (fh={fh}) ({c})".format(md=chave, fh=f, c=cenario2),
            niveis=intervalos_previsao_plotar,
            cores=["green"],
            pasta_dstn=pasta_resultados+cenario2,
            indice_ds=True,
            salvar=False #SALVAR_PLOTS
        )


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.




y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.



Os resultados com os modelos boost deram uma melhorada com a adição de variáveis de chuva.

Procedendo com otimização de hiperparâmetros.

### Otimização de hiperparâmetros

#### CatBoost

In [88]:
def cb_search_space(trial):
    search_space  = {
        'depth'             : trial.suggest_int('depth', 2, 12),
        'learning_rate'     : trial.suggest_float('learning_rate', 1e-5, 5e-1),
        'colsample_bylevel' : trial.suggest_float('colsample_bylevel', 0.05, 1.0),
        'subsample'         : trial.suggest_float('subsample', 0.05, 1.0),

        # Parâmetro para o Forecaster
        'lags'             : trial.suggest_categorical('lags', [3, 5, 7, 14, 21]),
    }
    return search_space

f = 1 # fh = fh_v[0]
n_folds = 10

mdl_cb = CatBoostRegressor( # <https://forecastegy.com/posts/catboost-hyperparameter-tuning-guide-with-optuna/#which-catboost-hyperparameters-should-i-tune>
    random_seed=SEED,
    loss_function='MAPE',
    iterations=1000,
    cat_features=atributos_categoricos,
    verbose=False
)

fc_bs = ForecasterAutoreg(
    regressor=mdl_cb,
    lags=look_back, # obrigatório estar aqui, mas é alterado pelo otimizador
  )

df_resultados_cbopt, best_trial = bayesian_search_forecaster(
    y=df_dados_log.y,
    exog=df_dados_log.drop(columns=target),
    forecaster=fc_bs,
    search_space=cb_search_space,
    steps=f,
    refit=True,
    n_trials=50,
    metric='mean_absolute_percentage_error',
    initial_train_size=len(df_dados_log.iloc[:(-n_folds*f), :]),
    fixed_train_size=False,
    return_best=True,
    n_jobs=-1,
    random_state=SEED,
    verbose=False,
    show_progress=True,
    engine='optuna',
    output_file=pasta_resultados+cenario2+"/opt/cb_opt_fh1.txt",
    kwargs_create_study={
        'study_name' : 'cb_opt_fh1',
        'direction'  : 'minimize',
    },
    kwargs_study_optimize={'catch' : (FloatingPointError, ValueError, RuntimeError)},
)

  0%|          | 0/50 [00:00<?, ?it/s]

[I 2024-08-06 11:43:55,988] Trial 0 finished with value: 0.03389651502131044 and parameters: {'depth': 2, 'learning_rate': 0.1493797966246757, 'colsample_bylevel': 0.10359337014100449, 'subsample': 0.5085525757323047, 'lags': 5}. Best is trial 0 with value: 0.03389651502131044.
[I 2024-08-06 11:44:52,990] Trial 1 finished with value: 0.04519842061813604 and parameters: {'depth': 9, 'learning_rate': 0.025876865585929585, 'colsample_bylevel': 0.520741923071318, 'subsample': 0.6793751937428989, 'lags': 3}. Best is trial 0 with value: 0.03389651502131044.
[I 2024-08-06 11:45:01,769] Trial 2 finished with value: 0.047874959879585996 and parameters: {'depth': 10, 'learning_rate': 0.11309110490253665, 'colsample_bylevel': 0.09680219868640615, 'subsample': 0.16005472532657733, 'lags': 3}. Best is trial 0 with value: 0.03389651502131044.
[I 2024-08-06 11:46:13,757] Trial 3 finished with value: 0.060096809539502936 and parameters: {'depth': 10, 'learning_rate': 0.13146456082409055, 'colsample_by


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



[I 2024-08-06 11:58:55,143] Trial 20 finished with value: 0.09342052258825864 and parameters: {'depth': 12, 'learning_rate': 0.43731954779794924, 'colsample_bylevel': 0.3363780863510946, 'subsample': 0.5941537335904823, 'lags': 14}. Best is trial 12 with value: 0.02499919541816794.
[I 2024-08-06 11:59:11,886] Trial 21 finished with value: 0.037877168903689376 and parameters: {'depth': 6, 'learning_rate': 0.3270852102310593, 'colsample_bylevel': 0.6109041956402428, 'subsample': 0.3490672975849232, 'lags': 7}. Best is trial 12 with value: 0.02499919541816794.
[I 2024-08-06 11:59:23,440] Trial 22 finished with value: 0.03270769891331744 and parameters: {'depth': 5, 'learning_rate': 0.2737261686337303, 'colsample_bylevel': 0.583006857744689, 'subsample': 0.29647699944806943, 'lags': 7}. Best is trial 12 with value: 0.02499919541816794.
[I 2024-08-06 11:59:32,053] Trial 23 finished with value: 0.0322609953778797 and parameters: {'depth': 4, 'learning_rate': 0.2025517918879732, 'colsample_by

##### Walk-Forward Validation

Uma vez com os parâmetros ótimos em mãos, executa o que é considerado o padrão ouro na avaliação de modelos de previsão: executa um fit passo-a-passo num longo período de avaliação.

No caso em questão, irei executar um WFV ao longo do ano de 2023 inteiro, prevendo um dia por vez.

In [94]:
f = 1

mdl_cb_opt = CatBoostRegressor(
    random_seed=SEED,
    loss_function='MAPE',
    iterations=1000,
    cat_features=atributos_categoricos,
    verbose=False,

    # Parametros otimos encontrados
    depth=6,
    learning_rate=0.17526798394667426,
    colsample_bylevel=0.2631311881828603,
    subsample=0.9977519286505752,
)

forecaster = ForecasterAutoreg(
    regressor=mdl_cb_opt,
    lags=5,
)

# Backtesting quando passa "step=f" dá o mesmo que executar apenas o "predict"
_, y_pred = backtesting_forecaster(
    forecaster=forecaster,
    y=df_aux_log[target].y,
    exog=df_aux_log.drop(columns=target),
    steps=f,
    metric='mean_absolute_percentage_error',
    initial_train_size=len(df_dados_log),
    refit=True,
    n_jobs=-1,
    interval=[5, 95],
    verbose=False,
    show_progress=False
)

df_resultado = pd.merge(
    left=df_2023_log.y,
    right=y_pred,
    left_index=True,
    right_index=True
)

# Precisa ajustar os nomes das colunas para plotar
df_resultado = df_resultado.rename(columns={
    "pred": "{}".format("CatBoostRegressor"),
    "lower_bound": "{c}-lo-{i}".format(c="CatBoostRegressor", i=intervalos_previsao_plotar[0]),
    "upper_bound": "{c}-hi-{i}".format(c="CatBoostRegressor", i=intervalos_previsao_plotar[0])
})

# Antes de plotar, vou retornar da transformação logarítmica
for c in df_resultado.select_dtypes(include=['float64', 'int64']).columns.to_list():
    df_resultado[c] = np.expm1(df_resultado[c])

plot_resultados(
    df_merged=df_resultado,
    modelo="{}".format("CatBoostRegressor"),
    nome_curto="CB_WFV",
    fh=f,
    titulo="{md} (fh={fh}) ({c})".format(md="CatBoostRegressor WFV", fh=f, c=cenario2),
    # niveis=intervalos_previsao_plotar,
    niveis=None,
    cores=["green"],
    pasta_dstn=pasta_resultados+cenario2,
    indice_ds=True,
    salvar=False #SALVAR_PLOTS
)


The forecaster will be fit 365 times. This can take substantial amounts of time. If not feasible, try with `refit = False`.
 


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.



#### LightGBM

In [117]:
def lgbm_search_space(trial):
    search_space  = {
        # 'max_depth'        : trial.suggest_int('max_depth', 2, 12),
        'learning_rate'    : trial.suggest_float('learning_rate', 1e-5, 5e-1),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.05, 1.0),
        'subsample'        : trial.suggest_float('subsample', 0.05, 1.0),

        # Parâmetro para o Forecaster
        'lags'             : trial.suggest_categorical('lags', [3, 5, 7, 14, 21]),
    }
    return search_space

f = 1 # fh = fh_v[0]
n_folds = 10
lgbm_fit_arg = {"categorical_feature": atributos_categoricos}

mdl_lgbm = LGBMRegressor( # ['estacao', 'month', 'dayofyear', 'week', 'quarter']
    random_seed=SEED,
    objective='regression',
    max_bin=365,
    verbose=1
)

fc_bs = ForecasterAutoreg(
    regressor=mdl_lgbm,
    lags=look_back, # obrigatório estar aqui, mas é alterado pelo otimizador
    fit_kwargs=lgbm_fit_arg
  )

df_resultados_lgbmopt, best_trial = bayesian_search_forecaster(
    y=df_dados_log.y,
    exog=df_dados_log.drop(columns=target),
    forecaster=fc_bs,
    search_space=lgbm_search_space,
    steps=f,
    refit=True,
    n_trials=50,
    metric='mean_absolute_percentage_error',
    initial_train_size=len(df_dados_log.iloc[:(-n_folds*f), :]),
    fixed_train_size=False,
    return_best=True,
    n_jobs=-1,
    random_state=SEED,
    verbose=False,
    show_progress=True,
    engine='optuna',
    output_file=pasta_resultados+cenario2+"/opt/lgbm_opt_fh1.txt",
    kwargs_create_study={
        'study_name' : 'lgbm_opt_fh1',
        'direction'  : 'minimize',
    },
    kwargs_study_optimize={'catch' : (FloatingPointError, ValueError, RuntimeError)},
)

  0%|          | 0/50 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001038 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3219
[LightGBM] [Info] Number of data points in the train set: 3635, number of used features: 14
[LightGBM] [Info] Start training from score 4.335975
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.480375 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3220
[LightGBM] [Info] Number of data points in the train set: 3637, number of used features: 14
[LightGBM] [Info] Start training from score 4.337821
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.531263 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough,

KeyboardInterrupt: 

#### LinearRegression WFV

In [95]:
f = 1

mdl_lr = LinearRegression()

forecaster = ForecasterAutoreg(
    regressor=mdl_lr,
    lags=look_back,
)

# Backtesting quando passa "step=f" dá o mesmo que executar apenas o "predict"
_, y_pred = backtesting_forecaster(
    forecaster=forecaster,
    y=df_aux_log[target].y,
    exog=df_aux_log.drop(columns=target),
    steps=f,
    metric='mean_absolute_percentage_error',
    initial_train_size=len(df_dados_log),
    refit=True,
    n_jobs=-1,
    interval=[5, 95],
    verbose=False,
    show_progress=False
)

df_resultado = pd.merge(
    left=df_2023_log.y,
    right=y_pred,
    left_index=True,
    right_index=True
)

# Precisa ajustar os nomes das colunas para plotar
df_resultado = df_resultado.rename(columns={
    "pred": "{}".format("LinearRegression"),
    "lower_bound": "{c}-lo-{i}".format(c="LinearRegression", i=intervalos_previsao_plotar[0]),
    "upper_bound": "{c}-hi-{i}".format(c="LinearRegression", i=intervalos_previsao_plotar[0])
})

# Antes de plotar, vou retornar da transformação logarítmica
for c in df_resultado.select_dtypes(include=['float64', 'int64']).columns.to_list():
    df_resultado[c] = np.expm1(df_resultado[c])

plot_resultados(
    df_merged=df_resultado,
    modelo="{}".format("LinearRegression"),
    nome_curto="LR_WFV",
    fh=f,
    titulo="{md} (fh={fh}) ({c})".format(md="LinearRegression WFV", fh=f, c=cenario2),
    # niveis=intervalos_previsao_plotar,
    niveis=None,
    cores=["green"],
    pasta_dstn=pasta_resultados+cenario2,
    indice_ds=True,
    salvar=False #SALVAR_PLOTS
)


The forecaster will be fit 365 times. This can take substantial amounts of time. If not feasible, try with `refit = False`.
 


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.


y_pred and y_true do not have the same column index. This may indicate incorrect objects passed to the metric. Indices of y_true will be used for y_pred.



## MLForecast

### Baseline - Decision Tree

O emprego de Árvore de Decisão deve-se à característica do modelo em ser agnóstico à escala dos dados.

In [None]:
# # Teste preliminar

# f = 7 # 7 dias
# teste = df_dados.tail(f).copy()
# treino = df_dados.drop(index=teste.index).copy()

# teste

# # dt = DecisionTreeRegressor(random_state=SEED)

# fcst = mlf.MLForecast(
#     models=[dt],
#     freq="D",
#     lags=[i + 1 for i in range(look_back)], # lags apenas na coluna target "y"
#     num_threads=8,
#     date_features=atributos_data,
# )

# # fcst.fit(
# #     df=treino.drop(columns=colunas_vazao),
# #     id_col="unique_id",
# #     time_col="ds",
# #     target_col="y",
# #     static_features=[],
# #     prediction_intervals=PredictionIntervals(h=f),
# # )

# # df_futr = teste.drop(columns=['y'] + colunas_vazao).copy()

# # previsoes = fcst.predict(
# #     h=f,
# #     X_df=df_futr,
# #     level=intervalos_previsao,
# # )

# # df_resultado = pd.merge(
# #     left=previsoes,
# #     right=teste[["ds", "unique_id", "y"]],
# #     on=["ds", "unique_id"],
# #     how="left"
# # )

# # df_resultado.rename(
# #     columns=lambda x: re.sub('DecisionTreeRegressor', 'DecisionTree', x),
# #     inplace=True,
# # )

# # plot_resultados(
# #     df_merged=df_resultado,
# #     modelo="DecisionTree",
# #     nome_curto="DT",
# #     fh=f,
# #     titulo="DecisionTree (fh={fh}) ({c})".format(fh=f, c=cenario2),
# #     niveis=intervalos_previsao_plotar,
# #     cores=["green", "blue"],
# #     pasta_dstn=pasta_resultados+cenario2,
# #     salvar=SALVAR_PLOTS
# # )

In [None]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()

#     dt = DecisionTreeRegressor(random_state=SEED)

#     fcst = mlf.MLForecast(
#         models=[dt],
#         freq="D",
#         lags=[i + 1 for i in range(look_back)], # lags apenas na coluna target "y"
#         num_threads=8,
#         date_features=atributos_data,
#     )

#     fcst.fit(
#         df=treino.drop(columns=colunas_vazao),
#         id_col="unique_id",
#         time_col="ds",
#         target_col="y",
#         static_features=[],
#         prediction_intervals=PredictionIntervals(h=f),
#     )

#     df_futr = teste.drop(columns=['y'] + colunas_vazao).copy()

#     previsoes = fcst.predict(
#         h=f,
#         X_df=df_futr,
#         level=intervalos_previsao,
#     )

#     df_resultado = pd.merge(
#         left=previsoes,
#         right=teste[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left"
#     )

#     df_resultado.rename(
#         columns=lambda x: re.sub('DecisionTreeRegressor', 'DecisionTree', x),
#         inplace=True,
#     )

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo="DecisionTree",
#         nome_curto="DT",
#         fh=f,
#         titulo="DecisionTree (fh={fh}) ({c})".format(fh=f, c=cenario2),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario2,
#         salvar=SALVAR_PLOTS
#     )

### LightGBM

In [None]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()

#     lgbm = LGBMRegressor(random_state=SEED)

#     fcst = mlf.MLForecast(
#         models=[lgbm],
#         freq="D",
#         lags=[i + 1 for i in range(look_back)], # lags apenas na coluna target "y"
#         num_threads=8,
#         date_features=atributos_data,
#     )

#     fcst.fit(
#         df=treino.drop(columns=colunas_vazao),
#         id_col="unique_id",
#         time_col="ds",
#         target_col="y",
#         static_features=[],
#         prediction_intervals=PredictionIntervals(h=f, n_windows=n_folds),
#     )

#     df_futr = teste.drop(columns=['y'] + colunas_vazao).copy()
#     previsoes = fcst.predict(
#         h=f,
#         X_df=df_futr,
#         level=intervalos_previsao,
#     )

#     df_resultado = pd.merge(
#         left=previsoes,
#         right=teste[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left"
#     )

#     # df_resultado.rename(
#     #     columns=lambda x: re.sub('LGBMRegressor', 'LightGBM', x),
#     #     inplace=True,
#     # )

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo="LGBMRegressor",
#         nome_curto="LGBM",
#         fh=f,
#         titulo="LGBMRegressor (fh={fh}) ({c})".format(fh=f, c=cenario2),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario2,
#         salvar=SALVAR_PLOTS
#     )

## NeuralForecast

### Main model - LSTM

Este é o modelo que se pretende aplicar no trabalho

In [None]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()
    
#     treino_dl = gerar_atributos_data(
#         df=treino.drop(columns=colunas_vazao),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     teste_dl = gerar_atributos_data(
#         df=teste.drop(columns=colunas_vazao),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "scaler_type": "minmax",
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D"
#     )

#     nf.fit(
#         df=treino_dl,
#         val_size=2*f,
#     )

#     df_resultado = pd.merge(
#         left=nf.predict(futr_df=teste_dl.drop(columns=['y'])),
#         right=teste_dl[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     ).rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario2),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario2,
#         salvar=SALVAR_PLOTS
#     )

### Main model padrão com CV

In [None]:
# for f in fh_v:
#     tscv = TimeSeriesSplit(
#         n_splits=n_folds,
#         test_size=f
#     )

#     #   Este DataFrame servirá apenas para verificar se usando CV o modelo ajustará os pesos na moral
#     # e como se comportará com dados novos nunca vistos antes.
#     dados_validacao = gerar_atributos_data(
#         df=df_dados.tail(f).copy(),
#         atributos=atributos_data,
#         col_data="ds"
#     ).drop(columns=colunas_vazao)

#     # Com estes dados restantes eu realizo a validação cruzada.
#     dados_cv = gerar_atributos_data(
#         df=df_dados.drop(index=dados_validacao.index).copy(),
#         atributos=atributos_data,
#         col_data="ds"
#     ).drop(columns=colunas_vazao)

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "scaler_type": "minmax",
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D"
#     )

#     # # #
#     for fold, (treino_ind, teste_ind) in enumerate(tscv.split(dados_cv)):
#         treino_cv, teste_cv = dados_cv.iloc[treino_ind], dados_cv.iloc[teste_ind]
#         nf.fit(df=treino_cv, val_size=2*f)
#     # # #
    
#     nf.fit(
#         df=dados_cv,
#         val_size=2*f,
#     )

#     df_resultado = pd.merge(
#         left=nf.predict(futr_df=dados_validacao.drop(columns=['y'])),
#         right=dados_validacao[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     ).rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="CV: {md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario2),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario2,
#         salvar=SALVAR_PLOTS
#     )

# Séries com Chuva, com Vazão

## StatsForecast

### Baseline - SeasonalNaive

Isso não é um preditor de fato. Ao menos, não se considera assim. Serve como uma baseline a superar.

In [None]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()

#     modelo = SeasonalNaive(season_length=365)

#     stfc = StatsForecast(
#         df=treino,
#         models=[modelo],
#         freq="D",
#         n_jobs=8
#     )

#     df_futr = cria_dataframe_futuro(
#         df_futr=fcst.make_future_dataframe(h=f),
#         df_train=treino,
#         df_test=teste,
#         tp_valor='ml',
#         n_lags=look_back,
#         date_features=atributos_data,
#         cols=colunas_vazao
#     )
    
#     previsoes = stfc.forecast(
#         h=f,
#         X_df=df_futr,
#         level=intervalos_previsao
#     )

#     df_resultado = pd.merge(
#         left=previsoes,
#         right=teste[['ds', 'unique_id', 'y']],
#         how="left",
#         on=['ds', 'unique_id']
#     )

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="SN",
#         fh=f,
#         titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario3),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario3,
#         salvar=False, #SALVAR_PLOTS
#     )

## MLForecast

### Baseline - Decision Tree

O emprego de Árvore de Decisão deve-se à característica do modelo em ser agnóstico à escala dos dados.

In [None]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()

#     dt = DecisionTreeRegressor(random_state=SEED)

#     fcst = mlf.MLForecast(
#         models=[dt],
#         freq="D",
#         lags=[i + 1 for i in range(look_back)], # lags apenas na coluna target "y"
#         num_threads=8,
#         date_features=atributos_data,
#     )

#     fcst.fit(
#         df=treino,
#         id_col="unique_id",
#         time_col="ds",
#         target_col="y",
#         static_features=[],
#         prediction_intervals=PredictionIntervals(h=f),
#     )

#     df_futr = cria_dataframe_futuro(
#         df_futr=fcst.make_future_dataframe(h=f),
#         df_train=treino,
#         df_test=teste,
#         tp_valor='ml',
#         n_lags=look_back,
#         date_features=atributos_data,
#         cols=colunas_vazao
#     )

#     previsoes = fcst.predict(
#         h=f,
#         X_df=df_futr,
#         level=intervalos_previsao,
#     )

#     df_resultado = pd.merge(
#         left=previsoes,
#         right=teste[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left"
#     )

#     df_resultado.rename(
#         columns=lambda x: re.sub('DecisionTreeRegressor', 'DecisionTree', x),
#         inplace=True,
#     )

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo="DecisionTree",
#         nome_curto="DT",
#         fh=f,
#         titulo="DecisionTree (fh={fh}) ({c})".format(fh=f, c=cenario3),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario3,
#         salvar=SALVAR_PLOTS
#     )

## NeuralForecast

### Main model - LSTM

Este é o modelo que se pretende aplicar no trabalho

In [None]:
# for f in fh_v:
#     teste = df_dados.tail(f).copy()
#     treino = df_dados.drop(index=teste.index).copy()
    
#     treino_dl = gerar_atributos_data(
#         df=treino,
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     teste_dl = gerar_atributos_data(
#         df=teste,
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "hist_exog_list": colunas_vazao,
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "scaler_type": "minmax",
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D"
#     )

#     nf.fit(
#         df=treino_dl,
#         val_size=2*f,
#     )

#     # df_futr_dl = cria_dataframe_futuro(
#     #     df_futr=nf.make_future_dataframe(),
#     #     df_train=treino_dl,
#     #     df_test=teste_dl,
#     #     tp_valor='ml',
#     #     n_lags=look_back,
#     #     date_features=atributos_data,
#     #     cols=colunas_vazao
#     # )

#     df_resultado = pd.merge(
#         left=nf.predict(futr_df=teste_dl.drop(columns=['y']+colunas_vazao)),
#         # left=nf.predict(futr_df=df_futr_dl),
#         right=teste_dl[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     ).rename(columns={modelo.alias+"-median" : "LSTM"})

#     # df_resultado = df_resultado.rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="{md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario3),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario3,
#         salvar=SALVAR_PLOTS
#     )

### Main model padrão com CV

In [None]:
# for f in fh_v:
#     tscv = TimeSeriesSplit(
#         n_splits=n_folds,
#         test_size=f
#     )

#     #   Este DataFrame servirá apenas para verificar se usando CV o modelo ajustará os pesos na moral
#     # e como se comportará com dados novos nunca vistos antes.
#     dados_validacao = gerar_atributos_data(
#         df=df_dados.tail(f).copy(),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     # Com estes dados restantes eu realizo a validação cruzada.
#     dados_cv = gerar_atributos_data(
#         df=df_dados.drop(index=dados_validacao.index).copy(),
#         atributos=atributos_data,
#         col_data="ds"
#     )

#     prmtrs_rede = {
#         "h": f,
#         "random_seed": SEED,
#         "context_size": look_back,
#         "loss": HuberMQLoss(level=intervalos_previsao),
#         "hist_exog_list": colunas_vazao,
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "scaler_type": "minmax",
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 1000,
#         "early_stop_patience_steps": 5,
#         "val_check_steps": 25,
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     modelo = LSTM(**prmtrs_rede)

#     nf = NeuralForecast(
#         models=[modelo],
#         freq="D"
#     )

#     # # #
#     for fold, (treino_ind, teste_ind) in enumerate(tscv.split(dados_cv)):
#         treino_cv, teste_cv = dados_cv.iloc[treino_ind], dados_cv.iloc[teste_ind]
#         nf.fit(df=treino_cv, val_size=2*f)
#     # # #

#     # Com o modelo "fittado", faço a previsão em cima do conjunto de validação
#     nf.fit(
#         df=dados_cv,
#         val_size=2*f,
#     )

#     # futr_dl = cria_dataframe_futuro(
#     #     df_futr=nf.make_future_dataframe(),
#     #     df_train=dados_cv,
#     #     df_test=dados_validacao,
#     #     tp_valor='ml',
#     #     n_lags=look_back,
#     #     date_features=atributos_data,
#     #     cols=colunas_vazao
#     # )

#     df_resultado = pd.merge(
#         # left=nf.predict(futr_df=futr_dl),
#         left=nf.predict(futr_df=dados_validacao.drop(columns=['y'] + colunas_vazao)),
#         right=dados_validacao[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     ).rename(columns={modelo.alias+"-median" : "LSTM"})
    
#     # df_resultado = df_resultado.rename(columns={modelo.alias+"-median" : "LSTM"})

#     plot_resultados(
#         df_merged=df_resultado,
#         modelo=modelo.alias,
#         nome_curto="LSTM",
#         fh=f,
#         titulo="CV: {md} (fh={fh}) ({c})".format(md=modelo.alias, fh=f, c=cenario3),
#         niveis=intervalos_previsao_plotar,
#         cores=["green", "blue"],
#         pasta_dstn=pasta_resultados+cenario3,
#         salvar=SALVAR_PLOTS
#     )

# Simulando o uso

Neste ponto, o que se busca é reproduzir o uso diário que o trabalho (provavelmente) teria no IGAM.

Ao longo do ano de 2023 inteiro o modelo, otimizado, será executado. Será um "fit/predict" diário, dentro de um loop, para ver o comportamento do modelo ao longo do tempo. Se degenerará, em quanto tempo isso ocorrerá e outras análises também que se fizerem pertinentes.

Pois bem. O procedimento será:

- Otimizar a escolha de hiperparâmetros para a rede.
- Usando validação cruzada combinada com Optuna na massa de dados de 2013 a 2022, perfazendo 10 anos. O horizonte de previsão será sempre de 1 dia. O número de 'folds' será deliberado em 5 folds.

## Walk-forward Validation com janela expandida

In [None]:
# def opt_lstm_cv(trial, fh, n_folds, dataset):
#     tscv = TimeSeriesSplit(
#         n_splits=n_folds,
#         test_size=fh
#     )

#     fixo_prmtrs = {
#         "h": fh,
#         # "loss": HuberMQLoss(level=intervalos_previsao),
#         "random_seed": SEED,
#         "hist_exog_list": colunas_vazao,
#         "futr_exog_list": colunas_chuva + atributos_data,
#         "logger": False,
#         "alias": "LSTM",
#         "max_steps": 200,
#         "context_size": look_back,
#         "scaler_type": "minmax",
#         "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#         "enable_progress_bar": False,
#     }

#     opt_prmtrs = {
#         "encoder_n_layers": trial.suggest_int("encoder_n_layers", 1, 6),
#         "encoder_hidden_size": trial.suggest_int("encoder_hidden_size", 8, 256, step=4),
#         "encoder_dropout" : trial.suggest_float("encoder_dropout", 0.05, 0.25),
#         "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-1),
#     }

#     erros = []
#     for _, (treino_ind, teste_ind) in enumerate(tscv.split(dataset)):
#         treino, teste = dataset.iloc[treino_ind], dataset.iloc[teste_ind]

#         modelo = LSTM(
#             **fixo_prmtrs,
#             **opt_prmtrs
#         )

#         nf = NeuralForecast(
#             models=[modelo],
#             freq="D"
#         )

#         nf.fit(df=treino)

#         # df_futr_dl = cria_dataframe_futuro(
#         #     df_futr=nf.make_future_dataframe(),
#         #     df_train=treino,
#         #     df_test=teste,
#         #     tp_valor='ml',
#         #     n_lags=look_back,
#         #     date_features=atributos_data,
#         #     cols=colunas_vazao
#         # )

#         df_resultado = pd.merge(
#             # left=nf.predict(futr_df=df_futr_dl),
#             left=nf.predict(futr_df=teste.drop(columns=['y'] + colunas_vazao)),
#             right=teste[["ds", "unique_id", "y"]],
#             on=["ds", "unique_id"],
#             how="left",
#         )
        
#         try:
#             # erros.append(smape(df_resultado["y"], df_resultado[modelo.alias+'-median']))
#             erros.append(smape(df_resultado["y"], df_resultado[modelo.alias]))
#         except ValueError:
#             erros.append(1e+3)

#     return np.mean(erros)
# #################################################
# dados_dl = gerar_atributos_data(
#     df=df_dados,
#     atributos=atributos_data,
#     col_data="ds"
# )

# dir_final = pasta_resultados+cenario3+"/lstm_opt_cv/"
# if not os.path.exists(dir_final):
#     os.makedirs(dir_final)

# nome_estudo="rio_doce_trecho_baixo"
# local_armaz="sqlite:///{}.db".format(nome_estudo)
# lstm_best_trial = {}
# f = 1

# study_lstm_cv = opt.create_study(
#     sampler=opt.samplers.TPESampler(seed=SEED),
#     study_name=nome_estudo,
#     direction="minimize",
#     storage=local_armaz,
# )

# opt_lstm_cv = partial(
#     opt_lstm_cv,
#     fh=f,
#     n_folds=n_folds,
#     dataset=dados_dl
# )

# study_lstm_cv.optimize(
#     func=opt_lstm_cv,
#     n_trials=100,
#     catch=(FloatingPointError, ValueError, RuntimeError),
#     show_progress_bar=True,
#     n_jobs=1,
# )

# lstm_best_trial = {
#     'modelo' : 'LSTM-CV',
#     'fh': f,
#     'best_trial' : study_lstm_cv.best_trial.number,
#     'best_value' : study_lstm_cv.best_value,
#     'best_params' : study_lstm_cv.best_params.copy()
# }

# # Salvar o dicionário "lstm_best_trial" para analisar mais tarde
# exportar_dict_json(
#     v_dict=lstm_best_trial,
#     pasta=dir_final,
#     nome_arq="lstm-cv_best_trial_fh{}.json".format(f)
# )

In [None]:
# # Procedimento para carregar o objeto "study_lstm_cv" (previamente salvo já) para realizar análises

# estudo = opt.create_study(
#     study_name=nome_estudo,
#     storage=local_armaz,
#     load_if_exists=True
# )

In [None]:
# opt.visualization.plot_param_importances(estudo)

In [None]:
# estudo.best_params

In [None]:
# treino_dl = gerar_atributos_data(
#     df=df_dados,
#     atributos=atributos_data,
#     col_data="ds"
# )

# teste_2023 = gerar_atributos_data(
#     df=df_2023,
#     atributos=atributos_data,
#     col_data="ds"
# )

# parametros = {
#     "h": 1,
#     # "loss": HuberMQLoss(level=intervalos_previsao),
#     "random_seed": SEED,
#     "hist_exog_list": colunas_vazao,
#     "futr_exog_list": colunas_chuva + atributos_data,
#     "logger": False,
#     "alias": "LSTM",
#     "max_steps": 200,
#     "context_size": look_back,
#     "scaler_type": "minmax",
#     "accelerator": "gpu" if torch.cuda.is_available() else "cpu",
#     "enable_progress_bar": False,
# }

# # Parâmetros ótimos encontrados para fh=1
# melhores_parametros = estudo.best_params.copy()

# modelo = LSTM(
#     **melhores_parametros,
#     **parametros,
# )

# nf = NeuralForecast(
#     models=[modelo],
#     freq="D"
# )

# nf.fit(df=treino_dl)

# # df_futr_dl = cria_dataframe_futuro(
# #     df_futr=nf.make_future_dataframe(),
# #     df_train=treino_dl,
# #     df_test=teste_2023,
# #     tp_valor='ml',
# #     n_lags=look_back,
# #     date_features=atributos_data,
# #     cols=colunas_vazao
# # )

# df_resultado = pd.merge(
#     # left=nf.predict(futr_df=df_futr_dl),
#     left=nf.predict(futr_df=teste_2023.drop(columns=['y'] + colunas_vazao)),
#     right=teste_2023[["ds", "unique_id", "y"]],
#     on=["ds", "unique_id"],
#     how="left",
# )#.rename(columns={modelo.alias+"-median" : "LSTM"})
# # df_resultado = df_resultado.rename(columns={modelo.alias+"-median" : "LSTM"})

# novo_treino = pd.concat([pd.DataFrame(columns=treino_dl.columns), treino_dl])
# novo_teste = pd.DataFrame(columns=teste_2023.columns)
# df_final = df_resultado.copy()

# for i in range(len(teste_2023) - 1):
#     novo_treino = pd.concat([novo_treino, teste_2023.iloc[[i]]])
#     novo_teste = teste_2023.iloc[[i + 1]]

#     nf.fit(df=novo_treino)

#     # novo_df_futr = cria_dataframe_futuro(
#     #     df_futr=nf.make_future_dataframe(),
#     #     df_train=novo_treino,
#     #     df_test=novo_teste,
#     #     tp_valor='ml',
#     #     n_lags=look_back,
#     #     date_features=atributos_data,
#     #     cols=colunas_vazao
#     # )

#     novo_resultado = pd.merge(
#         # left=nf.predict(futr_df=novo_df_futr),
#         left=nf.predict(futr_df=novo_teste.drop(columns=['y'] + colunas_vazao)),
#         right=novo_teste[["ds", "unique_id", "y"]],
#         on=["ds", "unique_id"],
#         how="left",
#     )#.rename(columns={modelo.alias+"-median" : "LSTM"})
#     # novo_resultado = novo_resultado.rename(columns={modelo.alias+"-median" : "LSTM"})

#     df_final = pd.concat([df_final, novo_resultado])

In [None]:
# df_final

In [None]:
# from tsfresh.feature_extraction.feature_calculators import number_peaks

# number_peaks(
#     x=df_final.LSTM,
#     n=3
# )

In [None]:
# # Ideia retirada de: <https://plotly.com/python/ml-regression/#simple-actual-vs-predicted-plot>

# salvar = SALVAR_PLOTS

# fig = go.Figure()

# fig.add_trace(
#     go.Scatter(
#         x=df_final["LSTM"],
#         y=df_final["y"],
#         mode="markers",
#         line=dict(color="blue"),
#         hovertemplate="previsão: %{x}<br>observado: %{y}</br><extra></extra>",
#         showlegend=False,
#     )
# )

# fig.add_shape(
#     type="line",
#     line=dict(
#         color='red',
#         dash='dash'
#     ),
#     x0=df_final["y"].min(), y0=df_final["y"].min(),
#     x1=df_final["y"].max(), y1=df_final["y"].max()
# )

# fig.update_xaxes(
#     title=dict(
#         text="Previsão (m³/s)",
#         font=dict(
#             family="system-ui",
#             size=18
#         )
#     ),
#     zerolinecolor="black",
#     showspikes=True,
#     mirror=True,
#     ticks="outside",
#     showline=True,
#     linecolor="black",
# )

# fig.update_yaxes(
#     title=dict(
#         text="Observado (m³/s)",
#         font=dict(
#             family="system-ui",
#             size=18
#         )
#     ),
#     zerolinecolor="black",
#     showspikes=True,
#     mirror=True,
#     ticks="outside",
#     showline=True,
#     linecolor="black",
# )

# fig.update_layout(
#     width=1500,
#     height=700,
#     hovermode="closest",
#     plot_bgcolor="#c8d4e3",
#     title=dict(
#         text="Erro de previsão",
#         font=dict(
#             family="system-ui",
#             size=24
#         ),
#     ),
# )

# if salvar:
#     fig.write_image(
#         pasta_resultados+cenario3+"relacao_observado_previsao.png"
#     )
# else:
#     fig.show()

In [None]:
# plot_resultados(
#     df_merged=df_final.dropna(),
#     modelo="LSTM",
#     nome_curto="LSTM",
#     fh=1,
#     titulo="Resultado final do experimento de caso de uso",
#     pasta_dstn=pasta_resultados,
#     niveis=None,
#     cores=None,
#     salvar=SALVAR_PLOTS,
#     n_decimal=5,
#     metricas="hidrologia",
#     marcadores=False
# )

# FIM