# Imports básicos para todas as análises

In [1]:
import  warnings,                   \
        calendar,                   \
        pandas as pd,               \
        numpy as np,                \
        plotly.graph_objects as go, \
        requests as rt,             \
        mlforecast as mlf,          \
        optuna as opt,              \
        hydrobr as hbr,             \
        xml.etree.ElementTree as ET,\
        utilsforecast.processing as ufp

from typing import List

from datetime import datetime, timedelta

from io import BytesIO

from functools import partial

from plotly.subplots import make_subplots

from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import LinearSVR

# A ser usado apenas para a análise de imputação de dados (ao invés de sempre aplicar o valor médio)
from sklearn.impute import KNNImputer

from neuralforecast import NeuralForecast
from neuralforecast.models import LSTM, NBEATSx

from statsmodels.tsa.stattools import acf, pacf
from statsmodels.tsa.seasonal import seasonal_decompose

from sktime.split import temporal_train_test_split
from sktime.param_est.seasonality import SeasonalityACF
from sktime.param_est.stationarity import StationarityADF
from sktime.performance_metrics.forecasting import MeanSquaredError, MeanAbsolutePercentageError, MeanAbsoluteError

# Desativar as mensagens de 'warning' que ficam poluindo o output de alguns trechos de código.
warnings.filterwarnings("ignore")

# Para com a verborragia do log do Optuna
opt.logging.set_verbosity(opt.logging.WARNING)

# Wraper pra usar a engine do Plotly ao invocar a função "[DataFrame|Series].plot" do Pandas
pd.options.plotting.backend = "plotly"

# Métricas utilizadas
smape = MeanAbsolutePercentageError(symmetric=True) # Melhor valor possível é 0.0 (SYMMETRIC Mean Absolute Percentage Error)
rmse = MeanSquaredError(square_root=True) # Quanto menor, melhor
mae = MeanAbsoluteError() # Quanto menor, melhor



# Utilidades

In [2]:
def carregar_dados(file_name : str,
                   separator : str = "\t",
                   adjust : bool = True,
                   date_column : str = "ds"
                   ) -> pd.DataFrame:
    """Load a delimited text file into the long format used by the Nixtla libraries.

    Parameters:
        file_name : path of the file to read (first row is the header).
        separator : field delimiter, tab by default.
        adjust : when True, resample to a daily frequency keeping the first value
            of each day, so that the series becomes continuous (missing days are
            filled with NaN rows).
        date_column : name of the column holding the timestamps.

    Returns:
        A DataFrame with the timestamps back as a regular column (named after
        ``date_column``), the data columns, and a constant ``unique_id`` column
        equal to 1.
    """
    frame = pd.read_csv(
        file_name,
        sep=separator,
        header=0,
        index_col=date_column,
        parse_dates=[date_column],
    )

    if adjust:
        # Daily grid: one row per calendar day between the first and last timestamps.
        frame = frame.resample('D').first()

    # Nixtla expects a series identifier column and the timestamps as a plain column.
    return frame.assign(unique_id=1).reset_index()
# ============================================================================================ #
def decomp_series(df) -> None:
    """Save one seasonal-decomposition chart (PNG) per data column of ``df``.

    Decomposing a time series helps to spot patterns (trend, seasonality) and
    other information useful for interpreting what is going on.

    Every column except 'ds' and 'unique_id' is decomposed and drawn on a
    two-axis figure: observed/residual on the primary y axis and
    trend/seasonality on the secondary one. Each figure is written to
    ./resultados/trecho_alto/aed/decomposicao_serie_<column>.png (hard-coded
    path; presumably the directory must already exist).
    """
    def axis_title(text):
        # Fresh dict per call so no title object is shared between axes.
        return dict(text=text, font=dict(family="system-ui", size=18))

    for col_name in df.drop(columns=['ds', 'unique_id']).columns.to_list():
        # Additive model because some series contain zeros.
        # 365-day period because the goal is to capture yearly patterns.
        result = seasonal_decompose(df[col_name], period=365, model="add")

        # (component values, legend label, drawn on the secondary y axis?)
        components = (
            (result.observed, 'observado', False),
            (result.trend, 'tendência', True),
            (result.seasonal, 'sazonalidade', True),
            (result.resid, 'resíduo', False),
        )

        figure = make_subplots(specs=[[{"secondary_y": True}]])
        for values, label, on_secondary in components:
            figure.add_trace(
                go.Scatter(x=df.ds, y=values, name=label, mode='lines', showlegend=True),
                secondary_y=on_secondary,
            )

        figure.update_yaxes(title=axis_title("observado/resíduo"), secondary_y=False)
        figure.update_yaxes(title=axis_title("tendência/sazonalidade"), secondary_y=True)
        figure.update_xaxes(title=axis_title("Período"))

        chart_title = "Decomposição da série temporal: {col}".format(col=col_name)
        figure.update_layout(
            width=1500,
            height=700,
            title=dict(text=chart_title, font=dict(family="system-ui", size=24)),
        )

        figure.write_image("./resultados/trecho_alto/aed/decomposicao_serie_{}.png".format(col_name))
# ============================================================================================ #
def estacionariedade(df, sp) -> None:
    """Print, per data column, whether the series is stationary and its significant seasonal periods.

    For every column except 'ds' and 'unique_id':
      1. fit ``StationarityADF`` and print ``<column> <stationary flag>``;
      2. only if the series is stationary, fit ``SeasonalityACF`` with the
         candidate periods ``sp`` and print ``<column> <sp_significant>``.

    The seasonality test is meant for stationary series; a non-stationary series
    has to be differenced before it can be tested, so it is skipped here.

    Parameters:
        df : DataFrame with 'ds', 'unique_id' and one column per series.
        sp : candidate seasonal period(s) forwarded to ``SeasonalityACF(candidate_sp=...)``
            (the intent is to confirm a yearly, 365-day, seasonality).
    """
    for col_name in df.drop(columns=['ds', 'unique_id']).columns.to_list():
        series = df[col_name]

        adf_test = StationarityADF()
        adf_test.fit(series)
        # Read the fitted parameters once instead of re-querying the estimator.
        is_stationary = adf_test.get_fitted_params()["stationary"]
        print(col_name, is_stationary)

        if not is_stationary:
            continue

        seasonality_test = SeasonalityACF(candidate_sp=sp, nlags=len(series))
        seasonality_test.fit(series)
        print(col_name, seasonality_test.get_fitted_params()["sp_significant"])
# ============================================================================================ #
def mapa_correlacao(df) -> None:
    """Save a heatmap (PNG) of the pairwise correlation between the data columns.

    The correlation matrix is computed with ``DataFrame.corr()`` over every
    column except 'ds' and 'unique_id'; each cell is annotated with its value
    (7 decimal places). The image is written to the hard-coded path
    ./resultados/trecho_alto/aed/mapa_correlacao.png.
    """
    corr_matrix = df.drop(columns=['ds', 'unique_id']).corr()
    labels = corr_matrix.columns

    heatmap = go.Heatmap(
        x=labels,
        y=labels,
        z=corr_matrix,
        text=corr_matrix.values,
        texttemplate="%{text:.7f}",
        textfont={"size": 14},
        colorscale="rainbow",
        hovertemplate="%{y}<br>%{x}</br><extra></extra>",
    )

    figure = go.Figure()
    figure.add_trace(heatmap)

    layout_options = dict(
        width=1500,
        height=700,
        yaxis=dict(tickfont=dict(family="system-ui", size=14)),
        xaxis=dict(tickfont=dict(family="system-ui", size=14)),
        title=dict(text="Mapa de correlação", font=dict(family="system-ui", size=24)),
    )
    figure.update_layout(**layout_options)

    figure.write_image("./resultados/trecho_alto/aed/mapa_correlacao.png")
# ============================================================================================ #
def plot_linha_tabela(df_merged,
                      regressor : str,
                      plot_title : str,
                      line_color : str,
                      short_name : str
                      ) -> None:
    """Show observed vs. predicted values as lines, with a metrics table underneath.

    Parameters:
        df_merged : DataFrame with the columns 'ds' (x axis), 'y' (observed
            values) and ``regressor`` (predicted values).
        regressor : name of the prediction column to plot and evaluate.
        plot_title : figure title.
        line_color : colour of the prediction line.
        short_name : legend label of the prediction line.

    The table reports sMAPE, RMSE and MAE computed with the module-level
    ``smape``, ``rmse`` and ``mae`` metric objects. The figure is displayed
    with ``fig.show()``; nothing is returned.
    """
    observed = df_merged.y
    predicted = df_merged[regressor]

    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, specs=[[{"type": "scatter"}], [{"type": "table"}]])

    fig.add_trace(go.Scatter(x=df_merged.ds, y=observed, mode='lines', name='observado', line=dict(color="#000000", width=4)), row=1, col=1)
    fig.add_trace(go.Scatter(x=df_merged.ds, y=predicted, mode='lines', name=short_name, line=dict(color=line_color)), row=1, col=1)

    metrics_table = go.Table(
        header=dict(values=["sMAPE", "RMSE", "MAE"], font=dict(size=14), align="left"),
        cells=dict(values=[smape(observed, predicted),
                           rmse(observed, predicted),
                           mae(observed, predicted)],
                   font=dict(size=14),
                   height=24,
                   align="left"),
    )
    # add_trace(row=, col=) replaces the deprecated Figure.append_trace.
    fig.add_trace(metrics_table, row=2, col=1)

    fig.update_yaxes(title=dict(text="Vazão (m³/s)", font=dict(family="system-ui", size=18)))
    fig.update_xaxes(title=dict(text="Período", font=dict(family="system-ui", size=18)))

    fig.update_layout(width=1500, height=1000, hovermode='x unified',
                      title=dict(text=plot_title, font=dict(family="system-ui", size=24)))

    fig.show()
# ============================================================================================ #
def cria_plot_correlacao(serie : pd.Series,
                         n_lags : int,
                         plot_pacf : bool = False
                         ) -> None:
    """Save a stem plot (PNG) of the ACF or PACF of ``serie`` with its 95% confidence band.

    Parameters:
        serie : series to analyse; NaN values are dropped before the computation.
        n_lags : number of lags passed to ``acf``/``pacf``.
        plot_pacf : when True the partial autocorrelation is plotted, otherwise
            the autocorrelation.

    The image goes to ./resultados/trecho_alto/aed/plot_pacf.png or
    ./resultados/trecho_alto/aed/plot_acf.png (hard-coded paths).
    """
    estimator = pacf if plot_pacf else acf
    result = estimator(serie.dropna(), nlags=n_lags, alpha=0.05)
    coefficients, conf_int = result[0], result[1]

    # Confidence interval re-centred around zero.
    band_low = conf_int[:, 0] - coefficients
    band_high = conf_int[:, 1] - coefficients

    fig = go.Figure()

    # Black vertical stems, one trace per lag.
    for lag in range(len(coefficients)):
        fig.add_scatter(x=(lag, lag), y=(0, coefficients[lag]), mode='lines', line_color='black',
                        hovertemplate="<extra></extra>")

    # Red markers on top of each stem.
    fig.add_scatter(x=np.arange(len(coefficients)), y=coefficients,
                    mode='markers', marker_color='red', marker_size=12,
                    hovertemplate="x = %{x}<br>y = %{y}<extra></extra>")

    # Upper edge of the light band (invisible line, serves as the fill reference).
    fig.add_scatter(x=np.arange(len(coefficients)), y=band_high,
                    mode='lines', line_color='rgba(255,255,255,0)',
                    hovertemplate="<extra></extra>")

    # Lower edge of the light band, filled up to the previous trace.
    fig.add_scatter(x=np.arange(len(coefficients)), y=band_low,
                    mode='lines', fillcolor='rgba(32, 146, 230,0.3)', fill='tonexty',
                    line_color='rgba(255,255,255,0)',
                    hovertemplate="<extra></extra>")

    fig.update_traces(showlegend=False)

    fig.update_xaxes(range=[-1, n_lags+1])
    fig.update_yaxes(zerolinecolor='black')  # the y=0 line is drawn in black

    if plot_pacf:
        title = 'Autocorrelação Parcial (PACF) para n_lags={n}'.format(n=n_lags)
        output_path = "./resultados/trecho_alto/aed/plot_pacf.png"
    else:
        title = 'Autocorrelação (ACF) para n_lags={n}'.format(n=n_lags)
        output_path = "./resultados/trecho_alto/aed/plot_acf.png"

    fig.update_layout(width=1500, height=700,
                      title=dict(text=title, font=dict(family="system-ui", size=24)))

    fig.write_image(output_path)
# ============================================================================================ #
def cria_dataframe_futuro(df_futr, df_train, df_test, tp_valor, n_lags, cols) -> pd.DataFrame:
    """Build the future-exogenous DataFrame for the forecast horizon.

    The columns listed in ``cols`` are not known in the future, so they are
    filled in according to ``tp_valor``; every other exogenous column is taken
    from ``df_test`` through a left merge on ['ds', 'unique_id'].

    Parameters:
        df_futr : DataFrame with the future 'ds' / 'unique_id' rows. It is not
            modified; a copy is filled in and returned.
        df_train : training data; source of the values used to fill ``cols``.
        df_test : test data; its columns other than ``cols`` and 'y' are merged in.
        tp_valor : fill strategy:
            'ultimo' - repeat the last known training value of each column;
            'media'  - repeat the training mean of each column;
            'ml'     - forecast each column with an XGBRegressor (MLForecast).
        n_lags : number of lags (1..n_lags) used as features in the 'ml' strategy.
        cols : names of the columns to fill (any sequence of column names).

    Returns:
        The filled and merged DataFrame.

    Raises:
        ValueError: if ``tp_valor`` is not one of the three supported options.
    """
    if tp_valor not in ('ultimo', 'media', 'ml'):
        raise ValueError("Opção inválida! (ultimo | media | ml)")

    cols = list(cols)  # accept tuples / Index objects as well as lists
    df_futr = df_futr.copy()  # never write into the caller's frame

    if tp_valor == 'ultimo':
        for c in cols:
            df_futr[c] = df_train[c].iat[-1]
    elif tp_valor == 'media':
        for c in cols:
            df_futr[c] = df_train[c].mean()
    else:  # 'ml'
        from mlforecast import MLForecast
        from xgboost import XGBRegressor

        fcst = MLForecast(
            models=XGBRegressor(random_state=5),
            freq='D',
            lags=list(range(1, n_lags + 1)),
            date_features=['year', 'month', 'quarter', 'dayofyear', 'week']
        )

        for c in cols:
            df_temp = df_train[['ds', 'unique_id', c]]
            fcst.fit(df_temp, id_col='unique_id', time_col='ds', target_col=c, static_features=[])
            df_preds = fcst.predict(h=len(df_futr))
            # Assign by position: the prediction frame has its own index, and an
            # index-aligned assignment would silently produce NaN when the
            # index of df_futr differs from it.
            df_futr[c] = df_preds['XGBRegressor'].to_numpy()

    return pd.merge(left=df_futr, right=df_test.drop(columns=cols + ['y']),
                    on=['ds', 'unique_id'], how='left')
# ============================================================================================ #
def distribuicao_dados(df_original, df_media, df_knn) -> None:
    """Save, per data column, a box plot (PNG) comparing three versions of the data.

    Parameters:
        df_original : data before imputation.
        df_media : data labelled 'média' in the chart (mean-imputed version).
        df_knn : data labelled 'kNN' in the chart (kNN-imputed version).

    The columns are taken from ``df_original`` (all but 'ds' and 'unique_id')
    and must exist in the other two frames. Each image is written to
    ./resultados/trecho_alto/aed/distribuicao_dados_<column>.png (hard-coded path).
    """
    # (data frame, legend label, marker colour) in drawing order.
    variants = (
        (df_original, 'original', 'darkblue'),
        (df_media, 'média', 'coral'),
        (df_knn, 'kNN', 'olive'),
    )

    column_names = np.asarray(df_original.drop(columns=['ds', 'unique_id']).columns)

    for col_name in column_names:
        figure = go.Figure()

        for frame, label, colour in variants:
            box = go.Box(
                y=frame[col_name].values,
                name=label,
                marker_color=colour,
                jitter=0.5,
                pointpos=-2,
                boxpoints='all',
                boxmean='sd',
            )
            figure.add_trace(box)

        chart_title = "Distribuição {c}".format(c=col_name)
        figure.update_layout(width=1500, height=1000,
                             title=dict(text=chart_title, font=dict(family="system-ui", size=24)))

        figure.write_image("./resultados/trecho_alto/aed/distribuicao_dados_{}.png".format(col_name))
# ============================================================================================ #
def get_telemetrica(codEstacao : str,
                    dataInicio : str,
                    dataFim : str,
                    save : bool = False) -> pd.DataFrame:
    """Download the telemetric readings of one station from the ANA web service.

    Parameters:
        codEstacao : station code.
        dataInicio : start date, forwarded unchanged to the service.
        dataFim : end date, forwarded unchanged to the service.
        save : when True, also write the result to '<codEstacao>_dados_tele.xlsx'.

    Returns:
        DataFrame indexed by 'Data' (the 'DataHora' text of each record, sorted)
        with the columns 'Cota' (from the 'Nivel' tag), 'Chuva' and 'Vazao'.
        Values are floats when the tag text is numeric; otherwise the raw text
        is kept (None for an empty or missing tag), so callers are expected to
        coerce the columns afterwards.

    Raises:
        requests.HTTPError: if the service answers with an HTTP error status.
    """
    def _read_value(record, tag):
        # Best-effort numeric read: missing tag -> None, non-numeric text -> raw text.
        node = record.find(tag)
        if node is None:
            return None
        try:
            return float(node.text)
        except (TypeError, ValueError):
            return node.text

    # 1. Query the server and get the root of the XML answer
    params = {'codEstacao':codEstacao, 'dataInicio':dataInicio, 'dataFim':dataFim}
    server = 'http://telemetriaws1.ana.gov.br/ServiceANA.asmx/DadosHidrometeorologicos'
    response = rt.get(server, params)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    # 2. Walk the XML records collecting the data made available for the station
    list_vazao = []
    list_data = []
    list_cota = []
    list_chuva = []

    # The tag name below is spelled exactly as the code has always requested it.
    for record in root.iter('DadosHidrometereologicos'):
        list_data.append(record.find('DataHora').text)
        list_vazao.append(_read_value(record, 'Vazao'))
        list_cota.append(_read_value(record, 'Nivel'))
        list_chuva.append(_read_value(record, 'Chuva'))

    df = pd.DataFrame([list_data, list_cota, list_chuva, list_vazao]).transpose()
    df.columns = ['Data', 'Cota', 'Chuva', 'Vazao']
    df = df.sort_values(by='Data')
    df = df.set_index('Data')

    if save:
        df.to_excel(codEstacao+'_dados_tele.xlsx')

    return df
# ============================================================================================ #
def get_convencional(codEstacao : str,
                     dataInicio : str,
                     dataFim : str,
                     tipoDados : int,
                     nivelConsistencia : int,
                     save : bool = False) -> pd.DataFrame:
    """Download the historical series of a conventional station (HIDRO) from the ANA web service.

    Parameters:
        codEstacao : rainfall or streamflow station code.
        dataInicio : start date, <YYYY-mm-dd>.
        dataFim : end date; when left empty the service returns data up to the
            most recent stored record.
        tipoDados : 1-stage (Cotas), 2-rainfall (Chuvas) or 3-flow (Vazões).
        nivelConsistencia : 1-raw (Bruto) or 2-consisted (Consistido).
        save : when True, also write the result to '<codEstacao>_dados_conv.xlsx'.

    Returns:
        DataFrame indexed by 'Data' (one row per day of every month record
        returned) with the columns 'Consistencia' and one of 'Cota', 'Chuva' or
        'Vazao'. Values are floats when the tag text is numeric; otherwise the
        raw text is kept (None for an empty or missing tag).

    Raises:
        ValueError: if ``tipoDados`` is not 1, 2 or 3.
        requests.HTTPError: if the service answers with an HTTP error status.
    """
    # Column name, which is also the prefix of the per-day XML tags (e.g. 'Vazao01').
    column_by_type = {1: 'Cota', 2: 'Chuva', 3: 'Vazao'}
    if tipoDados not in column_by_type:
        raise ValueError("tipoDados inválido! (1-Cotas | 2-Chuvas | 3-Vazões)")
    value_column = column_by_type[tipoDados]

    # 1. Query the server and get the root of the XML answer
    params = {'codEstacao':codEstacao, 'dataInicio':dataInicio, 'dataFim':dataFim,
              'tipoDados':tipoDados, 'nivelConsistencia':nivelConsistencia}

    server = 'http://telemetriaws1.ana.gov.br/ServiceANA.asmx/HidroSerieHistorica'
    response = rt.get(server, params)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    response.raise_for_status()
    root = ET.fromstring(response.content)

    # 2. Walk the XML records; each one holds a whole month, one tag per day
    list_data = []
    list_consistenciaF = []
    list_month_dates = []

    for record in root.iter('SerieHistorica'):
        consistencia = int(record.find('NivelConsistencia').text)
        month_stamp = datetime.strptime(record.find('DataHora').text, '%Y-%m-%d %H:%M:%S')
        # The per-day tags are numbered by day of month, so dates are counted
        # from day 1 even if the record's timestamp is not the first of the month.
        first_day = month_stamp.replace(day=1)
        last_day = calendar.monthrange(month_stamp.year, month_stamp.month)[1]

        for day in range(1, last_day + 1):
            node = record.find(f'{value_column}{day:02d}')
            if node is None:
                value = None
            else:
                try:
                    value = float(node.text)
                except (TypeError, ValueError):
                    value = node.text

            list_month_dates.append(first_day + timedelta(days=day - 1))
            list_consistenciaF.append(consistencia)
            list_data.append(value)

    df = pd.DataFrame([list_month_dates, list_consistenciaF, list_data]).transpose()
    df.columns = ['Data', 'Consistencia', value_column]

    df = df.sort_values(by='Data')
    df = df.set_index('Data')

    if save:
        df.to_excel(codEstacao + '_dados_conv.xlsx')

    return df
# ============================================================================================ #
def gerar_dados_tele(estacao_principal : str,
                    outras_estacoes : List[str],
                    nome_arq : str,
                    dt_inicio : str,
                    dt_fim : str,
                    salvar : bool = False) -> None:
    """Download, convert and combine the daily data of several telemetric stations.

    The data of ``estacao_principal`` (known beforehand to be a telemetric
    station) are downloaded and joined column-wise (outer join on the date)
    with the data of the other telemetric stations. The raw 'object' values are
    converted to their proper types ('datetime' index, 'float' columns) and
    aggregated to a daily basis: mean stage, total rainfall, mean flow.
    The resulting columns are named 't_ct_<station>', 't_cv_<station>' and
    't_vz_<station>'.

    Parameters:
        estacao_principal : code of the main (target) station.
        outras_estacoes : codes of the other stations.
        nome_arq : prefix of the output file name.
        dt_inicio : start date, 'YYYY-mm-dd'.
        dt_fim : end date, 'YYYY-mm-dd'.
        salvar : when True the combined data are written to
            '<nome_arq>_dados_tele.xlsx'. When False nothing is written and,
            since the function returns None, the downloaded data are discarded.
    """
    def _daily_frame(estacao):
        # Download one station and return its typed, daily, renamed frame.
        df_estacao = get_telemetrica(codEstacao=estacao, dataInicio=dt_inicio, dataFim=dt_fim)

        df_estacao.index = pd.to_datetime(df_estacao.index)
        for coluna in ('Cota', 'Chuva', 'Vazao'):
            df_estacao[coluna] = pd.to_numeric(df_estacao[coluna], errors='coerce')

        df_estacao = df_estacao.resample('D').agg({'Cota': 'mean', 'Chuva': 'sum', 'Vazao': 'mean'})

        # Tag the columns with the station the data came from.
        codigo = str(estacao)
        df_estacao.columns = ['t_ct_'+codigo, 't_cv_'+codigo, 't_vz_'+codigo]
        return df_estacao

    # Main station first, then the others, in the order given.
    frames = [_daily_frame(estacao) for estacao in [estacao_principal, *outras_estacoes]]
    df_result = pd.concat(frames, axis=1)

    if salvar:
        df_result.to_excel(nome_arq+'_dados_tele.xlsx')
# ============================================================================================ #
def gerar_dados_conv(estacao_principal : str,
                    outras_estacoes : List[str],
                    nome_arq : str,
                    dt_inicio : str,
                    dt_fim : str,
                    tp_dados : int,
                    nvl_consistencia : str,
                    drop_consistencia : bool = True, # Drop the "Consistencia" column; it is irrelevant for now.
                    salvar : bool = False) -> None:
    """Download, convert and combine the daily data of several conventional stations.

    The data of ``estacao_principal`` (known beforehand to be a conventional
    station) are downloaded and joined column-wise (outer join on the date)
    with the data of the other conventional stations. The raw 'object' values
    are converted to their proper types ('datetime' index, 'float' column) and
    aggregated to a daily basis (mean for stage and flow, sum for rainfall).
    Each station contributes one column named 'c_ct_<station>',
    'c_cv_<station>' or 'c_vz_<station>'.

    Parameters:
        estacao_principal : code of the main (target) station.
        outras_estacoes : codes of the other stations.
        nome_arq : prefix of the output file name.
        dt_inicio : start date, 'YYYY-mm-dd'.
        dt_fim : end date, 'YYYY-mm-dd'.
        tp_dados : 1-stage (cota), 2-rainfall (chuva); any other value is
            handled as 3-flow (vazao).
        nvl_consistencia : 1-raw (bruto) or 2-consisted (consistido); forwarded
            unchanged to ``get_convencional``.
        drop_consistencia : drop the 'Consistencia' column before aggregating.
            The aggregation only keeps the value column, so the output is the
            same either way.
        salvar : when True the combined data are written to
            '<nome_arq>_dados_<cota|chuva|vazao>_conv.xlsx'. When False nothing
            is written and, since the function returns None, the downloaded
            data are discarded.
    """
    # source column, daily aggregation, output column prefix, file-name tag
    if tp_dados == 1:
        coluna, agregacao, prefixo, rotulo = 'Cota', 'mean', 'c_ct_', 'cota'
    elif tp_dados == 2:
        coluna, agregacao, prefixo, rotulo = 'Chuva', 'sum', 'c_cv_', 'chuva'
    else: # Flow
        coluna, agregacao, prefixo, rotulo = 'Vazao', 'mean', 'c_vz_', 'vazao'

    def _daily_frame(estacao):
        # Download one station and return its typed, daily, renamed frame.
        df_estacao = get_convencional(codEstacao=estacao, dataInicio=dt_inicio, dataFim=dt_fim,
                                      tipoDados=tp_dados, nivelConsistencia=nvl_consistencia)

        df_estacao.index = pd.to_datetime(df_estacao.index)

        if drop_consistencia:
            df_estacao.drop(columns=['Consistencia'], inplace=True)

        df_estacao[coluna] = pd.to_numeric(df_estacao[coluna], errors='coerce')
        df_estacao = df_estacao.resample('D').agg({coluna: agregacao})
        df_estacao.columns = [prefixo + str(estacao)]
        return df_estacao

    # Main station first, then the others, in the order given.
    frames = [_daily_frame(estacao) for estacao in [estacao_principal, *outras_estacoes]]
    df_result = pd.concat(frames, axis=1)

    if salvar:
        df_result.to_excel(nome_arq + '_dados_' + rotulo + '_conv.xlsx')

# Download dos dados

In [3]:
# Baixando os dados das estações que serão utilizadas no trabalho
# As estações foram selecionadas a partir do sistema Data Rhama
# Aqui eu baixo os dados e salvo localmente
# >>>>>>>>>>>>> SÓ PRECISA FAZER ISSO UMA VEZ, POR ISSO O CÓDIGO FICA COMENTADO DEPOIS DE RODAR!!!! <<<<<<<<<<<<<

# estacao_principal = '56425000'
# outras_estacoes = ['56338500', '56338080', '56110005', '56337200', '56337500']

## Telemétricos

In [4]:
# Aplicando a lib HydroBR eu desejo saber se as estações em questão são do tipo convencional ou telemétrica
# O código não exclui o fato, eventual, de uma dada estação ser convencional E telemétrica, como é o caso aqui

# lista_estacoes = hbr.get_data.ANA.list_telemetric() # Vendo primeiro se tem telemétrica
# lista_estacoes.head()

In [5]:
# Averiguando se as estações que tenho em mãos estão presentes neste conjunto de estações telemétricas

# print("Estação {e} -> {p}".format(
#                                 e=estacao_principal,
#                                 p=(lista_estacoes['Code'] == estacao_principal).any()
#                             )
#     )

# A estação principal tem dados telemétricos

In [6]:
# Verificando as outras estações

# for e in outras_estacoes:
#     print("Estação {e} -> {p}".format(
#                                         e=e,
#                                         p=(lista_estacoes['Code'] == e).any()
#                                     )
#     )

# Estas estações também têm dados telemétricos

In [7]:
# Gerando um arquivo Excel com os dados das estações telemétricas

# gerar_dados_tele(estacao_principal=estacao_principal,
#                  outras_estacoes=outras_estacoes,
#                  nome_arq="alto_rio_doce",
#                  dt_inicio='2013-01-01',
#                  dt_fim='2023-12-31',
#                  salvar=True)

## Convencionais

### Cota/Vazão

In [8]:
# Aplicando a lib HydroBR eu desejo saber se as estações em questão são do tipo convencional ou telemétrica
# O código não exclui o fato, eventual, de uma dada estação ser convencional E telemétrica, como é o caso aqui

# lista_estacoes = hbr.get_data.ANA.list_flow(state='MINAS GERAIS', source='ANA') # Verificando se tem estações de cota/vazão primeiro
# lista_estacoes.head()

In [9]:
# Averiguando se as estações que tenho em mãos estão presentes neste conjunto de estações convencionais de cota/vazão

# print("Estação {e} -> {p}".format(
#                                 e=estacao_principal,
#                                 p=(lista_estacoes['Code'] == estacao_principal).any()
#                             )
#     )

# A estação principal tem dados convencionais de cota/vazão

In [10]:
# Verificando as outras estações

# for e in outras_estacoes:
#     print("Estação {e} -> {p}".format(
#                                         e=e,
#                                         p=(lista_estacoes['Code'] == e).any()
#                                     )
#     )

# Estas estações também têm dados convencionais de cota/vazão

In [11]:
# Gerando um arquivo Excel com os dados das estações convencionais

# gerar_dados_conv(estacao_principal=estacao_principal,
#                 outras_estacoes=outras_estacoes,
#                 nome_arq="alto_rio_doce",
#                 dt_inicio='2013-01-01',
#                 dt_fim='2023-12-31',
#                 tp_dados=1, # Cota
#                 nvl_consistencia='2', # dados consistidos
#                 salvar=True)

In [12]:
# Gerando um arquivo Excel com os dados das estações convencionais

# gerar_dados_conv(estacao_principal=estacao_principal,
#                 outras_estacoes=outras_estacoes,
#                 nome_arq="alto_rio_doce",
#                 dt_inicio='2013-01-01',
#                 dt_fim='2023-12-31',
#                 tp_dados=3, # Vazão
#                 nvl_consistencia='2', # dados consistidos
#                 salvar=True)

### Chuva

In [13]:
# lista_estacoes = hbr.get_data.ANA.list_prec(state='MINAS GERAIS', source='ANA')
# lista_estacoes.head()

In [14]:
# print("Estação {e} -> {p}".format(
#                                 e=estacao_principal,
#                                 p=(lista_estacoes['Code'] == estacao_principal).any()
#                             )
#     )

# A estação principal NÃO tem dados convencionais de chuva

In [15]:
# Verificando as outras estações

# for e in outras_estacoes:
#     print("Estação {e} -> {p}".format(
#                                         e=e,
#                                         p=(lista_estacoes['Code'] == e).any()
#                                     )
#     )

# Estas estações também NÃO têm dados convencionais de chuva

Já tenho todos os dados de que preciso. Estão salvos localmente, agora é carregar em memória, juntar tudo e mandar ver nos experimentos.

## Juntando os dados

In [16]:
# Como estou em dúvida com o arquivo de cotas, deixei apenas estes dois arquivos pra trabalhar.

# arquivos = ['alto_rio_doce_dados_tele.xlsx', 'alto_rio_doce_dados_vazao_conv.xlsx']

In [17]:
# Vou fazer a carga primeiro dos dados telemétricos, porque é onde tem mais informação de uma única vez.
# Depois concateno os outros arquivos. Mas a ordem tanto faz aqui, só estipulei assim porque acho melhor

# df = pd.read_excel(arquivos[0], sheet_name=0, index_col=0, header=0, parse_dates=['Data'])

# for a in range(1, len(arquivos)):
#     df_temp = pd.read_excel(arquivos[a], sheet_name=0, index_col=0, header=0, parse_dates=['Data'])
#     df = pd.concat([df, df_temp], axis=1)

# df

In [18]:
# df.columns

In [19]:
# Vou remover as colunas das cotas

# df.drop(columns=['t_ct_56425000', 't_ct_56338500', 't_ct_56338080', 't_ct_56110005', 't_ct_56337200', 't_ct_56337500'], inplace=True)
# df

In [20]:
# df.columns

In [21]:
# Fazendo o merge das colunas de vazão que são correspondentes à mesma estação
# Acontece que existem gaps entre os dados, o que é estranho, porque a estação telemétrica tem dados que a convencional não tem
# E vice-versa. Vou contar qual coluna tem mais dados e depois executar um 'fillna'
# colunas_esquerda = ['t_vz_56425000', 't_vz_56338500', 't_vz_56338080', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500']
# colunas_direita = ['c_vz_56425000', 'c_vz_56338500', 'c_vz_56338080', 'c_vz_56110005', 'c_vz_56337200', 'c_vz_56337500']

# for i, j in zip(colunas_esquerda, colunas_direita):
#     print(i, j, df[i].isna().sum(), df[j].isna().sum())

# A coluna com menos dados faltantes será preenchida com os valores da coluna correspondente (a que tem mais dados faltantes)

In [22]:
# Preenchendo a coluna que tem menos dados faltantes com a outra correspondente
# Fazer isso para cada coluna, contudo, tem colunas que tem dados faltando demais. Neste caso, darei drop nelas inteiramente.
# df['c_vz_56425000'].fillna(df['t_vz_56425000'], inplace=True)
# df['t_vz_56110005'].fillna(df['c_vz_56110005'], inplace=True)

# df['c_vz_56425000'].isna().sum(), df['t_vz_56110005'].isna().sum()

In [23]:
# Uma vez que as colunas estejam ajustadas, eu dropo as que não vou precisar mais
# df = df.drop(columns=['t_vz_56425000', 'c_vz_56110005'])
# df

In [24]:
# As colunas que, nesta análise, resolvi remover também porque não farão diferença pro trabalho
# df = df.drop(columns=['c_vz_56338500', 't_vz_56338080', 'c_vz_56338080', 'c_vz_56337200', 'c_vz_56337500'])
# df

In [25]:
# Deixando os dados contínuos, numa base diária.
# df = df.resample('D').first()
# df

## Exportando os dados finais

In [26]:
# Neste momento, tenho o DataFrame com os dados EXATAMENTE da forma que preciso.
# Posso, inclusive, exportar isso para um arquivo de Excel
# É o que farei, pois se precisar retornar aos dados originais, será mais fácil que fazer toda engenharia até aqui

# df.to_excel('alto_rio_doce_final.xlsx')

# Carregando e imputando dados

In [27]:
# Load the first sheet of 'alto_rio_doce_final.xlsx'; column 0 becomes the index and 'Data' is parsed as dates.
df = pd.read_excel('alto_rio_doce_final.xlsx', sheet_name=0, index_col=0, header=0, parse_dates=['Data'])

In [28]:
# Fixed column order, just to make the table easier to read and understand.
ordem_colunas = [
    'c_vz_56425000',
    't_cv_56425000', 't_cv_56338500', 't_cv_56338080', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500',
    't_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500',
]

# Put the DataFrame in the layout MLForecast expects: a series id ('unique_id'),
# the timestamp as a regular 'ds' column and the target named 'y'.
df = (
    df[ordem_colunas]
    .assign(unique_id=1)
    .reset_index()
    .rename(columns={'Data': 'ds', 'c_vz_56425000': 'y'})
)

df

Unnamed: 0,ds,y,t_cv_56425000,t_cv_56338500,t_cv_56338080,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500,unique_id
0,2013-01-01,82.787100,,,,0.0,,,,60.056100,,,1
1,2013-01-02,80.489300,,,,0.0,,,,46.950000,,,1
2,2013-01-03,78.214200,,,,0.0,,,,46.703125,,,1
3,2013-01-04,79.348900,,,,0.0,,,,49.239583,,,1
4,2013-01-05,129.161000,,,,0.0,,,,49.780208,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,88.370937,8.2,0.2,0.0,0.0,2.2,8.6,99.451667,70.568804,0.610000,28.053750,1
4013,2023-12-28,87.519375,2.0,1.2,0.0,0.0,0.0,0.8,93.551667,60.014457,33.753333,21.705417,1
4014,2023-12-29,76.317813,0.0,0.0,0.0,0.0,0.0,0.0,77.745652,49.910909,61.942917,21.022917,1
4015,2023-12-30,67.118542,14.8,3.4,0.0,3.0,4.8,19.8,70.258261,47.746848,58.080000,22.252500,1


In [29]:
# Percentual de dados faltantes, por coluna

# print(100*df.drop(columns=['ds', 'unique_id']).isna().sum() / len(df))

##### Preenchendo com a média

In [30]:
# df_media = df.fillna(df.mean())
# df_media

##### Preenchendo com o KNNImputer

In [31]:
# Impute the missing values with KNNImputer.
# Scaling before KNNImputer is usually recommended, but in the author's tests it made no
# difference to the results, so it is skipped to keep the pipeline simpler.

# Identifier columns are left out of the imputation and re-attached afterwards.
colunas_id = ['ds', 'unique_id']
df_valores = df.drop(columns=colunas_id)

imputer = KNNImputer(n_neighbors=7, weights='distance')

# fit_transform returns a plain array: rebuild the frame with the original column names and
# keep the original index so the concat below aligns row-by-row explicitly.
df_knn = pd.DataFrame(imputer.fit_transform(df_valores),
                      columns=df_valores.columns,
                      index=df_valores.index)
df_knn = pd.concat([df[colunas_id], df_knn], axis=1)
df_knn

Unnamed: 0,ds,unique_id,y,t_cv_56425000,t_cv_56338500,t_cv_56338080,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
0,2013-01-01,1,82.787100,0.241148,0.202105,0.0,0.0,0.437702,0.551181,79.298366,60.056100,49.154342,29.104130
1,2013-01-02,1,80.489300,0.174561,0.024937,0.0,0.0,0.168651,0.000000,80.660440,46.950000,34.521715,41.121955
2,2013-01-03,1,78.214200,0.588994,0.848283,0.0,0.0,0.322936,0.566388,82.201442,46.703125,38.586211,51.963603
3,2013-01-04,1,79.348900,0.317681,0.425111,0.0,0.0,0.093011,0.288712,76.099995,49.239583,48.206779,17.611111
4,2013-01-05,1,129.161000,2.034343,0.869555,0.0,0.0,1.522630,1.933175,108.219680,49.780208,63.708133,22.068745
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,1,88.370937,8.200000,0.200000,0.0,0.0,2.200000,8.600000,99.451667,70.568804,0.610000,28.053750
4013,2023-12-28,1,87.519375,2.000000,1.200000,0.0,0.0,0.000000,0.800000,93.551667,60.014457,33.753333,21.705417
4014,2023-12-29,1,76.317813,0.000000,0.000000,0.0,0.0,0.000000,0.000000,77.745652,49.910909,61.942917,21.022917
4015,2023-12-30,1,67.118542,14.800000,3.400000,0.0,3.0,4.800000,19.800000,70.258261,47.746848,58.080000,22.252500


##### Distribuição comparada

In [32]:
# distribuicao_dados(df_original=df, df_media=df_media, df_knn=df_knn)

Vou utilizar os dados advindos do KNNImputer. Os dados ficaram melhor distribuídos utilizando essa técnica.

Aproveito para remover também a coluna 't_cv_56338080'. A distribuição dos dados nesta coluna continua muito ruim.

In [33]:
# Remove the 't_cv_56338080' column from the imputed data.
df_knn = df_knn.drop(columns=['t_cv_56338080'])

##### Separando dados para 'X' e 'y'

Não sei se vai ser necessário usá-los, mas já deixo aqui pra caso precise

In [34]:
df_knn.columns

Index(['ds', 'unique_id', 'y', 't_cv_56425000', 't_cv_56338500',
       't_cv_56110005', 't_cv_56337200', 't_cv_56337500', 't_vz_56338500',
       't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
      dtype='object')

In [35]:
# Feature frame: every column except the target 'y' ('ds' and 'unique_id' are kept).
df_X = df_knn.drop(columns=['y'])
# Target frame: timestamp, target and series id only.
df_y = df_knn[['ds', 'y', 'unique_id']]

In [36]:
df_X

Unnamed: 0,ds,unique_id,t_cv_56425000,t_cv_56338500,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
0,2013-01-01,1,0.241148,0.202105,0.0,0.437702,0.551181,79.298366,60.056100,49.154342,29.104130
1,2013-01-02,1,0.174561,0.024937,0.0,0.168651,0.000000,80.660440,46.950000,34.521715,41.121955
2,2013-01-03,1,0.588994,0.848283,0.0,0.322936,0.566388,82.201442,46.703125,38.586211,51.963603
3,2013-01-04,1,0.317681,0.425111,0.0,0.093011,0.288712,76.099995,49.239583,48.206779,17.611111
4,2013-01-05,1,2.034343,0.869555,0.0,1.522630,1.933175,108.219680,49.780208,63.708133,22.068745
...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,1,8.200000,0.200000,0.0,2.200000,8.600000,99.451667,70.568804,0.610000,28.053750
4013,2023-12-28,1,2.000000,1.200000,0.0,0.000000,0.800000,93.551667,60.014457,33.753333,21.705417
4014,2023-12-29,1,0.000000,0.000000,0.0,0.000000,0.000000,77.745652,49.910909,61.942917,21.022917
4015,2023-12-30,1,14.800000,3.400000,3.0,4.800000,19.800000,70.258261,47.746848,58.080000,22.252500


In [37]:
df_y

Unnamed: 0,ds,y,unique_id
0,2013-01-01,82.787100,1
1,2013-01-02,80.489300,1
2,2013-01-03,78.214200,1
3,2013-01-04,79.348900,1
4,2013-01-05,129.161000,1
...,...,...,...
4012,2023-12-27,88.370937,1
4013,2023-12-28,87.519375,1
4014,2023-12-29,76.317813,1
4015,2023-12-30,67.118542,1


## Análise exploratória dos dados

### Decomposição das Séries Temporais

A decomposição das séries temporais ajuda a detectar padrões (tendência, sazonalidade) e identificar outras informações que podem ajudar na interpretação do que está acontecendo.

Executei a tarefa no DataFrame "df_knn", ou seja, sobre os dados carregados do arquivo Excel já com os valores faltantes imputados pelo KNNImputer (e não sobre os dados originais, sem alteração nenhuma).

In [38]:
# Decompose the series of the KNN-imputed DataFrame (df_knn).
# NOTE(review): decomp_series is a helper defined outside this section — confirm what it plots/saves.
decomp_series(df=df_knn)

### Estacionariedade

In [39]:
# Stationarity / seasonality check on the imputed data.
# NOTE(review): sp=365 is presumably the candidate seasonal period in days — confirm against the helper.
estacionariedade(df=df_knn, sp=365)

y True
y [365]
t_cv_56425000 True
t_cv_56425000 [365]
t_cv_56338500 True
t_cv_56338500 [365]
t_cv_56110005 True
t_cv_56110005 [365]
t_cv_56337200 True
t_cv_56337200 [365]
t_cv_56337500 True
t_cv_56337500 [365]
t_vz_56338500 True
t_vz_56338500 [365]
t_vz_56110005 True
t_vz_56110005 [365]
t_vz_56337200 True
t_vz_56337200 [365]
t_vz_56337500 True
t_vz_56337500 []


A série 't_vz_56337500' é estacionária, contudo, na lag 365 ela não apresenta sazonalidade.

### Correlação entre as séries

In [40]:
# Correlation map over the imputed data (helper defined outside this section).
mapa_correlacao(df=df_knn)

In [41]:
# Usando o sweetviz para avaliar
# import sweetviz as sv
# analyze_report = sv.analyze(df_knn)
# analyze_report.show_html('analyze.html', open_browser=True)

# Apresentando os resultados (serve apenas para usar no Google Colab)
# import IPython
# IPython.display.HTML('analyze.html')

In [42]:
# Work on a copy so that, if the initial DataFrame is needed again, the file does not have to be reloaded.
df_aux = df_knn.copy()

In [43]:
df_aux.drop(columns=['ds', 'unique_id']).describe()

Unnamed: 0,y,t_cv_56425000,t_cv_56338500,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
count,4017.0,4017.0,4017.0,4017.0,4017.0,4017.0,4017.0,4017.0,4017.0,4017.0
mean,118.023065,2.991348,2.596121,2.870451,2.30357,1.996776,103.375251,68.911155,66.2024,26.261579
std,120.338238,8.676786,8.403539,9.691323,7.527579,6.136959,112.261359,64.78097,61.985264,35.889518
min,21.3583,0.0,0.0,0.0,0.0,0.0,21.195,9.619672,0.0,0.0
25%,58.0981,0.0,0.0,0.0,0.0,0.0,50.425,31.555208,29.016667,4.688275
50%,82.4414,0.017697,0.0,0.0,0.0,0.0,78.9625,50.025937,51.087384,12.270833
75%,133.011,1.239609,1.074228,0.2,0.822984,0.6,115.950208,77.045208,82.9875,31.631458
max,2000.47,172.4,181.2,197.0,110.9,97.4,1936.28,819.429375,928.774167,258.6375


In [44]:
# Correlation map over df_aux. df_aux is an unmodified copy of df_knn at this point,
# so this should reproduce the correlation map already generated from df_knn above.
mapa_correlacao(df=df_aux)

### Análise de Autocorrelação

In [45]:
# The goal is to inspect the seasonality of the target variable, the flow 'y' (autocorrelation, 90 lags).
cria_plot_correlacao(serie=df_aux.y, n_lags=90, plot_pacf=False)

# More lags can be plotted, but the chart becomes unreadable.

In [46]:
# Same target series and number of lags as the previous plot, now with plot_pacf=True.
cria_plot_correlacao(serie=df_aux['y'], n_lags=90, plot_pacf=True)

### Gerando os gráficos das features em contraste com a vazão y (target).

Gerando os gráficos de vazão em conjunto com a vazão y (target) e desta com as chuvas também.

Minha intenção aqui é verificar, visualmente, as influências que as variáveis exógenas eventualmente possam ter sobre a vazão, de acordo com o período do ano.<br/>
Não é, digamos, muito científico, mas ajuda a compreender o comportamento das séries temporais.

In [47]:
# --- Figure 1: target flow (top panel) against the exogenous flow series (bottom panel) ---
fig_vazoes = make_subplots(rows=2, cols=1, subplot_titles=("variável endógena (vazão)", "variáveis exógenas (vazão)"))

fig_vazoes.add_trace(
    go.Scatter(x=df_aux['ds'], y=df_aux['y'], name='vazao_y', mode='lines',
               showlegend=True, line=dict(color="#000000", width=2)),
    row=1, col=1)

# One thin line per exogenous flow column, all on the bottom panel.
for nome_col in ('t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'):
    fig_vazoes.add_trace(
        go.Scatter(x=df_aux['ds'], y=df_aux[nome_col], name=nome_col, mode='lines',
                   showlegend=True, line=dict(width=1)),
        row=2, col=1)

# Both panels show flow, so they share the same y-axis unit.
for linha, unidade in ((1, "m³/s"), (2, "m³/s")):
    fig_vazoes.update_yaxes(title=dict(text=unidade, font=dict(family="system-ui", size=18)), row=linha, col=1)

for linha in (1, 2):
    fig_vazoes.update_xaxes(title=dict(text="Período", font=dict(family="system-ui", size=18)), row=linha, col=1)

fig_vazoes.update_layout(
    width=1500,
    height=1000,
    title=dict(text="Vazões", font=dict(family="system-ui", size=24)),
)

fig_vazoes.write_image("./resultados/trecho_alto/aed/target_com_vazoes.png")
# fig_vazoes.show()

##########

# --- Figure 2: target flow (top panel) against the exogenous rainfall series (bottom panel) ---
fig_chuvas = make_subplots(rows=2, cols=1, subplot_titles=("variável endógena (vazão)", "variáveis exógenas (chuva)"))

fig_chuvas.add_trace(
    go.Scatter(x=df_aux['ds'], y=df_aux['y'], name='vazao_y', mode='lines',
               showlegend=True, line=dict(color="#000000", width=2)),
    row=1, col=1)

# One thin line per rainfall column, all on the bottom panel.
for nome_col in ('t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500'):
    fig_chuvas.add_trace(
        go.Scatter(x=df_aux['ds'], y=df_aux[nome_col], name=nome_col, mode='lines',
                   showlegend=True, line=dict(width=1)),
        row=2, col=1)

# Top panel is flow, bottom panel is rainfall: different units per row.
for linha, unidade in ((1, "m³/s"), (2, "mm/dia")):
    fig_chuvas.update_yaxes(title=dict(text=unidade, font=dict(family="system-ui", size=18)), row=linha, col=1)

for linha in (1, 2):
    fig_chuvas.update_xaxes(title=dict(text="Período", font=dict(family="system-ui", size=18)), row=linha, col=1)

fig_chuvas.update_layout(
    width=1500,
    height=1000,
    title=dict(text="Chuvas", font=dict(family="system-ui", size=24)),
)

fig_chuvas.write_image("./resultados/trecho_alto/aed/target_com_chuvas.png")
# fig_chuvas.show()

### Análise de delay

In [48]:
# PRECISA SER SÉRIES NA MESMA ESCALA
# ISSO NÃO VAI FUNCIONAR DO JEITO QUE ESTOU PENSANDO

# from scipy.spatial.distance import euclidean
# from fastdtw import fastdtw

# # Calcula a distância dinâmica entre as séries
# distance, path = fastdtw(df_aux.y, df_aux.chuva, dist=euclidean)

# print(f"Distância dinâmica entre as séries: {distance}")

### Separação dos dados

In [49]:
# Temporal (non-shuffled) split of df_aux with test_size=0.2: the most recent rows end up in df_test.
# NOTE(review): anchor="start" is passed through to sktime's splitter — confirm its exact effect in the sktime docs.
df_train, df_test = temporal_train_test_split(df_aux, test_size=0.2, anchor="start")

In [50]:
df_train

Unnamed: 0,ds,unique_id,y,t_cv_56425000,t_cv_56338500,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
0,2013-01-01,1,82.7871,0.241148,0.202105,0.0,0.437702,0.551181,79.298366,60.056100,49.154342,29.104130
1,2013-01-02,1,80.4893,0.174561,0.024937,0.0,0.168651,0.000000,80.660440,46.950000,34.521715,41.121955
2,2013-01-03,1,78.2142,0.588994,0.848283,0.0,0.322936,0.566388,82.201442,46.703125,38.586211,51.963603
3,2013-01-04,1,79.3489,0.317681,0.425111,0.0,0.093011,0.288712,76.099995,49.239583,48.206779,17.611111
4,2013-01-05,1,129.1610,2.034343,0.869555,0.0,1.522630,1.933175,108.219680,49.780208,63.708133,22.068745
...,...,...,...,...,...,...,...,...,...,...,...,...
3208,2021-10-14,1,296.9940,0.000000,0.000000,0.0,0.000000,0.000000,136.637391,205.243929,152.212917,0.019167
3209,2021-10-15,1,196.4810,1.200000,0.200000,0.0,0.000000,27.000000,133.948333,115.983158,88.515417,0.243415
3210,2021-10-16,1,153.3710,3.200000,0.000000,0.0,0.000000,0.400000,115.950208,98.104000,63.722083,2.511458
3211,2021-10-17,1,132.3440,0.400000,0.200000,0.2,11.400000,59.400000,103.382500,76.855357,56.900000,0.674255


In [51]:
df_test

Unnamed: 0,ds,unique_id,y,t_cv_56425000,t_cv_56338500,t_cv_56110005,t_cv_56337200,t_cv_56337500,t_vz_56338500,t_vz_56110005,t_vz_56337200,t_vz_56337500
3213,2021-10-19,1,169.278000,65.6,0.2,0.0,32.0,10.0,139.491875,88.814000,72.926667,84.633333
3214,2021-10-20,1,231.629000,17.6,0.0,3.8,19.6,12.8,143.208958,132.886857,79.814583,83.641875
3215,2021-10-21,1,251.648000,1.8,0.2,0.0,0.6,0.0,145.313750,127.408229,93.158333,89.416667
3216,2021-10-22,1,226.717000,0.2,0.0,0.0,0.0,0.0,137.791250,136.881383,101.487500,11.988542
3217,2021-10-23,1,179.687000,0.0,0.2,0.0,0.0,0.0,123.374375,112.325937,84.465000,1.180208
...,...,...,...,...,...,...,...,...,...,...,...,...
4012,2023-12-27,1,88.370937,8.2,0.2,0.0,2.2,8.6,99.451667,70.568804,0.610000,28.053750
4013,2023-12-28,1,87.519375,2.0,1.2,0.0,0.0,0.8,93.551667,60.014457,33.753333,21.705417
4014,2023-12-29,1,76.317813,0.0,0.0,0.0,0.0,0.0,77.745652,49.910909,61.942917,21.022917
4015,2023-12-30,1,67.118542,14.8,3.4,3.0,4.8,19.8,70.258261,47.746848,58.080000,22.252500


In [52]:
# Only the target column (the flow 'y') needs to be plotted to illustrate the train/test split.

fig = go.Figure()

# One line per partition, in chronological order (train first, then test).
for particao, rotulo in ((df_train, 'treino'), (df_test, 'teste')):
    fig.add_trace(go.Scatter(x=particao['ds'], y=particao['y'], mode='lines', name=rotulo))

fig.update_yaxes(title=dict(text="Vazão (m³/s)", font=dict(family="system-ui", size=18)))
fig.update_xaxes(title=dict(text="Período", font=dict(family="system-ui", size=18)))

fig.update_layout(
    width=1500,
    height=700,
    hovermode="x unified",
    title=dict(text="Vazão 'y' (target)", font=dict(family="system-ui", size=24)),
)

fig.write_image("./resultados/trecho_alto/aed/separacao_dados.png")
# fig.show()

# Variáveis globais

Estas variáveis serão utilizadas tanto pelos modelos de ML quanto pelas Redes Neurais

In [53]:
look_back = 7 # Number of lags to use.
fch_v = [3, 5, 7, 10, 15, 30, 60, 90] # Forecast horizons (the data frequency is daily, so each value means that many days)

# MLForecast

#### Vazões exógenas calculadas no fch

##### Não otimizados

In [54]:
# Baseline (non-optimized) models: library defaults apart from the fixed seed.
models = [
    LGBMRegressor(random_state=5),  # using 'gbdt' - Gradient Boosting Decision Tree
    LinearRegression(),
    LinearSVR(random_state=5),
]

fcst = mlf.MLForecast(
    models=models,
    freq='D',
    lags=list(range(1, look_back + 1)),
    date_features=['year', 'month', 'quarter', 'dayofyear', 'week'],
)

fcst.fit(df_train, static_features=[])

# To confirm that the input features are the ones actually chosen:
# fcst.ts.features_order_

for f in fch_v:
    # Exogenous flow values for the next `f` days, built by the notebook helper.
    df_test_futr = cria_dataframe_futuro(
        df_futr=fcst.make_future_dataframe(h=f),
        df_train=df_train,
        df_test=df_test,
        tp_valor='ml',
        n_lags=look_back,
        cols=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
    )

    df_preds = fcst.predict(h=f, X_df=df_test_futr)
    # Attach the observed target to each predicted date.
    df_joined = df_preds.merge(df_test[['ds', 'y']], on=['ds'], how='left')

    # One row of metrics per model, in the same order the models were declared.
    metrics = {
        nome: {
            'sMAPE': smape(df_joined['y'], df_joined[nome]),
            'RMSE': rmse(df_joined['y'], df_joined[nome]),
            'MAE': mae(df_joined['y'], df_joined[nome]),
        }
        for nome in ('LGBMRegressor', 'LinearRegression', 'LinearSVR')
    }

    df_tbl_v = pd.DataFrame(metrics).T.reset_index(names="Modelo")

    # ============================================================================ #

    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, specs=[[{"type": "scatter"}], [{"type": "table"}]])

    # (column, legend label, line style) for each curve of the top panel.
    curvas = (
        ('y', 'observado', dict(color="black", width=4)),
        ('LGBMRegressor', 'LGBM', dict(color="red")),
        ('LinearRegression', 'LR', dict(color="darkviolet")),
        ('LinearSVR', 'LinearSVR', dict(color="green")),
    )
    for coluna, rotulo, estilo in curvas:
        fig.add_trace(
            go.Scatter(x=df_joined['ds'], y=df_joined[coluna], mode='lines', name=rotulo, line=estilo),
            row=1, col=1)

    # Metrics table on the bottom panel.
    tabela = go.Table(
        header=dict(values=df_tbl_v.columns.to_list(), font=dict(size=14), align="center"),
        cells=dict(values=df_tbl_v.T, font=dict(size=14), height=24, align="left"),
    )
    fig.append_trace(tabela, row=2, col=1)

    fig.update_yaxes(title=dict(text="Vazão (m³/s)", font=dict(family="system-ui", size=18)))
    fig.update_xaxes(title=dict(text="Período", font=dict(family="system-ui", size=18)))

    fig.update_traces(hovertemplate=None, row=1, col=1)

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode='x unified',
        title=dict(text="Modelos de ML não otimizados (fch = {f})".format(f=f),
                   font=dict(family="system-ui", size=24)),
    )

    fig.write_image("./resultados/trecho_alto/fch{fh}/naoopt/ml.png".format(fh=f))
    # fig.show()

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4275
[LightGBM] [Info] Number of data points in the train set: 3206, number of used features: 21
[LightGBM] [Info] Start training from score 101.885109


##### Otimizados

In [55]:
def opt_lgbm(trial, fh):
    """Optuna objective for the LightGBM forecaster.

    Samples LGBMRegressor hyperparameters and the number of lags from ``trial``,
    fits an MLForecast model on the global ``df_train``, predicts ``fh`` steps
    ahead and returns the sMAPE against the observed values in the global
    ``df_test``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        fh: Forecast horizon (number of steps to predict).

    Returns:
        The sMAPE of the forecast (lower is better).
    """

    # LGBMRegressor hyperparameters.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    params = {
        'num_leaves' : trial.suggest_int('num_leaves', 4, 256),
        'n_estimators' : trial.suggest_int('n_estimators', 1, 100),
        'learning_rate' : trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 1, 50),
        'bagging_fraction' : trial.suggest_float('bagging_fraction', 0.01, 0.99, log=True),
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.01, 0.99, log=True)
    }

    # Forecaster hyperparameter: how many lags of the target are used as features.
    n_lags = trial.suggest_int('n_lags', 1, look_back, step=1)

    modelo = [LGBMRegressor(verbosity=-1, bagging_freq=1, random_state=5, **params)]

    fcst = mlf.MLForecast(models=modelo, freq='D',
                          lags=[i+1 for i in range(n_lags)],
                          date_features=['year', 'month', 'quarter', 'dayofyear', 'week'])

    fcst.fit(df_train, id_col='unique_id', time_col='ds', target_col='y', static_features=[])

    # NOTE(review): the helper receives the global look_back, not the sampled n_lags —
    # confirm this is intended.
    _df_futr = cria_dataframe_futuro(df_futr=fcst.make_future_dataframe(h=fh),
                                     df_train=df_train,
                                     df_test=df_test,
                                     tp_valor='ml',
                                     n_lags=look_back,
                                     cols=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'])

    p = fcst.predict(h=fh, X_df=_df_futr)
    # The loss is computed against df_test: attach the observed target to each predicted date.
    df_result = pd.merge(left=p, right=df_test[['ds', 'y']], on=['ds'], how='left')

    loss = smape(df_result['y'], df_result['LGBMRegressor'])

    return loss

def opt_lsvr(trial, fh):
    """Optuna objective for the LinearSVR forecaster.

    Samples LinearSVR hyperparameters and the number of lags from ``trial``,
    fits an MLForecast model on the global ``df_train``, predicts ``fh`` steps
    ahead and returns the sMAPE against the observed values in the global
    ``df_test``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        fh: Forecast horizon (number of steps to predict).

    Returns:
        The sMAPE of the forecast (lower is better).
    """

    # LinearSVR hyperparameters.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
    params = {
        'loss' : trial.suggest_categorical('loss', ['epsilon_insensitive', 'squared_epsilon_insensitive']),
        'intercept_scaling' : trial.suggest_float('intercept_scaling', 0.00001, 2.0, log=True),
        'tol' : trial.suggest_float('tol', 0.00001, 2.0, log=True),
        'C' : trial.suggest_float('C', 0.00001, 2.0, log=True),
        'epsilon' : trial.suggest_float('epsilon', 0.00001, 2.0, log=True)
    }

    # Forecaster hyperparameter: how many lags of the target are used as features.
    n_lags = trial.suggest_int('n_lags', 1, look_back, step=1)

    model = [LinearSVR(random_state=5, **params)]

    fcst = mlf.MLForecast(models=model, freq='D',
                          lags=[i+1 for i in range(n_lags)],
                          date_features=['year', 'month', 'quarter', 'dayofyear', 'week'])

    fcst.fit(df_train, id_col='unique_id', time_col='ds', target_col='y', static_features=[])

    # NOTE(review): the helper receives the global look_back, not the sampled n_lags —
    # confirm this is intended.
    _df_futr = cria_dataframe_futuro(df_futr=fcst.make_future_dataframe(h=fh),
                                     df_train=df_train,
                                     df_test=df_test,
                                     tp_valor='ml',
                                     n_lags=look_back,
                                     cols=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'])

    p = fcst.predict(h=fh, X_df=_df_futr)
    # The loss is computed against df_test: attach the observed target to each predicted date.
    df_result = pd.merge(left=p, right=df_test[['ds', 'y']], on=['ds'], how='left')

    loss = smape(df_result['y'], df_result['LinearSVR'])

    return loss

############################

# Keep only the parameters of the best trial of each study, keyed by the position
# of the horizon inside fch_v.
lgbm_best_trial = {}
lsvr_best_trial = {}

for idx_fch, f in enumerate(fch_v):
    # Same sampler seed for both studies so the searches are reproducible.
    study_lgbm = opt.create_study(direction='minimize', sampler=opt.samplers.TPESampler(seed=5))
    study_lsvr = opt.create_study(direction='minimize', sampler=opt.samplers.TPESampler(seed=5))

    # Bind the current horizon into a fresh callable for each study. The objective
    # functions themselves are NOT rebound, so `opt_lgbm` / `opt_lsvr` keep referring
    # to the original functions after the loop.
    study_lgbm.optimize(partial(opt_lgbm, fh=f), n_trials=100, timeout=1000, catch=(FloatingPointError, ValueError, ))
    study_lsvr.optimize(partial(opt_lsvr, fh=f), n_trials=100, timeout=1000, catch=(FloatingPointError, ValueError, ))

    lgbm_best_trial[idx_fch] = {'modelo' : 'LGBM',
                                'fch' : f,
                                'best_value' : study_lgbm.best_value,
                                'best_params' : study_lgbm.best_params}

    lsvr_best_trial[idx_fch] = {'modelo' : 'LinearSVR',
                                'fch' : f,
                                'best_value' : study_lsvr.best_value,
                                'best_params' : study_lsvr.best_params}
  
# Rebuild the models from the best hyperparameters found for each horizon,
# then forecast, score and plot them.
for f, chave, _ in zip(fch_v, lgbm_best_trial, lsvr_best_trial):
    params_lgbm = lgbm_best_trial[chave]['best_params']
    params_lsvr = lsvr_best_trial[chave]['best_params']

    # ----------------------------- LightGBM ----------------------------- #

    m_lgbm = [
        LGBMRegressor(
            verbosity=-1, bagging_freq=1, random_state=5,
            n_estimators=params_lgbm['n_estimators'],
            learning_rate=params_lgbm['learning_rate'],
            num_leaves=params_lgbm['num_leaves'],
            min_data_in_leaf=params_lgbm['min_data_in_leaf'],
            bagging_fraction=params_lgbm['bagging_fraction'],
            colsample_bytree=params_lgbm['colsample_bytree'],
        )
    ]

    fcst_lgbm = mlf.MLForecast(
        models=m_lgbm,
        freq='D',
        lags=list(range(1, params_lgbm['n_lags'] + 1)),
        date_features=['year', 'month', 'quarter', 'dayofyear', 'week'],
    )

    fcst_lgbm.fit(df_train, id_col='unique_id', time_col='ds', target_col='y', static_features=[])

    df_futr_gbm = cria_dataframe_futuro(
        df_futr=fcst_lgbm.make_future_dataframe(h=f),
        df_train=df_train,
        df_test=df_test,
        tp_valor='ml',
        n_lags=look_back,
        cols=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
    )

    preds_lgbm = fcst_lgbm.predict(h=f, X_df=df_futr_gbm)
    # Start the merged frame with the LightGBM predictions plus the observed target.
    df_merged = pd.merge(left=preds_lgbm, right=df_test[['ds', 'y']], on=['ds'], how='left')

    # ----------------------------- LinearSVR ---------------------------- #

    m_lsvr = [
        LinearSVR(
            random_state=5,
            loss=params_lsvr['loss'],
            intercept_scaling=params_lsvr['intercept_scaling'],
            tol=params_lsvr['tol'],
            C=params_lsvr['C'],
            epsilon=params_lsvr['epsilon'],
        )
    ]

    fcst_lsvr = mlf.MLForecast(
        models=m_lsvr,
        freq='D',
        lags=list(range(1, params_lsvr['n_lags'] + 1)),
        date_features=['year', 'month', 'quarter', 'dayofyear', 'week'],
    )

    fcst_lsvr.fit(df_train, id_col='unique_id', time_col='ds', target_col='y', static_features=[])

    df_futr_svr = cria_dataframe_futuro(
        df_futr=fcst_lsvr.make_future_dataframe(h=f),
        df_train=df_train,
        df_test=df_test,
        tp_valor='ml',
        n_lags=look_back,
        cols=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
    )

    preds_lsvr = fcst_lsvr.predict(h=f, X_df=df_futr_svr)
    # Add the LinearSVR predictions to the frame that already holds LightGBM and 'y'.
    df_merged = pd.merge(left=preds_lsvr, right=df_merged, on=['ds'], how='left')

    # ------------------------------ Metrics ----------------------------- #

    metrics = {
        nome: {
            'sMAPE': smape(df_merged['y'], df_merged[nome]),
            'RMSE': rmse(df_merged['y'], df_merged[nome]),
            'MAE': mae(df_merged['y'], df_merged[nome]),
        }
        for nome in ('LGBMRegressor', 'LinearSVR')
    }

    df_tbl = pd.DataFrame(metrics).T.reset_index(names="Modelo")  # feeds the metrics table of the figure

    # ------------------------------- Plot ------------------------------- #

    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, specs=[[{"type": "scatter"}], [{"type": "table"}]])

    # (column, legend label, line style) for each curve of the top panel.
    curvas = (
        ('y', 'observado', dict(color="black", width=4)),
        ('LGBMRegressor', 'LGBM', dict(color="red")),
        ('LinearSVR', 'LinearSVR', dict(color="green")),
    )
    for coluna, rotulo, estilo in curvas:
        fig.add_trace(
            go.Scatter(x=df_merged['ds'], y=df_merged[coluna], mode='lines', name=rotulo, line=estilo),
            row=1, col=1)

    tabela = go.Table(
        header=dict(values=df_tbl.columns.to_list(), font=dict(size=14), align="center"),
        cells=dict(values=df_tbl.T, font=dict(size=14), height=24, align="left"),
    )
    fig.append_trace(tabela, row=2, col=1)

    fig.update_yaxes(title=dict(text="Vazão (m³/s)", font=dict(family="system-ui", size=18)))
    fig.update_xaxes(title=dict(text="Período", font=dict(family="system-ui", size=18)))

    fig.update_traces(hovertemplate=None, row=1, col=1)

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode='x unified',
        title=dict(text="Modelos de ML otimizados (fch = {f})".format(f=f),
                   font=dict(family="system-ui", size=24)),
    )

    fig.write_image("./resultados/trecho_alto/fch{fh}/opt/ml.png".format(fh=f))
    # fig.show()

# Redes Neurais

LSTM (RNN) e NBEATSx (MLP)

In [56]:
# Future-exogenous frame shared by both networks: 'ds', 'unique_id' and the 't_cv_*' columns of the test split.
df_futr = df_test[['ds', 'unique_id', 't_cv_56425000', 't_cv_56338500', 't_cv_56110005','t_cv_56337200', 't_cv_56337500']]

#### Não otimizado

In [57]:
for f in fch_v:
    # Both networks get the flow columns as historical exogenous inputs and the
    # 't_cv_*' columns as future exogenous inputs; only the look-back argument name differs.
    lstm = LSTM(
        random_seed=5,
        h=f,
        # max_steps=100,
        hist_exog_list=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
        futr_exog_list=['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500'],
        context_size=look_back,
        scaler_type=None,
        logger=False,
    )

    nbeatsx = NBEATSx(
        random_seed=5,
        h=f,
        # max_steps=100,
        hist_exog_list=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
        futr_exog_list=['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500'],
        input_size=look_back,
        scaler_type=None,
        logger=False,
    )

    modelos = [lstm, nbeatsx]

    nf = NeuralForecast(models=modelos, freq='D', local_scaler_type='minmax')
    nf.fit(df=df_train)

    df_preds = nf.predict(futr_df=df_futr)
    # Attach the observed target to each predicted date.
    df_merged = df_preds.merge(df_test[['ds', 'y']], on=['ds'], how='left')

    # ============================================================================ #

    # One row of metrics per network.
    metrics = {
        nome: {
            'sMAPE': smape(df_merged['y'], df_merged[nome]),
            'RMSE': rmse(df_merged['y'], df_merged[nome]),
            'MAE': mae(df_merged['y'], df_merged[nome]),
        }
        for nome in ('LSTM', 'NBEATSx')
    }
    df_tbl_v = pd.DataFrame(metrics).T.reset_index(names="Modelo")

    # ============================================================================ #

    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, specs=[[{"type": "scatter"}], [{"type": "table"}]])

    # (column, legend label, line style) for each curve of the top panel.
    curvas = (
        ('y', 'observado', dict(color="black", width=4)),
        ('LSTM', 'LSTM', dict(color="darkorange")),
        ('NBEATSx', 'NBEATSx', dict(color="olive")),
    )
    for coluna, rotulo, estilo in curvas:
        fig.add_trace(
            go.Scatter(x=df_merged['ds'], y=df_merged[coluna], mode='lines', name=rotulo, line=estilo),
            row=1, col=1)

    tabela = go.Table(
        header=dict(values=df_tbl_v.columns.to_list(), font=dict(size=14), align="center"),
        cells=dict(values=df_tbl_v.T, font=dict(size=14), height=24, align="left"),
    )
    fig.append_trace(tabela, row=2, col=1)

    fig.update_yaxes(title=dict(text="Vazão (m³/s)", font=dict(family="system-ui", size=18)))
    fig.update_xaxes(title=dict(text="Período", font=dict(family="system-ui", size=18)))

    fig.update_traces(hovertemplate=None, row=1, col=1)

    fig.update_layout(
        width=1500,
        height=1000,
        hovermode='x unified',
        title=dict(text="Redes Neurais não otimizadas (fch = {f})".format(f=f),
                   font=dict(family="system-ui", size=24)),
    )

    fig.write_image("./resultados/trecho_alto/fch{fh}/naoopt/dl.png".format(fh=f))
    # fig.show()

Global seed set to 5
Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5
Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5
Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5
Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5
Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5
Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5
Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5
Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

#### Otimizado

In [58]:
def opt_lstm(trial, fh):
    """Optuna objective for the LSTM model.

    Samples a set of hyperparameters from ``trial``, fits an LSTM on
    ``df_train`` with forecast horizon ``fh``, predicts using ``df_futr`` and
    scores the forecast against ``df_test``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        fh: Forecast horizon passed to the model as ``h``.

    Returns:
        The symmetric MAPE between ``df_test['y']`` and the ``LSTM`` forecast,
        joined on ``ds`` (lower is better).

    Raises:
        ValueError: Propagated from the metric when the forecast contains NaN.
            The calling study lists ``ValueError`` in ``catch``, so such trials
            are recorded as failed.

    NOTE(review): the loss is computed on ``df_test``, so the test split takes
    part in hyperparameter selection -- confirm this is intended.
    """
    # Sampling order is kept stable: with a seeded sampler it determines which
    # values each trial receives.
    params = {
        'encoder_hidden_size': trial.suggest_categorical('encoder_hidden_size', [8, 16, 32, 64, 128, 256]),
        'decoder_hidden_size': trial.suggest_categorical('decoder_hidden_size', [8, 16, 32, 64, 128, 256]),
        'encoder_n_layers': trial.suggest_categorical('encoder_n_layers', [1, 2, 3, 4]),
        'decoder_layers': trial.suggest_categorical('decoder_layers', [1, 2, 3, 4]),
        # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
        'context_size': trial.suggest_int('context_size', 1, 3*look_back),
        'max_steps': trial.suggest_int('max_steps', 100, 500, step=25),
    }

    # Not an LSTM argument: it configures NeuralForecast, so it stays out of `params`.
    local_scaler_type = trial.suggest_categorical('local_scaler_type', ["standard", "robust", "minmax"])

    model = [LSTM(random_seed=5, h=fh,
                  hist_exog_list=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
                  futr_exog_list=['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500'],
                  scaler_type=None,
                  logger=False,
                  **params)]

    nfc = NeuralForecast(models=model, freq='D', local_scaler_type=local_scaler_type)
    nfc.fit(df=df_train)

    p = nfc.predict(futr_df=df_futr)
    # Left join on `ds` attaches the observed `y` to each forecast row.
    df_result = pd.merge(left=p, right=df_test[['ds', 'y']], on=['ds'], how='left')

    return smape(df_result['y'], df_result['LSTM'])

# ============================ #

def opt_nbeatsx(trial, fh):
    """Optuna objective for the NBEATSx model.

    Samples a set of hyperparameters from ``trial``, fits an NBEATSx model on
    ``df_train`` with forecast horizon ``fh``, predicts using ``df_futr`` and
    scores the forecast against ``df_test``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        fh: Forecast horizon passed to the model as ``h``.

    Returns:
        The symmetric MAPE between ``df_test['y']`` and the ``NBEATSx``
        forecast, joined on ``ds`` (lower is better).

    Raises:
        ValueError: Propagated from the metric when the forecast contains NaN.
            The calling study lists ``ValueError`` in ``catch``, so such trials
            are recorded as failed.

    NOTE(review): the loss is computed on ``df_test``, so the test split takes
    part in hyperparameter selection -- confirm this is intended.
    """
    # Sampling order is kept stable: with a seeded sampler it determines which
    # values each trial receives.
    # `suggest_float(..., log=True)` replaces the deprecated `suggest_loguniform`.
    learning_rate = trial.suggest_float('learning_rate', 0.001, 0.5, log=True)
    activation = trial.suggest_categorical('activation', ["ReLU", "Softplus", "Tanh", "SELU", "LeakyReLU", "PReLU", "Sigmoid"])
    n_blocks1 = trial.suggest_int('n_blocks1', 1, 5)
    n_blocks2 = trial.suggest_int('n_blocks2', 1, 5)
    n_blocks3 = trial.suggest_int('n_blocks3', 1, 5)
    mlp_units = trial.suggest_int('mlp_units', 16, 512, step=8)
    n_harmonics = trial.suggest_int('n_harmonics', 1, 5)
    n_polynomials = trial.suggest_int('n_polynomials', 1, 5)
    dropout_prob_theta = trial.suggest_float('dropout_prob_theta', 0.01, 0.2, log=True)
    input_size = trial.suggest_int('input_size', 1, 3*look_back)
    max_steps = trial.suggest_int('max_steps', 100, 500, step=25)

    local_scaler_type = trial.suggest_categorical('local_scaler_type', ["standard", "robust", "minmax"])

    # One entry per stack (seasonality, trend, identity); every stack uses the
    # same sampled width for both of its MLP layers.
    modelo = [NBEATSx(random_seed=5, h=fh, max_steps=max_steps,
                      stack_types=['seasonality', 'trend', 'identity'],
                      n_blocks=[n_blocks1, n_blocks2, n_blocks3],
                      mlp_units=[[mlp_units, mlp_units], [mlp_units, mlp_units], [mlp_units, mlp_units]],
                      n_harmonics=n_harmonics,
                      n_polynomials=n_polynomials,
                      dropout_prob_theta=dropout_prob_theta,
                      activation=activation,
                      hist_exog_list=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
                      futr_exog_list=['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500'],
                      learning_rate=learning_rate,
                      input_size=input_size,
                      scaler_type=None,
                      logger=False)]

    nfc_opt = NeuralForecast(models=modelo, freq='D', local_scaler_type=local_scaler_type)
    nfc_opt.fit(df=df_train)

    p = nfc_opt.predict(futr_df=df_futr)
    # Left join on `ds` attaches the observed `y` to each forecast row.
    df_result = pd.merge(left=p, right=df_test[['ds', 'y']], on=['ds'], how='left')

    return smape(df_result['y'], df_result['NBEATSx'])

# ============================ #

# Best trial per forecast horizon, keyed by the horizon's position in `fch_v`.
lstm_best_trial = {}
nbeatsx_best_trial = {}

for i, f in enumerate(fch_v):
    # One study per model, each with a seeded TPE sampler (seed=5).
    study_lstm = opt.create_study(direction='minimize', sampler=opt.samplers.TPESampler(seed=5))
    study_nbeatsx = opt.create_study(direction='minimize', sampler=opt.samplers.TPESampler(seed=5))

    # The horizon is bound with `partial` at the call site so the objective
    # functions `opt_lstm` / `opt_nbeatsx` are not rebound to partial objects.
    # Trials that raise one of the `catch` exceptions (e.g. NaN forecasts making
    # the metric raise ValueError) are logged as failed and the study continues.
    study_lstm.optimize(partial(opt_lstm, fh=f), n_trials=20, timeout=1000,
                        catch=(FloatingPointError, ValueError))
    study_nbeatsx.optimize(partial(opt_nbeatsx, fh=f), n_trials=20, timeout=1000,
                           catch=(FloatingPointError, ValueError))

    lstm_best_trial[i] = {'fch': f,
                          'best_value': study_lstm.best_value,
                          'best_params': study_lstm.best_params}

    nbeatsx_best_trial[i] = {'fch': f,
                             'best_value': study_nbeatsx.best_value,
                             'best_params': study_nbeatsx.best_params}

# lstm_best_trial, nbeatsx_best_trial

# Re-fit each model with its best hyperparameters and plot the result.
for i, f in enumerate(fch_v):
    best_lstm = lstm_best_trial[i]['best_params']
    best_nbx = nbeatsx_best_trial[i]['best_params']

    # Same fixed arguments as in `opt_lstm` (historic exogenous columns and
    # `scaler_type=None`); without them the re-fit model would differ from the
    # one that produced the best trial.
    m_lstm = [LSTM(random_seed=5, h=f,
                   max_steps=best_lstm['max_steps'],
                   hist_exog_list=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
                   futr_exog_list=['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500'],
                   learning_rate=best_lstm['learning_rate'],
                   encoder_hidden_size=best_lstm['encoder_hidden_size'],
                   encoder_n_layers=best_lstm['encoder_n_layers'],
                   decoder_hidden_size=best_lstm['decoder_hidden_size'],
                   decoder_layers=best_lstm['decoder_layers'],
                   context_size=best_lstm['context_size'],
                   scaler_type=None,
                   logger=False)]

    nfc_lstm = NeuralForecast(models=m_lstm, freq='D', local_scaler_type=best_lstm['local_scaler_type'])
    nfc_lstm.fit(df=df_train)

    p = nfc_lstm.predict(futr_df=df_futr)
    # Left join on `ds` attaches the observed `y` to each LSTM forecast row.
    df_result = pd.merge(left=p, right=df_test[['ds', 'y']], on=['ds'], how='left')

    # ============================================================================ #

    # Same fixed arguments as in `opt_nbeatsx`, including `scaler_type=None`.
    m_nbx = [NBEATSx(random_seed=5, h=f,
                     max_steps=best_nbx['max_steps'],
                     stack_types=['seasonality', 'trend', 'identity'],
                     n_blocks=[best_nbx['n_blocks1'],
                               best_nbx['n_blocks2'],
                               best_nbx['n_blocks3']],
                     mlp_units=[[best_nbx['mlp_units'], best_nbx['mlp_units']],
                                [best_nbx['mlp_units'], best_nbx['mlp_units']],
                                [best_nbx['mlp_units'], best_nbx['mlp_units']]],
                     n_harmonics=best_nbx['n_harmonics'],
                     n_polynomials=best_nbx['n_polynomials'],
                     dropout_prob_theta=best_nbx['dropout_prob_theta'],
                     activation=best_nbx['activation'],
                     hist_exog_list=['t_vz_56338500', 't_vz_56110005', 't_vz_56337200', 't_vz_56337500'],
                     futr_exog_list=['t_cv_56425000', 't_cv_56338500', 't_cv_56110005', 't_cv_56337200', 't_cv_56337500'],
                     learning_rate=best_nbx['learning_rate'],
                     input_size=best_nbx['input_size'],
                     scaler_type=None,
                     logger=False)]

    nfc_nbx = NeuralForecast(models=m_nbx, freq='D', local_scaler_type=best_nbx['local_scaler_type'])
    nfc_nbx.fit(df=df_train)

    p = nfc_nbx.predict(futr_df=df_futr)
    # Add the NBEATSx forecast next to the LSTM forecast and the observed `y`.
    df_result = pd.merge(left=p, right=df_result, on=['ds'], how='left')

    # ============================================================================ #

    # Same three metrics for each model; one row per model in the table below.
    metrics = {
        nome: {
            'sMAPE': smape(df_result.y, df_result[nome]),
            'RMSE': rmse(df_result.y, df_result[nome]),
            'MAE': mae(df_result.y, df_result[nome]),
        }
        for nome in ('LSTM', 'NBEATSx')
    }
    df_tbl = pd.DataFrame(metrics).T.reset_index(names="Modelo")

    # ============================================================================ #

    # Row 1: observed vs. forecast curves. Row 2: metrics table.
    fig = make_subplots(rows=2, cols=1, vertical_spacing=0.2, specs=[[{"type": "scatter"}], [{"type": "table"}]])

    fig.add_trace(go.Scatter(x=df_result['ds'], y=df_result['y'], mode='lines', name='observado', line=dict(color='black', width=4)), row=1, col=1)
    fig.add_trace(go.Scatter(x=df_result['ds'], y=df_result['LSTM'], mode='lines', name='LSTM', line=dict(color='darkorange')), row=1, col=1)
    fig.add_trace(go.Scatter(x=df_result['ds'], y=df_result['NBEATSx'], mode='lines', name='NBEATSx', line=dict(color='olive')), row=1, col=1)

    # `add_trace` replaces the deprecated `append_trace`.
    fig.add_trace(go.Table(header=dict(values=df_tbl.columns.to_list(), font=dict(size=14), align="center"),
                           cells=dict(values=df_tbl.T, font=dict(size=14), height=24, align="left")),
                  row=2, col=1)

    fig.update_yaxes(title=dict(text="Vazão (m³/s)", font=dict(family="system-ui", size=18)))
    fig.update_xaxes(title=dict(text="Período", font=dict(family="system-ui", size=18)))

    # Clear the per-trace hover template on the scatter row; the layout below
    # sets hovermode to 'x unified'.
    fig.update_traces(hovertemplate=None, row=1, col=1)

    fig.update_layout(width=1500, height=1000, hovermode='x unified',
                      title=dict(text=f"Rede Neurais otimizadas (fch = {f})",
                                 font=dict(family="system-ui", size=24)))

    # One image per horizon; the target directory must already exist.
    fig.write_image(f"./resultados/trecho_alto/fch{f}/opt/dl.png")
    # fig.show()

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

[W 2024-03-27 17:59:39,576] Trial 5 failed with parameters: {'learning_rate': 0.45588433271619744, 'activation': 'ReLU', 'n_blocks1': 5, 'n_blocks2': 5, 'n_blocks3': 3, 'mlp_units': 464, 'n_harmonics': 2, 'n_polynomials': 1, 'dropout_prob_theta': 0.04844794404603945, 'input_size': 14, 'max_steps': 475, 'local_scaler_type': 'standard'} because of the following error: ValueError('Input contains NaN.').
Traceback (most recent call last):
  File "c:\Users\welson\anaconda3\envs\dissertacao_py39\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\welson\AppData\Local\Temp\ipykernel_15020\2300118051.py", line 69, in opt_nbeatsx
    loss = smape(df_result['y'], df_result['NBEATSx'])
  File "c:\Users\welson\anaconda3\envs\dissertacao_py39\lib\site-packages\sktime\performance_metrics\forecasting\_classes.py", line 169, in __call__
    return self.evaluate(y_true, y_pred, **kwargs)
  File "c:\Users\welson\anaconda3\envs\disserta

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

[W 2024-03-27 18:16:10,067] Trial 5 failed with parameters: {'encoder_hidden_size': 32, 'decoder_hidden_size': 128, 'encoder_n_layers': 4, 'decoder_layers': 3, 'learning_rate': 0.4597670034170664, 'context_size': 3, 'max_steps': 325, 'local_scaler_type': 'robust'} because of the following error: ValueError('Input contains NaN.').
Traceback (most recent call last):
  File "c:\Users\welson\anaconda3\envs\dissertacao_py39\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\welson\AppData\Local\Temp\ipykernel_15020\2300118051.py", line 27, in opt_lstm
    loss = smape(df_result['y'], df_result['LSTM'])
  File "c:\Users\welson\anaconda3\envs\dissertacao_py39\lib\site-packages\sktime\performance_metrics\forecasting\_classes.py", line 169, in __call__
    return self.evaluate(y_true, y_pred, **kwargs)
  File "c:\Users\welson\anaconda3\envs\dissertacao_py39\lib\site-packages\sktime\performance_metrics\forecasting\_classes.py"

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Predicting: 0it [00:00, ?it/s]

Global seed set to 5


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

# HydroBR

In [None]:
# help(hbr.get_data.ANA)

In [None]:
# estacoes_inmet = hbr.get_data.INMET.list_stations(station_type='both')

In [None]:
# estacoes_inmet

In [None]:
# estacoes_inmet.query("Code == 'A255'")

In [None]:
# cod = 'A255'
# df_dados = hbr.get_data.INMET.daily_data(station_code=cod)

In [None]:
# df_dados

In [None]:
# estacoes_ana_vazao = hbr.get_data.ANA.list_flow(state='MINAS GERAIS', source='ANA')

In [None]:
# estacoes_ana_vazao

In [None]:
# estacoes de vazão tbm e cota
# 56425000 56338500 56338080 56110005 56337200 56337500

# estacoes_ana_vazao.query("Code == '56337500'")

In [None]:
# estacao_principal = '56338500'
# df_result = get_convencional(codEstacao=estacao_principal,
#                              dataInicio='2013-01-01',
#                              dataFim='2023-12-31',
#                              tipoDados=1,
#                              nivelConsistencia='')

# df_result

# df_result.index = pd.to_datetime(df_result.index)
# df_result.Cota = pd.to_numeric(df_result.Cota, errors='coerce')
# df_result.Chuva = pd.to_numeric(df_result.Chuva, errors='coerce')
# df_result.Vazao = pd.to_numeric(df_result.Vazao, errors='coerce')

# df_result = df_result.resample('D').agg({'Cota': 'sum', 'Chuva': 'mean', 'Vazao': 'mean'})

# df_result.index.name

# df_result.columns = ['t_ct_'+str(estacao_principal), 't_cv_'+str(estacao_principal), 't_vz_'+str(estacao_principal)]

# # Agora que já tenho os dados da estação que considero principal na análise (target)
# #   vou agregar com os dados das demais estações
# list_estacoes_tele = ['56338500', '56338080', '56110005', '56337200', '56337500']

# for e in list_estacoes_tele:
#     df_temp = get_convencional(codEstacao=e, dataInicio="2013-01-01", dataFim="2023-12-31")

#     # Convertendo os dados
#     df_temp.index = pd.to_datetime(df_temp.index)
#     df_temp.Cota = pd.to_numeric(df_temp.Cota, errors='coerce')
#     df_temp.Chuva = pd.to_numeric(df_temp.Chuva, errors='coerce')
#     df_temp.Vazao = pd.to_numeric(df_temp.Vazao, errors='coerce')

#     # Para as telemétricas já agrego aqui mesmo
#     df_temp = df_temp.resample('D').agg({'Cota': 'sum', 'Chuva': 'mean', 'Vazao': 'mean'})

#     # Ajusto os nomes das colunas para indicar de qual estação os dados vieram
#     df_temp.columns = ['t_ct_'+e, 't_cv_'+e, 't_vz_'+e]

#     df_result = pd.concat([df_result, df_temp], axis=1)

In [None]:
# df_result