In [32]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split

## Gerar observações da vazão observada presente nos dados de treinamento e teste p/ os datasets das redes neurais e algoritmos tradicionais

In [33]:
RANDOM_SEED=21

path_datasets = "../../../../data/datasets"
dataset = "Itaipu_POC_VAZAO_V2.csv"

## Número de Semanas Operativas Retroativas a serem utilizadas no Treinamento dos Algoritmos. min(n)=1
n = 3

## Número da Semana Operativa Futura da Vazão a ser prevista pelos Modelos. min(f)=1
f = 4

df = pd.read_csv(f'{path_datasets}/{dataset}', index_col='time')

### Redes Neurais

In [34]:
def dataset_constructor(df, f):
    df['time'] = df.index
    df.reset_index(drop=True,inplace=True)
    df[f'bacia_prec_sum_shift_f={f}'] = df['bacia_prec_sum'].shift(-f)
    df = df.dropna()

    return df

In [35]:
def split_sequences(sequences, n_steps, f_pred):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix+(f_pred-1) >= len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :], sequences[end_ix+(f_pred-1), [0,2]]
        X.append(seq_x)
        y.append(seq_y)
        
    return np.array(X), np.array(y)

In [36]:
def scaling_data(df, f_pred):
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # Columns to scale for X and y
    columns_to_scale_X = [f'bacia_prec_sum_shift_f={f_pred}']
    columns_to_scale_y = ['vazao_itaipu']

    # Fit scalers on the selected columns and transform
    scaled_data_X = scaler_X.fit_transform(df[columns_to_scale_X])
    scaled_data_y = scaler_y.fit_transform(df[columns_to_scale_y])

    # Create DataFrame with scaled data
    scaled_X = pd.DataFrame(scaled_data_X, columns=columns_to_scale_X)
    scaled_y = pd.DataFrame(scaled_data_y, columns=columns_to_scale_y)

    # Concatenate scaled columns to the original DataFrame
    new_df = pd.concat([df.time, scaled_X, scaled_y], axis=1)

    return new_df, scaler_y

In [37]:
df_ = dataset_constructor(df.copy(), f)

new_df, scaler_y = scaling_data(df_, f)

# Prepare X and y data and apply train_test_split
X, y = split_sequences(new_df.values, n, f)

X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

In [38]:
y_test_rnn = scaler_y.inverse_transform(y_test_[:,1].reshape(-1, 1))
y_test_rnn.flatten()

array([283389.62, 154792.  , 338344.25, 628937.32, 138688.  , 161897.  ,
       237035.  , 213983.  , 308413.  , 258605.  , 339541.  , 218337.  ,
       476448.25, 366615.  , 218477.49, 275931.  , 264498.88, 184082.51,
       305835.29, 204050.  , 345457.  , 230149.  , 158441.  , 194126.  ,
       123459.  , 468774.28, 273475.  , 254487.  , 533310.  , 331902.  ,
       228656.29, 414705.  , 489570.  , 147157.  , 208935.  , 371167.  ,
       539437.  , 246194.  , 359743.  , 266482.  , 314912.36, 236466.  ,
       138822.02, 279982.92, 116841.69, 305608.  , 361193.  , 434023.19,
       484424.  , 566502.  ])

In [39]:
y_train_rnn = scaler_y.inverse_transform(y_train_[:,1].reshape(-1, 1))
y_train_rnn.flatten()

array([196265.99, 346448.  , 397771.  , 354116.6 , 170665.  , 390421.85,
       170897.  , 271728.  , 795036.  , 348775.  , 675757.22, 364509.  ,
       387494.6 , 436733.  , 454792.83, 387207.  , 545551.  , 453604.58,
       151270.6 , 214026.  , 395721.36, 507905.  , 426478.  , 212995.  ,
       187216.  , 218043.  , 176525.  , 257165.  , 412263.  , 455530.  ,
       309303.  , 220310.84, 265053.49, 282697.  , 226452.  , 598720.  ,
       607613.78, 547578.  , 356338.67, 266748.  , 326354.  , 209375.76,
       441899.76, 252466.96, 304441.  , 221146.81, 247254.39, 334233.65,
       340514.  , 246920.  , 180028.  , 481222.12, 335007.  , 421548.47,
       196825.1 , 457776.  , 307391.  , 229624.  , 306871.  , 138291.  ,
       362113.13, 226314.19, 380658.  , 507990.  , 194329.  , 301142.  ,
       264011.  , 426968.38, 160122.  , 412779.  , 212426.  , 192966.  ,
       304510.  , 686481.88, 184438.  , 834043.  , 278731.  , 195960.38,
       333597.  , 244671.08, 331568.86, 527782.  , 

In [40]:
y_plot_rnn = []

for n in range(1,8+1):
    for f in range(1,8+1):
        df_ = dataset_constructor(df.copy(), f)

        new_df, scaler_y = scaling_data(df_, f)

        # Prepare X and y data and apply train_test_split
        X, y = split_sequences(new_df.values, n, f)

        X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

        y_test_rnn = scaler_y.inverse_transform(y_test_[:,1].reshape(-1, 1)).flatten()
        y_train_rnn = scaler_y.inverse_transform(y_train_[:,1].reshape(-1, 1)).flatten()

        y_plot_rnn.append({'n':n, 
                           'f':f, 
                           'y_test_rnn':y_test_rnn, 
                           'y_train_rnn':y_train_rnn,
                           'f_plot_test_rnn': [f'F={f}' for _ in range(len(y_test_rnn))],
                           'f_plot_train_rnn': [f'F={f}' for _ in range(len(y_train_rnn))],
                        })

In [41]:
# Organizing data for Subplot 1

x_train_rnn_1 = []
y_train_rnn_1 = []

x_test_rnn_1 = []
y_test_rnn_1 = []

for f in range(0,4):

    x_train_rnn_1.extend(y_plot_rnn[f]['f_plot_train_rnn'])
    y_train_rnn_1.extend(y_plot_rnn[f]['y_train_rnn'])
    
    x_test_rnn_1.extend(y_plot_rnn[f]['f_plot_test_rnn'])
    y_test_rnn_1.extend(y_plot_rnn[f]['y_test_rnn'])


# Organizing data for Subplot 2

x_train_rnn_2 = []
y_train_rnn_2 = []

x_test_rnn_2 = []
y_test_rnn_2 = []

for f in range(4,8):

    x_train_rnn_2.extend(y_plot_rnn[f]['f_plot_train_rnn'])
    y_train_rnn_2.extend(y_plot_rnn[f]['y_train_rnn'])

    x_test_rnn_2.extend(y_plot_rnn[f]['f_plot_test_rnn'])
    y_test_rnn_2.extend(y_plot_rnn[f]['y_test_rnn'])

### Algoritmos Tradicionais

In [42]:
def dataset_constructor(df, n, f):
    for i in range(1, n): 
        df[f'bacia_prec_sum (time - {i})'] = df['bacia_prec_sum'].shift(i)
        df[f'vazao_itaipu (time - {i})'] = df['vazao_itaipu'].shift(i)

    df['bacia_prec_sum (time)'] = df['bacia_prec_sum']
    df['vazao_itaipu (time)'] = df['vazao_itaipu']

    for i in range(1,f+1):
        df[f'bacia_prec_sum (time + {i})'] = df['bacia_prec_sum'].shift(-i)
        
    df[f'vazao_itaipu (time + {f})'] = df['vazao_itaipu'].shift(-f)

    df = df.drop(columns=['bacia_prec_sum','vazao_itaipu'])
    df = df.dropna()
    
    return df

In [43]:
def scaling_data(df):
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # Columns to scale for X and y
    columns_to_scale_X = df.columns[:-1]
    columns_to_scale_y = df.columns[-1]

    # Fit scalers on the selected columns and transform
    scaled_data_X = scaler_X.fit_transform(df[columns_to_scale_X])
    scaled_data_y = scaler_y.fit_transform(df[[columns_to_scale_y]])

    # Create DataFrame with scaled data
    scaled_X = pd.DataFrame(scaled_data_X, columns=columns_to_scale_X)
    scaled_y = pd.DataFrame(scaled_data_y, columns=[columns_to_scale_y])

    # Concatenate scaled columns to the original DataFrame
    new_df = pd.concat([pd.DataFrame(df.index), scaled_X, scaled_y], axis=1)
    new_df.set_index('time', inplace=True)

    return new_df, scaler_y

In [44]:
df_poc = dataset_constructor(df.copy(), n, f)

df_poc, scaler_y = scaling_data(df_poc)

# Prepare X and y data and apply train_test_split
X_data = df_poc.iloc[:,:-1].astype('float64')
y_data = df_poc.iloc[:,-1:].astype('float64')

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=RANDOM_SEED)

In [45]:
y_test_trd = scaler_y.inverse_transform(y_test)
y_test_trd.flatten()

array([585931.32, 114366.  , 333597.  , 365784.43, 334233.65, 395721.36,
       158441.  , 421548.47, 256894.  , 547578.  , 489570.  , 220229.  ,
       396141.24, 170665.  , 266482.  , 309303.  , 335007.  , 209768.  ,
       414699.45, 422279.  , 387494.6 , 532718.  , 327206.84, 359743.  ,
       209375.76, 731722.  , 559084.  , 507990.  , 184438.  , 254487.  ,
       265053.49, 271728.  , 280684.05, 307391.  , 288386.  , 356463.  ,
       301142.  , 218477.49, 498288.  , 453604.58, 346448.  , 447605.02,
       264011.  , 390421.85, 795036.  , 527782.  , 507905.  , 244671.08,
       653608.  ])

In [46]:
y_train_trd = scaler_y.inverse_transform(y_train)
y_train_trd.flatten()

array([180028.  , 365971.  , 236466.  , 235524.  , 300097.  , 288689.  ,
       199741.  , 310014.67, 220310.84, 218476.71, 282697.  , 230123.  ,
       338344.25, 278731.  , 333937.54, 514745.  , 208935.  , 169278.  ,
       340038.  , 485687.  , 397771.  , 426968.38, 472270.  , 544628.  ,
       288389.38, 260427.  , 226314.19, 455530.  , 308413.  , 414705.  ,
       449802.78, 194329.  , 345457.  , 339541.  , 192966.  , 675757.22,
       176525.  , 354116.6 , 566502.  , 315822.  , 361193.  , 195960.38,
       367064.2 , 331568.86, 376219.  , 413544.  , 230034.  , 279982.92,
       308561.2 , 686481.88, 350907.  , 253268.46, 348114.  , 470858.  ,
       184082.51, 506478.  , 364509.  , 116841.69, 226452.  , 212426.  ,
       260072.  , 338221.  , 527851.  , 401072.23, 259414.  , 190672.  ,
       484424.  , 387207.  , 374419.  , 242644.  , 352391.  , 149065.  ,
       242014.  , 326354.  , 160122.  , 847468.56, 499474.62, 138822.02,
       278007.72, 273475.  , 216039.  , 603778.  , 

In [47]:
y_plot_trd = []

for n in range(1,8+1):
    for f in range(1,8+1):
        df_poc = dataset_constructor(df.copy(), n, f)

        df_poc, scaler_y = scaling_data(df_poc)

        # Prepare X and y data and apply train_test_split
        X_data = df_poc.iloc[:,:-1].astype('float64')
        y_data = df_poc.iloc[:,-1:].astype('float64')

        X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=RANDOM_SEED)

        y_test_trd = scaler_y.inverse_transform(y_test).flatten()
        y_train_trd = scaler_y.inverse_transform(y_train).flatten()

        y_plot_trd.append({'n':n, 
                    'f':f, 
                    'y_test_trd':y_test_trd, 
                    'y_train_trd':y_train_trd,
                    'f_plot_test_trd': [f'F={f}' for _ in range(len(y_test_trd))],
                    'f_plot_train_trd': [f'F={f}' for _ in range(len(y_train_trd))],
                })

In [48]:
# Organizing data for Subplot 1

x_train_trd_1 = []
y_train_trd_1 = []

x_test_trd_1 = []
y_test_trd_1 = []

for f in range(0,4):

    x_train_trd_1.extend(y_plot_trd[f]['f_plot_train_trd'])
    y_train_trd_1.extend(y_plot_trd[f]['y_train_trd'])
    
    x_test_trd_1.extend(y_plot_trd[f]['f_plot_test_trd'])
    y_test_trd_1.extend(y_plot_trd[f]['y_test_trd'])


# Organizing data for Subplot 2

x_train_trd_2 = []
y_train_trd_2 = []

x_test_trd_2 = []
y_test_trd_2 = []

for f in range(4,8):

    x_train_trd_2.extend(y_plot_trd[f]['f_plot_train_trd'])
    y_train_trd_2.extend(y_plot_trd[f]['y_train_trd'])

    x_test_trd_2.extend(y_plot_trd[f]['f_plot_test_trd'])
    y_test_trd_2.extend(y_plot_trd[f]['y_test_trd'])

In [107]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

fig = make_subplots(rows=2, cols=1)

# Subplot 1

color1 = '#11992a'
color2 = '#005c11'
color3 = '#FF4136'
color4 = '#ba1600'

fig.add_trace(go.Box(
    y=y_train_trd_1,
    x=x_train_trd_1,
    name='TRAD_TREINO',
    marker_color=color1
), row=1, col=1)
fig.add_trace(go.Box(
    y=y_test_trd_1,
    x=x_test_trd_1,
    name='TRAD_TESTE',
    marker_color=color2
), row=1, col=1)
fig.add_trace(go.Box(
    y=y_train_rnn_1,
    x=x_train_rnn_1,
    name='RNN_TREINO',
    marker_color=color3
), row=1, col=1)
fig.add_trace(go.Box(
    y=y_test_rnn_1,
    x=x_test_rnn_1,
    name='RNN_TESTE',
    marker_color=color4
), row=1, col=1)

# Subplot 2

fig.add_trace(go.Box(
    y=y_train_trd_2,
    x=x_train_trd_2,
    name='TRAD_TREINO',
    marker_color=color1,
    showlegend=False
), row=2, col=1)
fig.add_trace(go.Box(
    y=y_test_trd_2,
    x=x_test_trd_2,
    name='TRAD_TESTE',
    marker_color=color2,
    showlegend=False
), row=2, col=1)
fig.add_trace(go.Box(
    y=y_train_rnn_2,
    x=x_train_rnn_2,
    name='RNN_TREINO',
    marker_color=color3,
    showlegend=False
), row=2, col=1)
fig.add_trace(go.Box(
    y=y_test_rnn_2,
    x=x_test_rnn_2,
    name='RNN_TESTE',
    marker_color=color4,
    showlegend=False
), row=2, col=1)

# Set the same y-axis range for both subplots
# fig.update_yaxes(range=[0, 1.0], row=1, col=1)
# fig.update_yaxes(range=[0, 1.0], row=2, col=1)

fig.update_layout(
        title=("Boxplot da vazão mensal acumulada nos dados de treino e teste" 
               " para os Algoritmos Tradicionais (TRAD) e Redes Neurais (RNN) onde 'N'=1"),
        boxmode='group', # group together boxes of the different traces for each value of x
        # yaxis_title='Vazão Acumulada Mensal (m^3)/s',
        # how can i move yaxis_title down, to fit the center of the y axis?
        # yaxis=dict(title_standoff=20),
)

# Add annotation for y-axis title
fig.add_annotation(
    text='Vazão Acumulada Mensal (m^3)/s',
    xref='paper', 
    yref='paper',
    x=-0.05, y=0.5,  # Adjust x and y position to move the title away from the y-axis
    xanchor='center', 
    yanchor='middle',
    showarrow=False,
    textangle=-90  # Rotate the text to be vertical
)

# Add annotations
fig.add_annotation(text="'N' = Número de meses retroativos de vazão observada e precipitação na bacia, a serem utilizadas durante o treinamento",
                   xref="paper", yref="paper",
                   x=0.5, y=-0.18, showarrow=False)

# Add annotations
fig.add_annotation(text="'F' = Número do mês futuro a onde se prevê a vazão",
                   xref="paper", yref="paper",
                   x=0.5, y=-0.25, showarrow=False)

fig.show()

# ----- Testes/Rascunho ----- #

### Boxplots

In [20]:
import random

def generate_float_list(n):
    float_list = [random.random() for _ in range(n)]
    return float_list

# Example: generate a list with 5 values
generate_float_list(n)

import plotly.graph_objects as go

n = 24

x = ['day 1', 'day 1', 'day 1', 'day 1', 'day 1', 'day 1',
     'day 2', 'day 2', 'day 2', 'day 2', 'day 2', 'day 2',
     'day 3', 'day 3', 'day 3', 'day 3', 'day 3', 'day 3',
     'day 4', 'day 4', 'day 4', 'day 4', 'day 4', 'day 4']

fig = go.Figure()

fig.add_trace(go.Box(
    y=generate_float_list(n),
    x=x,
    name='kale',
    marker_color='#3D9970'
))
fig.add_trace(go.Box(
    y=generate_float_list(n),
    x=x,
    name='radishes',
    marker_color='#FF4136'
))
fig.add_trace(go.Box(
    y=generate_float_list(n),
    x=x,
    name='carrots',
    marker_color='#FF851B'
))
fig.add_trace(go.Box(
    y=generate_float_list(n),
    x=x,
    name='eggs',
    marker_color='#F6854D'
))

fig.update_layout(
    yaxis_title='normalized moisture',
    boxmode='group' # group together boxes of the different traces for each value of x
)
fig.show()