In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
import dtreeviz
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os



### Prepara o dataset ###

In [2]:
RANDOM_SEED=21

model = "linear_regression"
src_type = "regular"

dir_results = f"../../data/results/{src_type}"
dir_figures = f"{dir_results}/figures/{model}"

if not os.path.exists(dir_figures):
    os.makedirs(dir_figures)

path_datasets = "../../data/datasets"
dataset = "Itaipu_POC_VAZAO_V3.csv"

## Número de Semanas Operativas Retroativas a serem utilizadas no Treinamento dos Algoritmos
n = 10

## Número da Semana Operativa Futura da Vazão a ser prevista pelos Modelos. min(f)=1
f = 4

In [3]:
df = pd.read_csv(f'{path_datasets}/{dataset}', index_col='time')
df

Unnamed: 0_level_0,bacia_prec_sum,vazao_itaipu
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2001-01-05,48022.5000,103944.00
2001-01-12,94990.3125,96628.00
2001-01-19,56717.3750,113552.00
2001-01-26,50552.3750,84168.00
2001-02-02,55272.6250,81859.00
...,...,...
2020-12-04,77801.3750,39122.11
2020-12-11,44634.2500,37020.23
2020-12-18,98379.9375,46404.99
2020-12-25,19194.4375,63216.66


In [4]:
def dataset_constructor(df, n, f):
    for i in range(1, n): 
        df[f'bacia_prec_sum (time - {i})'] = df['bacia_prec_sum'].shift(i)
        df[f'vazao_itaipu (time - {i})'] = df['vazao_itaipu'].shift(i)

    df['bacia_prec_sum (time)'] = df['bacia_prec_sum']
    df['vazao_itaipu (time)'] = df['vazao_itaipu']

    for i in range(1,f+1):
        df[f'bacia_prec_sum (time + {i})'] = df['bacia_prec_sum'].shift(-i)
        
    df[f'vazao_itaipu (time + {f})'] = df['vazao_itaipu'].shift(-f)

    df = df.drop(columns=['bacia_prec_sum','vazao_itaipu'])
    df = df.dropna()
    
    return df

In [5]:
df = dataset_constructor(df, n, f)
df

Unnamed: 0_level_0,bacia_prec_sum (time - 1),vazao_itaipu (time - 1),bacia_prec_sum (time - 2),vazao_itaipu (time - 2),bacia_prec_sum (time - 3),vazao_itaipu (time - 3),bacia_prec_sum (time - 4),vazao_itaipu (time - 4),bacia_prec_sum (time - 5),vazao_itaipu (time - 5),...,vazao_itaipu (time - 8),bacia_prec_sum (time - 9),vazao_itaipu (time - 9),bacia_prec_sum (time),vazao_itaipu (time),bacia_prec_sum (time + 1),bacia_prec_sum (time + 2),bacia_prec_sum (time + 3),bacia_prec_sum (time + 4),vazao_itaipu (time + 4)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-03-09,28065.0000,116785.00,105377.8750,130217.00,120903.0000,96852.00,84943.3125,98723.00,55272.6250,81859.00,...,96628.00,48022.5000,103944.00,45799.4375,86866.00,27535.7500,70843.2500,47263.5000,22682.6250,83568.00
2001-03-16,45799.4375,86866.00,28065.0000,116785.00,105377.8750,130217.00,120903.0000,96852.00,84943.3125,98723.00,...,113552.00,94990.3125,96628.00,27535.7500,94284.00,70843.2500,47263.5000,22682.6250,54798.8125,82147.00
2001-03-23,27535.7500,94284.00,45799.4375,86866.00,28065.0000,116785.00,105377.8750,130217.00,120903.0000,96852.00,...,84168.00,56717.3750,113552.00,70843.2500,95711.00,47263.5000,22682.6250,54798.8125,1542.1250,65965.00
2001-03-30,70843.2500,95711.00,27535.7500,94284.00,45799.4375,86866.00,28065.0000,116785.00,105377.8750,130217.00,...,81859.00,50552.3750,84168.00,47263.5000,81772.00,22682.6250,54798.8125,1542.1250,69923.6250,59837.00
2001-04-06,47263.5000,81772.00,70843.2500,95711.00,27535.7500,94284.00,45799.4375,86866.00,28065.0000,116785.00,...,98723.00,55272.6250,81859.00,22682.6250,83568.00,54798.8125,1542.1250,69923.6250,6217.7500,62443.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-06,61228.6250,28659.38,3424.3125,25417.31,48993.9375,24034.60,9785.3750,26311.64,13917.2500,26098.60,...,36802.41,952.1250,47821.89,1779.8750,35336.95,20482.7500,40592.8750,1654.6875,77801.3750,39122.11
2020-11-13,1779.8750,35336.95,61228.6250,28659.38,3424.3125,25417.31,48993.9375,24034.60,9785.3750,26311.64,...,30527.57,18.6250,36802.41,20482.7500,33987.35,40592.8750,1654.6875,77801.3750,44634.2500,37020.23
2020-11-20,20482.7500,33987.35,1779.8750,35336.95,61228.6250,28659.38,3424.3125,25417.31,48993.9375,24034.60,...,27145.84,1540.1250,30527.57,40592.8750,31442.49,1654.6875,77801.3750,44634.2500,98379.9375,46404.99
2020-11-27,40592.8750,31442.49,20482.7500,33987.35,1779.8750,35336.95,61228.6250,28659.38,3424.3125,25417.31,...,26098.60,10959.2500,27145.84,1654.6875,37720.34,77801.3750,44634.2500,98379.9375,19194.4375,63216.66


In [6]:
y_original = df.iloc[:,-1:]
y_original

Unnamed: 0_level_0,vazao_itaipu (time + 4)
time,Unnamed: 1_level_1
2001-03-09,83568.00
2001-03-16,82147.00
2001-03-23,65965.00
2001-03-30,59837.00
2001-04-06,62443.00
...,...
2020-11-06,39122.11
2020-11-13,37020.23
2020-11-20,46404.99
2020-11-27,63216.66


In [7]:
def scaling_data(df):
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # Columns to scale for X and y
    columns_to_scale_X = df.columns[:-1]
    columns_to_scale_y = df.columns[-1]

    # Fit scalers on the selected columns and transform
    scaled_data_X = scaler_X.fit_transform(df[columns_to_scale_X])
    scaled_data_y = scaler_y.fit_transform(df[[columns_to_scale_y]])

    # Create DataFrame with scaled data
    scaled_X = pd.DataFrame(scaled_data_X, columns=columns_to_scale_X)
    scaled_y = pd.DataFrame(scaled_data_y, columns=[columns_to_scale_y])

    # Concatenate scaled columns to the original DataFrame
    new_df = pd.concat([pd.DataFrame(df.index), scaled_X, scaled_y], axis=1)
    new_df.set_index('time', inplace=True)

    return new_df, scaler_y

In [8]:
df, scaler_y = scaling_data(df)
df

Unnamed: 0_level_0,bacia_prec_sum (time - 1),vazao_itaipu (time - 1),bacia_prec_sum (time - 2),vazao_itaipu (time - 2),bacia_prec_sum (time - 3),vazao_itaipu (time - 3),bacia_prec_sum (time - 4),vazao_itaipu (time - 4),bacia_prec_sum (time - 5),vazao_itaipu (time - 5),...,vazao_itaipu (time - 8),bacia_prec_sum (time - 9),vazao_itaipu (time - 9),bacia_prec_sum (time),vazao_itaipu (time),bacia_prec_sum (time + 1),bacia_prec_sum (time + 2),bacia_prec_sum (time + 3),bacia_prec_sum (time + 4),vazao_itaipu (time + 4)
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2001-03-09,0.139365,0.452474,0.523289,0.518001,0.600385,0.355233,0.421814,0.364360,0.274474,0.282091,...,0.347571,0.238471,0.383624,0.227432,0.306517,0.136737,0.351796,0.234702,0.112637,0.290428
2001-03-16,0.227432,0.306517,0.139365,0.452474,0.523289,0.518001,0.600385,0.355233,0.421814,0.364360,...,0.430973,0.471706,0.347571,0.136737,0.342705,0.351796,0.234702,0.112637,0.272121,0.283496
2001-03-23,0.136737,0.342705,0.227432,0.306517,0.139365,0.452474,0.523289,0.518001,0.600385,0.355233,...,0.286168,0.281649,0.430973,0.351796,0.349667,0.234702,0.112637,0.272121,0.007657,0.204554
2001-03-30,0.351796,0.349667,0.136737,0.342705,0.227432,0.306517,0.139365,0.452474,0.523289,0.518001,...,0.274789,0.251034,0.286168,0.234702,0.281666,0.112637,0.272121,0.007657,0.347229,0.174659
2001-04-06,0.234702,0.281666,0.351796,0.349667,0.136737,0.342705,0.227432,0.306517,0.139365,0.452474,...,0.357895,0.274474,0.274789,0.112637,0.290428,0.272121,0.007657,0.347229,0.030875,0.187372
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-11-06,0.304051,0.022562,0.017003,0.006745,0.243295,0.000000,0.048591,0.011108,0.069110,0.010069,...,0.052749,0.004727,0.107053,0.008837,0.055137,0.101713,0.201577,0.008216,0.386349,0.073603
2020-11-13,0.008837,0.055137,0.304051,0.022562,0.017003,0.006745,0.243295,0.000000,0.048591,0.011108,...,0.021826,0.000091,0.052749,0.101713,0.048554,0.201577,0.008216,0.386349,0.221646,0.063349
2020-11-20,0.101713,0.048554,0.008837,0.055137,0.304051,0.022562,0.017003,0.006745,0.243295,0.000000,...,0.005161,0.007647,0.021826,0.201577,0.036139,0.008216,0.386349,0.221646,0.488539,0.109132
2020-11-27,0.201577,0.036139,0.101713,0.048554,0.008837,0.055137,0.304051,0.022562,0.017003,0.006745,...,0.000000,0.054421,0.005161,0.008216,0.066765,0.386349,0.221646,0.488539,0.095315,0.191146


In [9]:
df_poc = df.copy()

## Fazendo uma divisão treino/teste nos dados de vazão e aplicando normalização das colunas

In [10]:
X_data = df_poc.iloc[:,:-1].astype('float64')#.iloc[3:-1,:-1]
y_data = df_poc.iloc[:,-1:].astype('float64')#.iloc[3:-1,-1:]

X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.2, random_state=RANDOM_SEED)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((824, 24), (207, 24), (824, 1), (207, 1))

## Treinamento

In [11]:
lr = LinearRegression(
    fit_intercept=True
)

lr.fit(X_train, y_train)

## Retomamos a escala original dos dados

In [12]:
## _ stands for normalized data

y_pred_ = lr.predict(X_test)
y_pred = scaler_y.inverse_transform(y_pred_.reshape(-1,1))

y_test_ = y_test
y_test = scaler_y.inverse_transform(y_test)

In [13]:
y_test

array([[123873.1 ],
       [128547.82],
       [ 49942.  ],
       [ 49932.  ],
       [ 77632.  ],
       [ 40890.  ],
       [ 48517.  ],
       [ 66673.  ],
       [117294.76],
       [ 69628.  ],
       [ 40237.  ],
       [ 79509.42],
       [ 83064.  ],
       [118530.49],
       [127498.  ],
       [ 66938.  ],
       [ 95675.  ],
       [ 79304.28],
       [ 86597.44],
       [129489.  ],
       [ 80793.15],
       [100492.17],
       [ 44593.38],
       [ 73511.32],
       [101372.16],
       [ 99824.35],
       [ 62296.  ],
       [ 58490.  ],
       [106698.1 ],
       [ 57808.  ],
       [ 43566.  ],
       [ 67439.  ],
       [ 78629.  ],
       [ 62211.98],
       [ 46576.  ],
       [ 70021.  ],
       [ 69197.  ],
       [ 55207.02],
       [ 90053.62],
       [ 66064.  ],
       [ 88997.  ],
       [ 54047.13],
       [ 71112.  ],
       [161976.  ],
       [ 61244.  ],
       [ 64891.  ],
       [180868.  ],
       [ 54109.  ],
       [ 49400.  ],
       [ 77673.43],


In [14]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
corr = np.corrcoef(y_test.T, y_pred.T)[0, 1]

metrics_df = pd.DataFrame(
    columns=['MAE', 'MSE', 'RMSE', 'R2', 'Corr'],
    index=['Decision Tree']
)

metrics_df['MAE'] = mae
metrics_df['MSE'] = mse
metrics_df['RMSE'] = rmse
metrics_df['R2'] = r2
metrics_df['Corr'] = corr
metrics_df

Unnamed: 0,MAE,MSE,RMSE,R2,Corr
Decision Tree,12845.335634,300551200.0,17336.412704,0.684437,0.829425


In [15]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=X_data.index,
        y=y_original.values.ravel(), # vazão observada
        mode='lines',
        name='Vazão observada',
    )
)

fig.add_trace(
    go.Scatter(
        x=X_test.index,
        y=y_pred.ravel(), # vazão prevista
        mode='markers',
        name='Forecast',
    )
)

fig.update_layout(title=f'Predição - Itaipu')

fig.write_image(f"{dir_figures}/history_measured_vs_predicted_plot.png", width=1400, scale=1)

fig.show()

In [16]:
fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x=y_test.flatten(),
        y=y_pred.flatten(),
        mode='markers',
        marker=dict(color='blue', opacity=0.5, line=dict(color='black', width=1)),
        name='Measured vs Predicted'
    )
)


fig.add_trace(
    go.Scatter(
        x=[y_pred.min(), y_pred.max()],
        y=[y_pred.min(), y_pred.max()],
        mode='lines',
        line=dict(color='red', dash='dash'),
        name='Identity Line'
    )
)

fig.update_layout(
    title='Measured vs Predicted',
    xaxis=dict(title='y_true'),
    yaxis=dict(title='y_pred'),
    autosize=False,
    width=800,
    height=500,
    margin=dict(l=0, r=0, b=0, t=40),
    showlegend=True
)

fig.write_image(f"{dir_figures}/scattered_measured_vs_predicted_plot.png")

fig.show()
