In [None]:
import itertools
import os

import mlflow
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# --- MLflow tracking target and experiment used by every run in this notebook
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Itaipu_Benchmarking_Bacia_Incremental_V1_Corrigido")

# --- Run identification (logged as MLflow params / used in file names)
RANDOM_SEED=21
model_ = "gru"
src_type = "benchmark"
label = "itaipu"

# --- Output locations
dir_results = f"../../data/results/{src_type}"
pred_type = 'so_prev' # forecast for the following operative week

dir_rna = f'{dir_results}/rna'
# exist_ok=True creates the folder only when missing, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(dir_rna, exist_ok=True)

file_ann = f'{dir_rna}/{model_}_{pred_type}.h5' 
# Checkpoint written by ModelCheckpoint during training and reloaded afterwards.
best_file_ann = f'{dir_rna}/best_{model_}_{pred_type}.h5'

# --- Input data
path_datasets = "../../data/datasets"
dataset = "Itaipu_POC_VAZAO_V3.csv"

In [None]:
def dataset_constructor(df, f):
    """Build the modelling frame for a forecast horizon of ``f`` rows.

    The returned frame contains every column of ``df`` plus:

    * ``time`` - the original index values of ``df``;
    * ``bacia_prec_sum_shift_f={f}`` - ``bacia_prec_sum`` shifted ``f`` rows
      upwards, so each row carries the precipitation observed ``f`` rows later.

    Rows containing any NaN (including the last ``f`` rows created by the
    shift) are dropped and the result is re-indexed 0..n-1, so downstream code
    that rebuilds frames positionally stays aligned with it.

    The input frame is left untouched; all work happens on a copy.

    :param df: frame with a ``bacia_prec_sum`` column.
    :param f: number of rows to shift the precipitation column by.
    :return: a new DataFrame with a contiguous RangeIndex.
    """
    # Work on a copy so the caller's frame is never modified in place.
    frame = df.copy()
    frame['time'] = frame.index
    frame = frame.reset_index(drop=True)
    frame[f'bacia_prec_sum_shift_f={f}'] = frame['bacia_prec_sum'].shift(-f)

    # dropna() leaves holes in the index; reset it so positions == labels.
    return frame.dropna().reset_index(drop=True)

In [None]:
def split_sequences(sequences, n_steps, f_pred):
    """Cut a 2-D array into sliding input windows and their targets.

    Window ``k`` is ``sequences[k:k + n_steps, :]`` (all columns). Its target
    is columns 0 and 2 of the row located ``f_pred - 1`` rows after the end of
    the window, i.e. row ``k + n_steps + f_pred - 1``.

    :param sequences: 2-D array with at least three columns.
    :param n_steps: number of consecutive rows in each input window.
    :param f_pred: forecast horizon in rows (1 = row right after the window).
    :return: ``(X, y)`` as NumPy arrays; both are empty 1-D arrays when no
        complete window/target pair fits in ``sequences``.
    """
    total_rows = len(sequences)
    lead = f_pred - 1
    # A start is valid while its target row still lies inside the array.
    n_windows = min(total_rows, total_rows - n_steps - lead)
    starts = range(n_windows)

    windows = [sequences[start:start + n_steps, :] for start in starts]
    targets = [sequences[start + n_steps + lead, [0, 2]] for start in starts]

    return np.array(windows), np.array(targets)

In [None]:
def scaling_data(df, f_pred):
    """Min-max scale the model input and target columns.

    The shifted precipitation column ``bacia_prec_sum_shift_f={f_pred}`` and
    the target column ``vazao_itaipu`` are each scaled with their own
    ``MinMaxScaler``.

    :param df: frame holding ``time``, the shifted precipitation column and
        ``vazao_itaipu``.
    :param f_pred: horizon used to name the shifted precipitation column.
    :return: ``(new_df, scaler_y)`` where ``new_df`` has the columns
        ``time``, scaled precipitation and scaled flow (in that order) and
        ``scaler_y`` is the fitted target scaler, needed to invert predictions.

    NOTE(review): both scalers are fitted on every row of ``df``; the caller in
    this notebook splits into train/test only afterwards, so test rows
    influence the scaling range - confirm this is intended.
    """
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # Columns to scale for X and y
    columns_to_scale_X = [f'bacia_prec_sum_shift_f={f_pred}']
    columns_to_scale_y = ['vazao_itaipu']

    # Fit scalers on the selected columns and transform
    scaled_data_X = scaler_X.fit_transform(df[columns_to_scale_X])
    scaled_data_y = scaler_y.fit_transform(df[columns_to_scale_y])

    # Rebuild frames on df's own index. Without it the scaled frames would get
    # a fresh 0..n-1 index and pd.concat(axis=1) - which aligns on labels -
    # would pair values with the wrong rows whenever df's index has gaps.
    scaled_X = pd.DataFrame(scaled_data_X, columns=columns_to_scale_X, index=df.index)
    scaled_y = pd.DataFrame(scaled_data_y, columns=columns_to_scale_y, index=df.index)

    # Column order matters: downstream code addresses columns by position.
    new_df = pd.concat([df['time'], scaled_X, scaled_y], axis=1)

    return new_df, scaler_y

In [None]:
def kge(evaluation, simulations):
    """Kling-Gupta Efficiency (original 2009 formulation) and its components.

    Reference: Gupta et al., 2009, https://doi.org/10.1016/j.jhydrol.2009.08.003

    With *e* the evaluation series and *s* the simulation series:

    * r     = cov(e, s) / (sigma(e) * sigma(s))   (Pearson correlation)
    * alpha = sigma(s) / sigma(e)                 (variability ratio)
    * beta  = mu(s) / mu(e)                       (bias ratio)
    * KGE   = 1 - sqrt((r - 1)^2 + (alpha - 1)^2 + (beta - 1)^2)

    Statistics of ``simulations`` are reduced along axis 0, statistics of
    ``evaluation`` over all of its elements.

    :return: the four values ``(KGE, r, alpha, beta)``, in this order.
    """
    # Deviations from the respective means, reused for the correlation terms.
    sim_dev = simulations - np.mean(simulations, axis=0, dtype=np.float64)
    obs_dev = evaluation - np.mean(evaluation, dtype=np.float64)

    # r: error in timing and dynamics (Pearson's correlation coefficient).
    cross_sum = np.sum(sim_dev * obs_dev, axis=0, dtype=np.float64)
    sim_sq_sum = np.sum(sim_dev ** 2, axis=0, dtype=np.float64)
    obs_sq_sum = np.sum(obs_dev ** 2, dtype=np.float64)
    r = cross_sum / np.sqrt(sim_sq_sum * obs_sq_sum)

    # alpha: error in the spread of the flow.
    alpha = np.std(simulations, axis=0) / np.std(evaluation, dtype=np.float64)

    # beta: error in volume (bias of the mean discharge).
    sim_total = np.sum(simulations, axis=0, dtype=np.float64)
    obs_total = np.sum(evaluation, dtype=np.float64)
    beta = sim_total / obs_total

    # Euclidean distance from the ideal point (1, 1, 1).
    distance = np.sqrt((r - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2)

    return 1 - distance, r, alpha, beta

In [None]:
def evaluation_metrics(y_true, y_pred):
    """Compute the regression scores reported for a run.

    :param y_true: observed values (NumPy array; ``ravel`` is called on it).
    :param y_pred: predicted values (NumPy array; ``ravel`` is called on it).
    :return: tuple ``(rmse, mae, r2, corr, kge, kge_r, kge_alpha, kge_beta)``
        where ``corr`` is the Pearson correlation of the flattened inputs and
        the last four entries are the values returned by ``kge``.
    """
    root_mse = np.sqrt(mean_squared_error(y_true, y_pred))
    abs_error = mean_absolute_error(y_true, y_pred)
    determination = r2_score(y_true, y_pred)

    # Off-diagonal entry of the 2x2 correlation matrix of the flattened series.
    pearson = np.corrcoef(y_true.ravel(), y_pred.ravel())[0, 1]

    # kge() yields (KGE, r, alpha, beta); append them in that order.
    return (root_mse, abs_error, determination, pearson, *kge(y_true, y_pred))

In [None]:
# n_neurons_hl = [128, 123, 246, 125]
# for idx, n_neurons in enumerate(n_neurons_hl):
#     print(idx, n_neurons) if n_neurons % 2 != 0 else None
#     return_sequences=True if idx != len(n_neurons_hl)-1 else False
#     print(return_sequences) 

In [None]:
# For n_neurons_hl = [n_neurons_hidden_layer_1, ... , n_neurons_hidden_layer_N],
# every GRU layer uses return_sequences=True except the N-th (last) one.

def build_model(X_train_, n_neurons_hl, activation):
    """Assemble a stacked-GRU regressor ending in a single-unit Dense layer.

    :param X_train_: array used only for its shape; every dimension after the
        first becomes the ``input_shape`` of the first GRU layer.
    :param n_neurons_hl: units per GRU layer, one entry per hidden layer
        (must contain at least one entry).
    :param activation: activation passed to every GRU layer.
    :return: an uncompiled ``tf.keras.Sequential`` model.
    """
    first_units = n_neurons_hl[0]
    deeper_units = n_neurons_hl[1:]

    # The first layer declares the input shape and forwards full sequences
    # only when another GRU layer follows it.
    stack = [
        tf.keras.layers.GRU(
            units=first_units,
            activation=activation,
            input_shape=[*X_train_.shape[1:]],
            return_sequences=len(n_neurons_hl) > 1,
        )
    ]

    # Remaining layers: only the last one collapses the sequence.
    last_position = len(deeper_units) - 1
    for position, units in enumerate(deeper_units):
        stack.append(
            tf.keras.layers.GRU(
                units=units,
                activation=activation,
                return_sequences=position != last_position,
            )
        )

    stack.append(tf.keras.layers.Dense(1))

    return tf.keras.Sequential(stack)

In [None]:
def train_model(model, patience, X_train_, y_train_, max_epochs, monitor_metric):
    """Compile and fit ``model``, then return the best checkpointed model.

    The model is compiled with MAE loss, the Adam optimizer and an MAE metric,
    and fitted with ``validation_split=0.2``. Two callbacks are attached:

    * ``EarlyStopping`` on ``val_loss`` with the given ``patience`` and
      ``restore_best_weights=True``;
    * ``ModelCheckpoint`` writing to the module-level ``best_file_ann`` path
      whenever ``monitor_metric`` improves.

    :param model: uncompiled Keras model.
    :param patience: early-stopping patience, in epochs.
    :param X_train_: training inputs.
    :param y_train_: training targets.
    :param max_epochs: upper bound on the number of epochs.
    :param monitor_metric: name of the logged quantity ModelCheckpoint tracks.
    :return: the model reloaded from ``best_file_ann``.
    :raises FileNotFoundError: if training finished without writing a
        checkpoint (e.g. ``monitor_metric`` was never logged or never improved).
    """
    model.compile(loss=tf.losses.MeanAbsoluteError(),
                    optimizer=tf.optimizers.Adam(),
                    metrics=[tf.metrics.MeanAbsoluteError()])

    # All runs share one checkpoint path. Remove any leftover file so that a
    # run which never checkpoints cannot silently reload a previous run's model.
    if os.path.exists(best_file_ann):
        os.remove(best_file_ann)

    callbacks = [
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss', 
            patience=patience, 
            mode='min',
            restore_best_weights=True
        ),
        tf.keras.callbacks.ModelCheckpoint(
            filepath=best_file_ann, 
            monitor=monitor_metric,
            verbose=0, # True  
            save_best_only=True
        )  
    ]

    model.fit(
        X_train_,
        y_train_,
        epochs=max_epochs,
        verbose=0, #True
        validation_split=0.2,
        callbacks=callbacks,
    )

    if not os.path.exists(best_file_ann):
        raise FileNotFoundError(
            f"No checkpoint was written to '{best_file_ann}'; "
            f"check that '{monitor_metric}' is logged and finite during training."
        )

    # model.save(file_ann) # would save the final model of the training
    # Use the best model seen during training rather than the last epoch.
    return tf.keras.models.load_model(best_file_ann)

In [None]:
def mlflow_run(n, f, run_name, model_params, train_params, df):
    """Train and evaluate one model configuration inside a single MLflow run.

    Steps: log parameters, build the shifted/scaled dataset, cut it into
    sliding windows, split into train/test, build and train the network,
    predict on the test windows, invert the target scaling and log the
    evaluation metrics.

    :param n: number of past rows in each input window (logged as ``n_so_retro``).
    :param f: forecast horizon in rows (logged as ``f_so_pred``).
    :param run_name: MLflow run name; also stored as the ``run_name`` tag.
    :param model_params: dict with ``n_neurons_hl`` and ``activation``.
    :param train_params: dict with ``patience``, ``max_epochs`` and
        ``monitor_metric``.
    :param df: raw frame; it is copied before being transformed.

    NOTE(review): windows are split with a shuffled ``train_test_split``, so
    overlapping windows can land on both sides of the split - confirm this is
    acceptable for the benchmark.
    """
    with mlflow.start_run(run_name=run_name):
        # Seed Python, NumPy and TensorFlow so that the logged "seed" also
        # governs weight initialisation, not only the train/test split.
        tf.keras.utils.set_random_seed(RANDOM_SEED)

        # Logging params
        mlflow.log_param("model", model_)
        mlflow.log_param("label", label)
        mlflow.log_param("n_so_retro", n)
        mlflow.log_param("f_so_pred", f)
        mlflow.log_param("seed", RANDOM_SEED)
        for key, value in model_params.items():
            mlflow.log_param(key, value)
        for key, value in train_params.items():
            mlflow.log_param(key, value)

        # Logging run_name as a tag (the sweep cells search on this tag)
        mlflow.set_tag("run_name", run_name)

        # Mount dataset 
        df_ = dataset_constructor(df.copy(), f)

        # new_df columns: [time, scaled shifted precipitation, scaled flow]
        new_df, scaler_y = scaling_data(df_, f)

        # Prepare X and y data and apply train_test_split.
        # X keeps all three columns; y holds [time, scaled flow].
        X, y = split_sequences(new_df.values, n, f)
        
        X_train_, X_test_, y_train_, y_test_ = train_test_split(X, y, test_size=0.2, random_state=RANDOM_SEED)

        # Mount model; [:, :, 1:] drops the time column from the inputs.
        model = build_model(X_train_[:,:,1:], 
                            model_params['n_neurons_hl'], 
                            model_params['activation'])

        model = train_model(model, 
                    train_params['patience'], 
                    X_train_[:,:,1:].astype('float32'), 
                    y_train_[:,1].astype('float32'), 
                    train_params['max_epochs'],
                    train_params['monitor_metric']) # returns the best model

        # Predict in scaled space, then map predictions and targets back to
        # the original units of the target column.
        y_pred_ = model.predict(X_test_[:,:,1:].astype('float32'))
        y_pred = scaler_y.inverse_transform(y_pred_)

        y_test = scaler_y.inverse_transform(y_test_[:,1].reshape(-1, 1))

        metric_names = ("rmse", "mae", "r2", "corr",
                        "kge", "kge_r", "kge_alpha", "kge_beta")
        metric_values = evaluation_metrics(y_test, y_pred)

        # signature = mlflow.models.signature.infer_signature(X_test, y_test_)
        # mlflow.sklearn.log_model(model, "sk_models", signature=signature)

        # kge() returns one-element arrays for column-vector inputs; convert
        # every metric to a plain Python float before logging it.
        for metric_name, metric_value in zip(metric_names, metric_values):
            mlflow.log_metric(metric_name, float(np.asarray(metric_value).item()))

### Single Run

In [None]:
## Number of past (retroactive) operative weeks used to train the algorithms. min(n)=1
n = 3

## Future operative week whose flow the models must predict. min(f)=1
f = 1

run_name = f'single_{model_}_n={n}_f={f}_'

# n_neurons_hl = [n_neurons_hidden_layer_1, ... , n_neurons_hidden_layer_N]
model_params = dict(
    n_neurons_hl=[50, 60],
    activation='relu',
)

train_params = dict(
    patience=10,
    max_epochs=500,
    monitor_metric='val_mean_absolute_error',
)

df = pd.read_csv(f'{path_datasets}/{dataset}', index_col='time')

In [None]:
# Execute one MLflow run with the n, f, run_name, model_params, train_params
# and df currently defined in the kernel.
mlflow_run(n, f, run_name, model_params, train_params, df)

In [None]:
#mlflow.search_runs(filter_string=f"tags.run_name='{run_name}'")#.iloc[0].status

### Multi Run

In [None]:
## Number of past (retroactive) operative weeks used to train the algorithms
n_range = range(1, 9)

## Future operative week whose flow the models must predict. min(f)=1
f_range = range(1, 9)

# These may vary between runs
model_params = dict()

n_neurons_hl1_range = (40, 50, 60)
n_neurons_hl2_range = (40, 50, 60)
n_neurons_hl3_range = (40, 50, 60)  # to be tested with 3 layers

activation_range = ('relu', 'sigmoid')

# These stay fixed for the whole sweep
train_params = dict(
    patience=15,
    max_epochs=500,
    monitor_metric='val_mean_absolute_error',  # used to save the best model
)

# Do not change the scaler to feature_range=(-1, 1) yet -> more suited to tanh (LSTM, GRU)

df = pd.read_csv(f'{path_datasets}/{dataset}', index_col='time')

* Para 1 hidden layer

In [None]:
n_hidden_layers = '1hl'

# Sweep every (window length, horizon, layer width, activation) combination.
# itertools.product iterates in the same order as the equivalent nested loops
# (right-most factor varies fastest).
for n, f, n_neurons_hl1, activation in itertools.product(
        n_range, f_range, n_neurons_hl1_range, activation_range):

    run_name = (
        f"{model_}_n={n}_f={f}_"
        f"{n_neurons_hl1}_"
        f"{activation}_"
        f"{n_hidden_layers}_"
    )

    # Look for earlier runs tagged with the same name; the first row is
    # treated as the most recent one.
    existing_runs = mlflow.search_runs(filter_string=f"tags.run_name='{run_name}'")
    if not existing_runs.empty:
        last_run = existing_runs.iloc[0]
        if last_run["status"] != "FAILED":
            print(f"Run '{run_name}' already exists. Skipping iteration.")
            continue
        print(f"Run '{run_name}' failed previously. Re-running.")

    # Fresh dict per run so no state leaks between iterations or cells.
    run_model_params = {
        'n_neurons_hl': [n_neurons_hl1],
        'activation': activation,
    }

    mlflow_run(n, f, run_name, run_model_params, train_params, df)

* Para 2 hidden layers

In [None]:
n_hidden_layers = '2hl'

# Sweep every (window length, horizon, width 1, width 2, activation)
# combination. itertools.product iterates in the same order as the equivalent
# nested loops (right-most factor varies fastest).
for n, f, n_neurons_hl1, n_neurons_hl2, activation in itertools.product(
        n_range, f_range, n_neurons_hl1_range, n_neurons_hl2_range,
        activation_range):

    run_name = (
        f"{model_}_n={n}_f={f}_"
        f"{n_neurons_hl1}_"
        f"{n_neurons_hl2}_"
        f"{activation}_"
        f"{n_hidden_layers}_"
    )

    # Look for earlier runs tagged with the same name; the first row is
    # treated as the most recent one.
    existing_runs = mlflow.search_runs(filter_string=f"tags.run_name='{run_name}'")
    if not existing_runs.empty:
        last_run = existing_runs.iloc[0]
        if last_run["status"] != "FAILED":
            print(f"Run '{run_name}' already exists. Skipping iteration.")
            continue
        print(f"Run '{run_name}' failed previously. Re-running.")

    # Fresh dict per run so no state leaks between iterations or cells.
    run_model_params = {
        'n_neurons_hl': [n_neurons_hl1, n_neurons_hl2],
        'activation': activation,
    }

    mlflow_run(n, f, run_name, run_model_params, train_params, df)

* Para 3 hidden layers

In [None]:
n_hidden_layers = '3hl'

# Sweep every (window length, horizon, width 1, width 2, width 3, activation)
# combination. itertools.product iterates in the same order as the equivalent
# nested loops (right-most factor varies fastest).
for n, f, n_neurons_hl1, n_neurons_hl2, n_neurons_hl3, activation in itertools.product(
        n_range, f_range, n_neurons_hl1_range, n_neurons_hl2_range,
        n_neurons_hl3_range, activation_range):

    run_name = (
        f"{model_}_n={n}_f={f}_"
        f"{n_neurons_hl1}_"
        f"{n_neurons_hl2}_"
        f"{n_neurons_hl3}_"
        f"{activation}_"
        f"{n_hidden_layers}_"
    )

    # Look for earlier runs tagged with the same name; the first row is
    # treated as the most recent one.
    existing_runs = mlflow.search_runs(filter_string=f"tags.run_name='{run_name}'")
    if not existing_runs.empty:
        last_run = existing_runs.iloc[0]
        if last_run["status"] != "FAILED":
            print(f"Run '{run_name}' already exists. Skipping iteration.")
            continue
        print(f"Run '{run_name}' failed previously. Re-running.")

    # Fresh dict per run so no state leaks between iterations or cells.
    run_model_params = {
        'n_neurons_hl': [n_neurons_hl1, n_neurons_hl2, n_neurons_hl3],
        'activation': activation,
    }

    mlflow_run(n, f, run_name, run_model_params, train_params, df)