In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import os
import mlflow
from itertools import product



In [2]:
# MLflow destination: tracking server URI and the experiment every run is filed under.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Itaipu_Benchmarking_Bacia_Incremental_V1_Corrigido")

# Seed logged with each run and passed to train_test_split.
RANDOM_SEED = 21
model_ = "linear_regression"
src_type = "benchmark"
label = "itaipu"

# Output locations, derived from the source type and model name.
dir_results = f"../../data/results/{src_type}"
dir_figures = f"{dir_results}/figures/{model_}"

# exist_ok=True makes the cell idempotent and avoids the check-then-create
# race of testing os.path.exists() before calling os.makedirs().
os.makedirs(dir_figures, exist_ok=True)

# Input dataset location.
path_datasets = "../../data/datasets"
dataset = "Itaipu_POC_VAZAO_V3.csv"

In [3]:
def dataset_constructor(df, n, f):
    """Build a supervised-learning table of lagged and lead columns.

    The input frame is left untouched; a new frame is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'bacia_prec_sum'`` and ``'vazao_itaipu'``.
        Any other columns are carried through unchanged.
    n : int
        Size of the look-back window. Lags ``1 .. n-1`` of both series are
        added, plus the current step, so ``n=1`` yields only the current step.
    f : int
        Forecast horizon. Leads ``1 .. f`` of ``'bacia_prec_sum'`` are added
        as inputs, and ``'vazao_itaipu'`` shifted by ``-f`` is appended as the
        LAST column (the column later code treats as the target).

    Returns
    -------
    pandas.DataFrame
        Frame without the two raw columns and without any row containing NaN
        (which removes the rows made incomplete by the shifts).
    """
    prec = df['bacia_prec_sum']
    flow = df['vazao_itaipu']

    # Collect the derived columns in the exact order they must appear.
    window = {}
    for i in range(1, n):
        window[f'bacia_prec_sum (time - {i})'] = prec.shift(i)
        window[f'vazao_itaipu (time - {i})'] = flow.shift(i)

    window['bacia_prec_sum (time)'] = prec
    window['vazao_itaipu (time)'] = flow

    for i in range(1, f + 1):
        window[f'bacia_prec_sum (time + {i})'] = prec.shift(-i)

    # Target goes last: downstream code selects it positionally.
    window[f'vazao_itaipu (time + {f})'] = flow.shift(-f)

    # assign() works on a copy, so the caller's frame is never mutated.
    return (
        df.assign(**window)
        .drop(columns=['bacia_prec_sum', 'vazao_itaipu'])
        .dropna()
    )

In [4]:
def scaling_data(df):
    """Min-max scale every column of ``df``, keeping its index.

    All columns except the last are scaled together by one ``MinMaxScaler``;
    the last column is scaled by a second, separate ``MinMaxScaler`` so that
    it can be inverse-transformed on its own later.

    Both scalers are fitted on every row of ``df`` as passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame whose last column is the target.

    Returns
    -------
    (pandas.DataFrame, MinMaxScaler)
        The scaled frame (same columns, same order, same index as ``df``)
        and the fitted scaler of the last column.
    """
    scaler_X = MinMaxScaler()
    scaler_y = MinMaxScaler()

    # Columns to scale for X and y
    columns_to_scale_X = df.columns[:-1]
    columns_to_scale_y = df.columns[-1]

    # Fit scalers on the selected columns and transform
    scaled_data_X = scaler_X.fit_transform(df[columns_to_scale_X])
    scaled_data_y = scaler_y.fit_transform(df[[columns_to_scale_y]])

    # Reuse the original index directly instead of round-tripping it through a
    # column called 'time': this works for any index name and does not depend
    # on positional alignment between separately built frames.
    new_df = pd.DataFrame(scaled_data_X, columns=columns_to_scale_X, index=df.index)
    new_df[columns_to_scale_y] = scaled_data_y[:, 0]

    return new_df, scaler_y

In [5]:
def kge(evaluation, simulations):
    """Kling-Gupta Efficiency (Gupta et al., 2009) with its three components.

    Reference: https://doi.org/10.1016/j.jhydrol.2009.08.003

    With *e* the evaluation (observed) series and *s* a simulated series:

    * ``r``     = cov(e, s) / (std(e) * std(s))   -- linear correlation
    * ``alpha`` = std(s) / std(e)                 -- variability ratio
    * ``beta``  = sum(s) / sum(e)                 -- bias (ratio of means)
    * ``KGE``   = 1 - sqrt((r - 1)^2 + (alpha - 1)^2 + (beta - 1)^2)

    Statistics of ``simulations`` are reduced along axis 0; statistics of
    ``evaluation`` are reduced over all of its elements.

    Returns the tuple ``(KGE, r, alpha, beta)``, in this order.
    """
    # Deviations from the mean for both series.
    sim_dev = simulations - np.mean(simulations, axis=0, dtype=np.float64)
    obs_dev = evaluation - np.mean(evaluation, dtype=np.float64)

    # Pearson correlation: cross-product over the root of the squared sums.
    cross = np.sum(sim_dev * obs_dev, axis=0, dtype=np.float64)
    sim_ss = np.sum(sim_dev ** 2, axis=0, dtype=np.float64)
    obs_ss = np.sum(obs_dev ** 2, dtype=np.float64)
    r = cross / np.sqrt(sim_ss * obs_ss)

    # Spread of the flow: ratio of standard deviations.
    alpha = np.std(simulations, axis=0) / np.std(evaluation, dtype=np.float64)

    # Volume bias: ratio of totals.
    sim_total = np.sum(simulations, axis=0, dtype=np.float64)
    obs_total = np.sum(evaluation, dtype=np.float64)
    beta = sim_total / obs_total

    # Euclidean distance from the ideal point (1, 1, 1).
    distance = np.sqrt((r - 1) ** 2 + (alpha - 1) ** 2 + (beta - 1) ** 2)

    return 1 - distance, r, alpha, beta

In [6]:
# def nse(evaluation, simulations):
#     """Nash-Sutcliffe Efficiency (NSE) as per `Nash and Sutcliffe, 1970
#     <https://doi.org/10.1016/0022-1694(70)90255-6>`_.

#     :Calculation Details:
#         .. math::
#            E_{\\text{NSE}} = 1 - \\frac{\\sum_{i=1}^{N}[e_{i}-s_{i}]^2}
#            {\\sum_{i=1}^{N}[e_{i}-\\mu(e)]^2}

#         where *N* is the length of the *simulations* and *evaluation*
#         periods, *e* is the *evaluation* series, *s* is (one of) the
#         *simulations* series, and *μ* is the arithmetic mean.

#     """
#     nse_ = 1 - (
#             np.sum((evaluation - simulations) ** 2, axis=0, dtype=np.float64)
#             / np.sum((evaluation - np.mean(evaluation)) ** 2, dtype=np.float64)
#     )

#     return nse_

In [7]:
# # Example values for evaluation and simulation
# evaluation = np.array([1, 2, 3, 4, 5, 6.2, 8])
# simulations = np.array([1.1, 2.2, 3.3, 4.4, 5.5, 7, 9])

# # Call the function and print the result
# print("Nash-Sutcliffe Efficiency:", nse(evaluation, simulations), r2_score(evaluation, simulations))

# # For the application of NSE in regression procedures (i.e. when the total sum of squares can be partitioned into error and 
# # regression components), the Nash–Sutcliffe efficiency is equivalent to the coefficient of determination (R2), thus ranging between 0 and 1.

In [8]:
def evaluation_metrics(y_true, y_pred):
    """Compute the regression and hydrological scores for one target series.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values of a single target. Column-shaped
        ``(n, 1)`` arrays, 1-D arrays, Series and single-column DataFrames are
        all accepted; they are flattened to 1-D before scoring.

    Returns
    -------
    tuple of float
        ``(rmse, mae, r2, corr, kge, kge_r, kge_alpha, kge_beta)``.
    """
    # Flatten to 1-D float arrays so that:
    #  * inputs without a .ravel() method (e.g. DataFrames) are accepted, and
    #  * kge() yields scalars instead of 1-element arrays, which would otherwise
    #    rely on implicit array-to-scalar conversion when logged as metrics.
    y_true = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred = np.asarray(y_pred, dtype=np.float64).ravel()

    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    corr = np.corrcoef(y_true, y_pred)[0, 1]
    kge_, kge_r, kge_alpha, kge_beta = kge(y_true, y_pred)

    scores = (rmse, mae, r2, corr, kge_, kge_r, kge_alpha, kge_beta)
    return tuple(float(score) for score in scores)

In [9]:
def mlflow_run(n, f, run_name, params, df):
    """Train, evaluate and log one LinearRegression configuration in MLflow.

    Steps, all inside a single MLflow run named ``run_name``:

    1. Log the run descriptors (model, label, ``n``, ``f``, seed) and every
       entry of ``params`` as MLflow params; tag the run with ``run_name``.
    2. Build the windowed table from a copy of ``df`` with
       ``dataset_constructor`` and scale it with ``scaling_data``.
    3. Split into train/test (``test_size=0.2``, ``random_state=RANDOM_SEED``),
       fit ``LinearRegression(**params)`` and predict the test set.
    4. Map predictions and test targets back through ``scaler_y`` and log the
       scores returned by ``evaluation_metrics``.

    Parameters
    ----------
    n : int
        Look-back window size passed to ``dataset_constructor``.
    f : int
        Forecast horizon passed to ``dataset_constructor``.
    run_name : str
        MLflow run name; also stored as the ``run_name`` tag.
    params : dict
        Keyword arguments for ``LinearRegression``.
    df : pandas.DataFrame
        Raw dataset; it is copied before being transformed.

    NOTE(review): the scalers are fitted before the train/test split, and
    ``train_test_split`` is called without ``shuffle=False``. For windowed
    time-series rows this may let test information reach training -- confirm
    this is the intended evaluation protocol.
    """
    with mlflow.start_run(run_name=run_name):
        # Fixed descriptors first, then the model hyper-parameters.
        descriptors = {
            "model": model_,
            "label": label,
            "n_so_retro": n,
            "f_so_pred": f,
            "seed": RANDOM_SEED,
        }
        for name, value in descriptors.items():
            mlflow.log_param(name, value)
        for name, value in params.items():
            mlflow.log_param(name, value)

        # The tag makes the run searchable by its name.
        mlflow.set_tag("run_name", run_name)

        windowed = dataset_constructor(df.copy(), n, f)
        scaled, scaler_y = scaling_data(windowed)

        # Last column is the target; everything before it is an input.
        features = scaled.iloc[:, :-1].astype('float64')
        target = scaled.iloc[:, -1:].astype('float64')

        X_train, X_test, y_train, y_test = train_test_split(
            features,
            target,
            test_size=0.2,
            random_state=RANDOM_SEED,
        )

        model = LinearRegression(**params)
        model.fit(X_train, y_train)

        # Scores are computed in the target's original scale.
        y_pred_scaled = model.predict(X_test)
        y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1))
        y_true = scaler_y.inverse_transform(y_test)

        rmse, mae, r2, corr, kge_, kge_r, kge_alpha, kge_beta = evaluation_metrics(y_true, y_pred)

        scores = {
            "rmse": rmse,
            "mae": mae,
            "r2": r2,
            "corr": corr,
            "kge": kge_,
            "kge_r": kge_r,
            "kge_alpha": kge_alpha,
            "kge_beta": kge_beta,
        }
        for name, value in scores.items():
            mlflow.log_metric(name, value)

### Single run

In [37]:
# Number of retroactive operative weeks used to train the algorithms. min(n)=1
n = 5

# Future operative week whose flow the models predict. min(f)=1
f = 2

# Keyword arguments handed to LinearRegression by mlflow_run.
params = dict(fit_intercept=True)

run_name = f"single_{model_}_n={n}_f={f}_fitIn={params['fit_intercept']}_"

# Raw dataset, indexed by its 'time' column.
df = pd.read_csv(f'{path_datasets}/{dataset}', index_col='time')

In [38]:
# Execute the single configuration defined in the previous cell.
mlflow_run(n=n, f=f, run_name=run_name, params=params, df=df)

### Multi run

In [10]:
# Numbers of retroactive operative weeks to sweep over when training the algorithms.
n_range = range(1, 9)

# Future operative weeks whose flow the models predict. min(f)=1
f_range = range(1, 9)

# Candidate values for each LinearRegression keyword argument.
params_grid = dict(fit_intercept=[True, False])

# Cartesian product of the candidate values, then one dict per combination.
params_combinations = list(product(*params_grid.values()))
params_list = [dict(zip(params_grid, values)) for values in params_combinations]

# Raw dataset, indexed by its 'time' column.
df = pd.read_csv(f'{path_datasets}/{dataset}', index_col='time')

In [11]:
# Sweep every (n, f, params) combination; product() visits them in the same
# order as three nested loops would.
for n, f, params in product(n_range, f_range, params_list):

    run_name = (
        f"{model_}_n={n}_f={f}_"
        f"fit_intercept={params['fit_intercept']}_"
    )

    # Look for earlier runs tagged with this exact name.
    existing_runs = mlflow.search_runs(filter_string=f"tags.run_name='{run_name}'")

    if not existing_runs.empty:
        # First row is taken as the most recent run; this relies on the
        # default ordering of search_runs -- TODO confirm.
        last_run = existing_runs.iloc[0]
        if last_run["status"] != "FAILED":
            print(f"Run '{run_name}' already exists. Skipping iteration.")
            continue
        print(f"Run '{run_name}' failed previously. Re-running.")

    mlflow_run(n, f, run_name, params, df)
