In [1]:
import pandas as pd
import numpy as np
import math
import plotly.express as px
import plotly.graph_objects as go
import tensorflow as tf
import random
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

random.seed(123)
np.random.seed(123)
tf.random.set_seed(123)

# Model training

In [2]:
# Select station to train the models
stationCode = 'CR12'
stationPath = './all data murcia/' + stationCode + '.csv'

In [3]:
def convertirComa(x):
    if type(x) == str:
        return x.replace(",", ".")
    else:
        return x
def leerEstacionDatos(path):
    estacionDatas = pd.read_csv(path, encoding='ISO-8859-1', sep=";")
    estacionDatas.columns = ['ESTACION', 'MUNICIPIO', 'PARAJE', 'HORAS', 'FECHA', 'ETO','TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED','VVMED', '-']
    estacionDatas = estacionDatas.drop(columns=['ESTACION', 'MUNICIPIO', 'PARAJE', 'HORAS', '-'])
    estacionDatas = estacionDatas.reset_index().drop(columns='index')
    estacionDatas['FECHA'] = pd.to_datetime(estacionDatas['FECHA'], format="%d/%m/%y")
    estacionDatas.index = estacionDatas['FECHA']
    estacionDatas.drop(columns='FECHA', inplace=True)
    estacionDatas.dropna(inplace=True)
    for i in estacionDatas.columns:
        estacionDatas[i] = pd.to_numeric(estacionDatas[i].apply(lambda x : convertirComa(x)))
    return estacionDatas
estacionDatas = leerEstacionDatos(stationPath)
estacionDatas

Unnamed: 0_level_0,ETO,TMAX,TMIN,HRMAX,HRMIN,RADMED,VVMED
FECHA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2010-01-01,1.91,8.88,3.15,80.30,50.12,125.33,5.95
2010-01-02,1.23,12.53,3.22,87.70,44.93,118.40,1.64
2010-01-03,0.99,11.38,0.65,90.40,52.09,84.07,1.34
2010-01-04,0.74,10.93,4.41,94.10,74.00,51.77,1.22
2010-01-05,1.38,11.95,5.89,84.10,57.20,93.29,2.60
...,...,...,...,...,...,...,...
2023-12-27,1.54,16.03,-3.11,69.03,20.29,121.79,1.25
2023-12-28,1.43,15.64,-0.03,65.53,22.35,81.92,1.07
2023-12-29,1.28,13.69,4.56,48.82,25.71,41.22,0.83
2023-12-30,1.49,16.54,1.22,59.17,18.05,118.04,1.11


In [4]:
import plotly.io as pio
pio.templates.default = "plotly"

# Graphic of each variable
fig = go.Figure()
for c in estacionDatas.columns:
    fig.add_trace(go.Scatter(x= estacionDatas.index, y=estacionDatas[c],
                        name=c, mode='lines'))
fig.show()

## Select dates for model training and validation

In [5]:
estacionDatas = estacionDatas[estacionDatas.index <= '2023-06-17']

# The scaler will be used later
StationScaler = StandardScaler()
# The output is not scaled
estacionDatas_scaled = estacionDatas.drop(columns='ETO')
estacionDatas_scaled = pd.DataFrame(StationScaler.fit_transform(estacionDatas_scaled), index=estacionDatas_scaled.index, columns=estacionDatas_scaled.columns)
estacionDatas_scaled

Unnamed: 0_level_0,TMAX,TMIN,HRMAX,HRMIN,RADMED,VVMED
FECHA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01,-1.354227,-0.712006,0.027038,0.758373,-0.840810,3.941095
2010-01-02,-0.895870,-0.700295,0.616886,0.464314,-0.917580,-0.216367
2010-01-03,-1.040284,-1.130288,0.832101,0.869991,-1.297883,-0.505750
2010-01-04,-1.096794,-0.501192,1.127025,2.111388,-1.655698,-0.621503
2010-01-05,-0.968705,-0.253570,0.329933,1.159518,-1.195745,0.709657
...,...,...,...,...,...,...
2023-06-13,0.568359,0.937696,1.213111,-0.199729,0.824526,-0.544334
2023-06-14,0.653751,0.880810,1.589339,-0.033151,1.208707,-0.602211
2023-06-15,0.984019,0.875791,1.294415,-0.195763,1.698902,-0.708317
2023-06-16,1.105829,0.964466,1.589339,-0.481323,1.537276,-0.534688


In [6]:
import plotly.io as pio
pio.templates.default = "simple_white"
corr = estacionDatas.corr(method='pearson')
mask = np.triu(np.ones_like(corr, dtype=bool))
corr = corr.mask(mask)
corr = round(corr,2)
corr.columns = ['ET0', 'Tmax', 'Tmin', 'HRmax', 'HRmin', 'Rs', 'U2']
corr.index = ['ET0', 'Tmax', 'Tmin', 'HRmax', 'HRmin', 'Rs', 'U2']
fig = px.imshow(corr, text_auto=True)

fig.update_coloraxes(showscale=False)
fig.update_layout(
        title={
            'text': '<b>Correlation of variables ('+stationCode+') </b>',
            'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'})
fig.show()

In [7]:
# Split train and test set
train = estacionDatas[estacionDatas.index < '2020-01-01']
test = estacionDatas[estacionDatas.index >= '2020-01-01']
print(train)
print(test)

             ETO   TMAX  TMIN  HRMAX  HRMIN  RADMED  VVMED
FECHA                                                     
2010-01-01  1.91   8.88  3.15  80.30  50.12  125.33   5.95
2010-01-02  1.23  12.53  3.22  87.70  44.93  118.40   1.64
2010-01-03  0.99  11.38  0.65  90.40  52.09   84.07   1.34
2010-01-04  0.74  10.93  4.41  94.10  74.00   51.77   1.22
2010-01-05  1.38  11.95  5.89  84.10  57.20   93.29   2.60
...          ...    ...   ...    ...    ...     ...    ...
2019-12-27  1.01  14.16  2.69  87.05  50.22  110.60   1.11
2019-12-28  1.25  16.83 -0.13  92.30  33.96  118.37   1.08
2019-12-29  0.70  10.39 -2.26  92.90  60.67  104.13   0.86
2019-12-30  0.71   9.81 -0.85  93.42  63.84   89.49   0.98
2019-12-31  0.75  10.41 -1.78  92.58  66.55  119.84   1.20

[3651 rows x 7 columns]
             ETO   TMAX   TMIN  HRMAX  HRMIN  RADMED  VVMED
FECHA                                                      
2020-01-01  1.30  15.20  -1.02  92.20  32.44  119.84   1.22
2020-01-02  0.91  10.71  -1.

## Create all possible combinations of input variables

In [9]:
from itertools import combinations

# List of strings
strings = ['T', 'HR', 'RADMED', 'VVMED']

# Get all unique combinations of the strings
all_combinations = set()

# Loop over different combination lengths
for r in range(1, len(strings) + 1):
    # Generate combinations of length r
    combinations_r = combinations(strings, r)
    # Add unique combinations to the set
    all_combinations.update(combinations_r)

# Convert the combinations to lists 
all = [list(combination) for combination in all_combinations]

all_combinations = []

# Print all unique combinations
for combination in all:
    comb = []
    for c in combination:
        if c == 'T':
            comb.append('TMAX')
            comb.append('TMIN')
        elif c == 'HR':
            comb.append('HRMAX')
            comb.append('HRMIN')
        else:
            comb.append(c)
    all_combinations.append(comb)
all_combinations

[['HRMAX', 'HRMIN', 'VVMED'],
 ['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED'],
 ['TMAX', 'TMIN', 'HRMAX', 'HRMIN'],
 ['RADMED'],
 ['RADMED', 'VVMED'],
 ['HRMAX', 'HRMIN', 'RADMED', 'VVMED'],
 ['TMAX', 'TMIN', 'VVMED'],
 ['HRMAX', 'HRMIN', 'RADMED'],
 ['TMAX', 'TMIN'],
 ['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED'],
 ['HRMAX', 'HRMIN'],
 ['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED'],
 ['VVMED'],
 ['TMAX', 'TMIN', 'RADMED', 'VVMED'],
 ['TMAX', 'TMIN', 'RADMED']]

## SVR

CI42 Results

In [251]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# define search space
param_vals = {'kernel': ['rbf'],
              'C': [0.01, 0.1, 1, 10, 50, 100, 1000],
              'gamma': [0.1, 1, 10, 20, 50],
              'epsilon': [0.01, 0.1, 1, 10] }
# define score metrics
scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

# lists to save the results
all_medidas = []
svrCV5results = []

# iteration for each input combination
for comb in all_combinations:
    print(comb)

    # scale the original values of each combination
    scaler = StandardScaler().fit(estacionDatas[comb])

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']
    
    # Fit and transform the training features
    X_train_scaled = scaler.transform(X_train)

    # Fit and transform the test features
    X_test_scaled = scaler.transform(test[comb])
    y_test = test['ETO']

    # define the search
    searchSVR = RandomizedSearchCV(estimator=SVR() , param_distributions=param_vals, 
                                    n_jobs=-1, cv=5, verbose=3, n_iter=60,
                                    scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchSVR.fit(X_train_scaled, y_train)

    # Make predictions from X_test datas
    y_pred = searchSVR.best_estimator_.predict(X_test_scaled) 

    # Save the best estimator
    svrCV5results.append(pd.DataFrame(searchSVR.cv_results_).iloc[searchSVR.best_index_])

    # Save each measurements (stastistical indicadors)
    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchSVR.best_estimator_))
    medidas.append(searchSVR.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas.append(medidas)

# Final dataframe results
all_medidas_svr = pd.DataFrame(all_medidas, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_svr

['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits


Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['HRMAX', 'HRMIN']","SVR(C=10, epsilon=0.01, gamma=0.1)",-1.250337,0.3132,1.323475,71.473502,1.660291
1,"['TMAX', 'TMIN']","SVR(C=1, epsilon=0.01, gamma=1)",-0.827697,0.764168,0.778981,41.52294,1.043562
2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']","SVR(C=100, gamma=0.1)",-0.173682,0.989347,0.153902,7.658692,0.212185
3,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']","SVR(C=10, gamma=0.1)",-0.420001,0.925318,0.409938,16.242705,0.535226
4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '...","SVR(C=10, epsilon=0.01, gamma=0.1)",-0.131295,0.990029,0.147293,7.38799,0.207794
5,['RADMED'],"SVR(C=50, gamma=1)",-0.532014,0.904779,0.545618,27.643795,0.696031
6,"['RADMED', 'VVMED']","SVR(C=1, epsilon=0.01, gamma=1)",-0.43779,0.922225,0.416871,16.576051,0.55495
7,['VVMED'],"SVR(C=1, epsilon=0.01, gamma=10)",-1.27718,0.310469,1.378539,48.293809,1.809674
8,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']","SVR(C=10, epsilon=0.01, gamma=0.1)",-0.244328,0.977038,0.261656,12.505836,0.340103
9,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']","SVR(C=10, gamma=0.1)",-0.619707,0.801367,0.680936,28.845473,0.887321


CA91 Results

In [259]:
# CA91
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# define search space
param_vals = {'kernel': ['rbf'],
              'C': [0.01, 0.1, 1, 10, 50, 100, 1000],
              'gamma': [0.1, 1, 10, 20, 50],
              'epsilon': [0.01, 0.1, 1, 10] }
# define score metrics
scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

# lists to save the results
all_medidas = []
svrCV5results = []

# iteration for each input combination
for comb in all_combinations:
    print(comb)

    # scale the original values of each combination
    scaler = StandardScaler().fit(estacionDatas[comb])

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']
    
    # Fit and transform the training features
    X_train_scaled = scaler.transform(X_train)

    # Fit and transform the test features
    X_test_scaled = scaler.transform(test[comb])
    y_test = test['ETO']

    # define the search
    searchSVR = RandomizedSearchCV(estimator=SVR() , param_distributions=param_vals, 
                                    n_jobs=-1, cv=5, verbose=3, n_iter=60,
                                    scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchSVR.fit(X_train_scaled, y_train)

    # Make predictions from X_test datas
    y_pred = searchSVR.best_estimator_.predict(X_test_scaled) 

    # Save the best estimator
    svrCV5results.append(pd.DataFrame(searchSVR.cv_results_).iloc[searchSVR.best_index_])

    # Save each measurements (stastistical indicadors)
    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchSVR.best_estimator_))
    medidas.append(searchSVR.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas.append(medidas)

# Final dataframe results
all_medidas_svr = pd.DataFrame(all_medidas, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_svr

['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits


Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['HRMAX', 'HRMIN']","SVR(C=1, epsilon=0.01, gamma=1)",-1.251751,0.279105,1.230477,65.320396,1.556762
1,"['TMAX', 'TMIN']","SVR(C=1, epsilon=0.01, gamma=1)",-0.77447,0.6971,0.761399,35.847912,1.010146
2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']","SVR(C=100, gamma=0.1)",-0.140454,0.98619,0.142012,6.940986,0.209494
3,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']","SVR(C=10, gamma=0.1)",-0.364565,0.922219,0.361222,14.687555,0.498086
4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '...","SVR(C=10, epsilon=0.01, gamma=0.1)",-0.131528,0.986053,0.141536,6.710362,0.212235
5,['RADMED'],"SVR(C=1, epsilon=0.01, gamma=10)",-0.434926,0.908044,0.419666,18.390873,0.547589
6,"['RADMED', 'VVMED']","SVR(C=100, gamma=0.1)",-0.386121,0.912988,0.393363,15.84828,0.53367
7,['VVMED'],"SVR(C=50, epsilon=1, gamma=0.1)",-1.362891,0.213738,1.302809,58.925979,1.556665
8,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']","SVR(C=100, gamma=0.1)",-0.237858,0.976046,0.205731,10.328981,0.277168
9,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']","SVR(C=100, gamma=0.1)",-0.665912,0.693301,0.762115,33.607499,1.030097


CR12 Results

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVR
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# define search space
param_vals = {'kernel': ['rbf'],
              'C': [0.01, 0.1, 1, 10, 50, 100, 1000],
              'gamma': [0.1, 1, 10, 20, 50],
              'epsilon': [0.01, 0.1, 1, 10] }
# define score metrics
scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

# lists to save the results
all_medidas = []
svrCV5results = []

# iteration for each input combination
for comb in all_combinations:
    print(comb)

    # scale the original values of each combination
    scaler = StandardScaler().fit(estacionDatas[comb])

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']
    
    # Fit and transform the training features
    X_train_scaled = scaler.transform(X_train)

    # Fit and transform the test features
    X_test_scaled = scaler.transform(test[comb])
    y_test = test['ETO']

    # define the search
    searchSVR = RandomizedSearchCV(estimator=SVR() , param_distributions=param_vals, 
                                    n_jobs=-1, cv=5, verbose=3, n_iter=60,
                                    scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchSVR.fit(X_train_scaled, y_train)

    # Make predictions from X_test datas
    y_pred = searchSVR.best_estimator_.predict(X_test_scaled) 

    # Save the best estimator
    svrCV5results.append(pd.DataFrame(searchSVR.cv_results_).iloc[searchSVR.best_index_])

    # Save each measurements (stastistical indicadors)
    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchSVR.best_estimator_))
    medidas.append(searchSVR.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas.append(medidas)

# Final dataframe results
all_medidas_svr = pd.DataFrame(all_medidas, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_svr

['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names



['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits


Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']","SVR(C=10, gamma=0.1)",-0.547157,0.82279,0.61473,24.277881,0.821724
1,"['TMAX', 'TMIN', 'VVMED']","SVR(C=10, epsilon=0.01, gamma=0.1)",-0.491244,0.855545,0.518819,19.380846,0.675516
2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']","SVR(C=100, gamma=0.1)",-0.166101,0.985171,0.152325,7.373019,0.21196
3,"['HRMAX', 'HRMIN', 'RADMED']","SVR(C=100, gamma=0.1)",-0.456988,0.897443,0.453605,18.585432,0.5989
4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']","SVR(C=10, gamma=0.1)",-0.249134,0.975032,0.261024,10.437008,0.34459
5,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '...","SVR(C=10, epsilon=0.01, gamma=0.1)",-0.119944,0.990043,0.121551,5.908216,0.172262
6,['RADMED'],"SVR(C=1, epsilon=0.01, gamma=1)",-0.53907,0.859302,0.513156,21.790871,0.673029
7,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']","SVR(C=10, gamma=0.1)",-0.421661,0.903222,0.403386,16.443159,0.552618
8,"['HRMAX', 'HRMIN', 'VVMED']","SVR(C=10, gamma=0.1)",-0.97098,0.460004,1.049757,46.516448,1.353655
9,"['TMAX', 'TMIN']","SVR(C=50, gamma=1)",-0.689593,0.792114,0.663082,29.220801,0.864812


In [101]:
svrCV5results = pd.DataFrame(svrCV5results)
svrCV5results['Combination'] = all_medidas_svr['combination'].values
svrCV5results = svrCV5results.reset_index().drop(columns=['index', 'params'])
svrCV5results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_gamma,param_epsilon,param_C,split0_test_neg_mean_absolute_percentage_error,split1_test_neg_mean_absolute_percentage_error,...,rank_test_r2,split0_test_neg_mean_squared_error,split1_test_neg_mean_squared_error,split2_test_neg_mean_squared_error,split3_test_neg_mean_squared_error,split4_test_neg_mean_squared_error,mean_test_neg_mean_squared_error,std_test_neg_mean_squared_error,rank_test_neg_mean_squared_error,Combination
0,0.458804,0.004047,0.225151,0.003502,rbf,0.1,0.01,10,-0.534457,-0.530338,...,8,-2.501525,-2.473889,-2.234046,-2.650956,-2.529292,-2.477942,0.136109,8,"['HRMAX', 'HRMIN']"
1,0.45308,0.00545,0.225136,0.003381,rbf,1.0,0.01,1,-0.300307,-0.311245,...,6,-0.753417,-1.21961,-1.170657,-1.194758,-1.201628,-1.108014,0.177991,5,"['TMAX', 'TMIN']"
2,0.815771,0.02235,0.231934,0.012533,rbf,0.1,0.01,10,-0.061712,-0.05927,...,2,-0.049608,-0.055059,-0.058083,-0.085698,-0.048504,-0.05939,0.013615,2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']"
3,0.545956,0.011065,0.194491,0.003055,rbf,0.1,0.1,10,-0.127956,-0.146932,...,1,-0.230425,-0.349762,-0.293292,-0.370991,-0.31957,-0.312808,0.048917,1,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']"
4,1.464843,0.040901,0.224545,0.005166,rbf,0.1,0.01,10,-0.052186,-0.046914,...,1,-0.031966,-0.029527,-0.033534,-0.043757,-0.040819,-0.035921,0.005434,1,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '..."
5,0.522405,0.015903,0.194665,0.002896,rbf,1.0,0.1,50,-0.175495,-0.186411,...,5,-0.342981,-0.435679,-0.456776,-0.579365,-0.52895,-0.46875,0.081135,5,['RADMED']
6,2.837434,0.066177,0.188568,0.005244,rbf,0.1,0.1,1000,-0.135227,-0.143098,...,7,-0.268155,-0.339307,-0.322409,-0.471998,-0.360856,-0.352545,0.067146,7,"['RADMED', 'VVMED']"
7,0.498149,0.017386,0.212383,0.004222,rbf,1.0,0.1,50,-0.67581,-0.527003,...,26,-3.033205,-2.435348,-2.732494,-2.570796,-2.774794,-2.709327,0.201978,26,['VVMED']
8,0.784973,0.019108,0.235741,0.004209,rbf,0.1,0.01,10,-0.089588,-0.092848,...,1,-0.076188,-0.09253,-0.119625,-0.134331,-0.124987,-0.109533,0.02171,1,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']"
9,0.500475,0.016205,0.207628,0.003841,rbf,0.1,0.1,10,-0.212993,-0.219571,...,1,-0.560288,-0.761059,-0.674865,-0.676996,-0.715144,-0.677671,0.06655,1,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']"


## Random Forest

CI42 Results

In [252]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error


# define search space
param_vals = {'max_depth': range(2, 200, 2),
              'max_features': [None, 'sqrt', 'log2'],
              'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
              'n_estimators': range(10, 2000, 10)}

scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

all_medidas_rf = []
rfCV5results = []
for comb in all_combinations:
    print(comb)

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']

    X_test = test[comb]
    y_test = test['ETO']

    # define the search
    searchRF = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=123),  param_distributions=param_vals, 
                             n_jobs=-1, cv=5, verbose=3, n_iter=60,
                             scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchRF.fit(X_train, y_train)

    rfCV5results.append(pd.DataFrame(searchRF.cv_results_).iloc[searchRF.best_index_])

    # Make predictions from X_test datas
    y_pred = searchRF.best_estimator_.predict(X_test) 

    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchRF.best_estimator_))
    medidas.append(searchRF.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas_rf.append(medidas)

all_medidas_rf = pd.DataFrame(all_medidas_rf, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_rf

['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X has feature names, but RandomForestRegressor was fitted without feature names



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X has feature names, but RandomForestRegressor was fitted without feature names



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits


Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['HRMAX', 'HRMIN']",RandomForestRegressor(criterion='friedman_mse'...,-1.275493,0.302972,1.344377,75.244546,1.643244
1,"['TMAX', 'TMIN']",RandomForestRegressor(criterion='friedman_mse'...,-0.858029,0.76529,0.813884,46.726153,1.018367
2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']",RandomForestRegressor(criterion='absolute_erro...,-0.198815,0.986411,0.179846,8.567782,0.235921
3,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']",RandomForestRegressor(criterion='friedman_mse'...,-0.431737,0.922606,0.41803,16.714964,0.542151
4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '...","RandomForestRegressor(criterion='poisson', max...",-0.179864,0.985973,0.17443,8.988709,0.228974
5,['RADMED'],RandomForestRegressor(criterion='friedman_mse'...,-0.541246,0.904717,0.548557,29.625433,0.688957
6,"['RADMED', 'VVMED']",RandomForestRegressor(criterion='friedman_mse'...,-0.462994,0.919598,0.428207,18.418371,0.548468
7,['VVMED'],RandomForestRegressor(criterion='friedman_mse'...,-1.311718,0.337136,1.315975,48.727346,1.692207
8,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']","RandomForestRegressor(criterion='poisson', max...",-0.268081,0.974243,0.299359,15.562442,0.376676
9,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']","RandomForestRegressor(max_depth=10, max_featur...",-0.643787,0.794853,0.704723,32.105724,0.908081


CA91 Results

In [260]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error


# define search space
param_vals = {'max_depth': range(2, 200, 2),
              'max_features': [None, 'sqrt', 'log2'],
              'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
              'n_estimators': range(10, 2000, 10)}

scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

all_medidas_rf = []
rfCV5results = []
for comb in all_combinations:
    print(comb)

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']

    X_test = test[comb]
    y_test = test['ETO']

    # define the search
    searchRF = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=123),  param_distributions=param_vals, 
                             n_jobs=-1, cv=5, verbose=3, n_iter=60,
                             scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchRF.fit(X_train, y_train)

    rfCV5results.append(pd.DataFrame(searchRF.cv_results_).iloc[searchRF.best_index_])

    # Make predictions from X_test datas
    y_pred = searchRF.best_estimator_.predict(X_test) 

    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchRF.best_estimator_))
    medidas.append(searchRF.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas_rf.append(medidas)

all_medidas_rf = pd.DataFrame(all_medidas_rf, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_rf

['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X has feature names, but RandomForestRegressor was fitted without feature names



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X has feature names, but RandomForestRegressor was fitted without feature names



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits


Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['HRMAX', 'HRMIN']",RandomForestRegressor(criterion='friedman_mse'...,-1.266077,0.266607,1.249823,64.394995,1.500577
1,"['TMAX', 'TMIN']",RandomForestRegressor(criterion='friedman_mse'...,-0.809637,0.709346,0.771652,38.065712,0.964626
2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']","RandomForestRegressor(criterion='poisson', max...",-0.161479,0.984406,0.153993,7.674905,0.216949
3,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']","RandomForestRegressor(max_depth=10, max_featur...",-0.386922,0.920794,0.368966,15.219851,0.490676
4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '...","RandomForestRegressor(criterion='poisson', max...",-0.16459,0.98435,0.157911,7.516557,0.217937
5,['RADMED'],RandomForestRegressor(criterion='friedman_mse'...,-0.444828,0.906957,0.431554,20.288506,0.539804
6,"['RADMED', 'VVMED']",RandomForestRegressor(criterion='friedman_mse'...,-0.41186,0.912227,0.402291,17.591459,0.520542
7,['VVMED'],RandomForestRegressor(criterion='friedman_mse'...,-1.364876,0.230216,1.281178,57.373703,1.538443
8,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']","RandomForestRegressor(criterion='poisson', max...",-0.251101,0.974882,0.224905,11.909464,0.292477
9,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']",RandomForestRegressor(criterion='friedman_mse'...,-0.688376,0.712564,0.752363,34.992866,0.954068


CR12 Results

In [10]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import  mean_absolute_error, mean_squared_error, mean_absolute_percentage_error


# define search space
param_vals = {'max_depth': range(2, 200, 2),
              'max_features': [None, 'sqrt', 'log2'],
              'criterion': ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
              'n_estimators': range(10, 2000, 10)}

scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

all_medidas_rf = []
rfCV5results = []
for comb in all_combinations:
    print(comb)

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']

    X_test = test[comb]
    y_test = test['ETO']

    # define the search
    searchRF = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=123),  param_distributions=param_vals, 
                             n_jobs=-1, cv=5, verbose=3, n_iter=60,
                             scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchRF.fit(X_train, y_train)

    rfCV5results.append(pd.DataFrame(searchRF.cv_results_).iloc[searchRF.best_index_])

    # Make predictions from X_test datas
    y_pred = searchRF.best_estimator_.predict(X_test) 

    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchRF.best_estimator_))
    medidas.append(searchRF.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas_rf.append(medidas)

all_medidas_rf = pd.DataFrame(all_medidas_rf, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_rf

['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X has feature names, but RandomForestRegressor was fitted without feature names



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X has feature names, but RandomForestRegressor was fitted without feature names



['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits


Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['HRMAX', 'HRMIN', 'VVMED']","RandomForestRegressor(max_depth=10, max_featur...",-1.008819,0.418197,1.051136,43.727054,1.329724
1,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']",RandomForestRegressor(criterion='absolute_erro...,-0.264459,0.969998,0.285009,11.76667,0.369934
2,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']","RandomForestRegressor(max_depth=10, max_featur...",-0.561688,0.822309,0.601344,24.482,0.789482
3,['RADMED'],RandomForestRegressor(criterion='friedman_mse'...,-0.550873,0.857724,0.530034,23.96466,0.679478
4,"['RADMED', 'VVMED']",RandomForestRegressor(criterion='friedman_mse'...,-0.527282,0.868991,0.486259,21.582376,0.634074
5,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']",RandomForestRegressor(criterion='friedman_mse'...,-0.442669,0.899769,0.418396,17.248959,0.560408
6,"['TMAX', 'TMIN', 'VVMED']","RandomForestRegressor(max_depth=10, max_featur...",-0.519749,0.844236,0.520879,20.033548,0.686917
7,"['HRMAX', 'HRMIN', 'RADMED']","RandomForestRegressor(max_depth=10, max_featur...",-0.474873,0.891137,0.469831,19.724831,0.613598
8,"['TMAX', 'TMIN']",RandomForestRegressor(criterion='friedman_mse'...,-0.720021,0.793329,0.671626,33.266468,0.833872
9,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']","RandomForestRegressor(max_depth=24, max_featur...",-0.471223,0.858298,0.488038,18.221125,0.649656


In [83]:
rfCV5results = pd.DataFrame(rfCV5results)
rfCV5results['Combination'] = all_medidas_rf['combination'].values
rfCV5results = rfCV5results.reset_index().drop(columns=['index', 'params'])
rfCV5results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,param_criterion,split0_test_neg_mean_absolute_percentage_error,split1_test_neg_mean_absolute_percentage_error,...,rank_test_r2,split0_test_neg_mean_squared_error,split1_test_neg_mean_squared_error,split2_test_neg_mean_squared_error,split3_test_neg_mean_squared_error,split4_test_neg_mean_squared_error,mean_test_neg_mean_squared_error,std_test_neg_mean_squared_error,rank_test_neg_mean_squared_error,Combination
0,1.734937,0.025366,0.046101,0.002571,550,,4,friedman_mse,-0.562684,-0.552975,...,1,-2.366347,-2.359556,-2.249038,-2.548141,-2.538919,-2.4124,0.11492,1,"['HRMAX', 'HRMIN']"
1,3.000826,0.041641,0.108211,0.00743,1090,log2,6,friedman_mse,-0.335791,-0.332395,...,1,-0.72266,-1.141539,-1.110227,-1.215989,-1.187504,-1.075584,0.180192,1,"['TMAX', 'TMIN']"
2,98.35049,0.465463,0.28399,0.020245,1620,log2,22,absolute_error,-0.066583,-0.062681,...,10,-0.049925,-0.059354,-0.078399,-0.0977,-0.076928,-0.072461,0.016564,11,"['TMAX', 'TMIN', 'RADMED', 'VVMED']"
3,2.826967,0.023423,0.058368,0.004372,390,sqrt,12,friedman_mse,-0.146492,-0.148501,...,1,-0.254048,-0.310335,-0.298237,-0.362819,-0.337714,-0.312631,0.036857,1,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']"
4,21.139462,0.287887,0.295922,0.003899,1700,log2,140,poisson,-0.067463,-0.057539,...,4,-0.040106,-0.041406,-0.062413,-0.085638,-0.064372,-0.058787,0.016831,5,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '..."
5,1.316595,0.013823,0.044807,0.005531,550,,4,friedman_mse,-0.188803,-0.19692,...,1,-0.338678,-0.42697,-0.455162,-0.568672,-0.534973,-0.464891,0.081418,1,['RADMED']
6,0.747157,0.003639,0.0298,0.000826,290,sqrt,6,friedman_mse,-0.175095,-0.159695,...,1,-0.292779,-0.339986,-0.328082,-0.427673,-0.357113,-0.349127,0.044575,1,"['RADMED', 'VVMED']"
7,0.971478,0.0127,0.045485,0.004697,550,,4,friedman_mse,-0.703876,-0.548164,...,1,-2.786028,-2.295112,-2.54343,-2.616611,-2.552611,-2.558758,0.157975,1,['VVMED']
8,21.915185,0.183188,0.298755,0.012481,1700,log2,140,poisson,-0.097374,-0.095436,...,5,-0.085835,-0.097957,-0.136614,-0.165921,-0.155249,-0.128315,0.031416,5,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']"
9,4.425923,0.02435,0.053548,0.002642,410,,10,squared_error,-0.216725,-0.229313,...,4,-0.532918,-0.764247,-0.678568,-0.73362,-0.744801,-0.690831,0.083929,4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']"


##  MLP

CI42 Results

In [253]:
from keras.models import Sequential 
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping 
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from scikeras.wrappers import KerasRegressor

# define search space
param_vals = {    
    "first_layer": range(12, 1024, 12),
    "second_layer": range(4, 256, 4),
    "dropout": [0, 0.2, 0.4, 0.6, 0.8],
    "loss": ["mse", "mae"],
    "optimizer": ["adam", "sgd", "RMSprop"],
    "optimizer__learning_rate": [0.0001, 0.001, 0.01, 0.1],
    "activation": ["relu", "tanh"],
    "batch_size": [32, 64, 128]
    }

def get_model(first_layer, second_layer, dropout, activation, meta):

    model = Sequential() 
    model.add(Dense(first_layer, activation = activation, input_shape = (X_train_scaled.shape[1], ))) 
    model.add(Dropout(dropout))
    model.add(Dense(second_layer, activation = activation)) 
    model.add(Dense(1))
    return model

scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

all_medidas = []
mlpCV5results = []
for comb in all_combinations:

    print(comb)
    scaler = StandardScaler().fit(estacionDatas[comb])

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']
    
    # Fit and transform the training features
    X_train_scaled = scaler.transform(X_train)

    # Fit and transform the test features
    X_test_scaled = scaler.transform(test[comb])
    y_test = test['ETO']

    modelMLP = KerasRegressor(
        get_model,
        metrics = ['mean_absolute_error'],
        loss=None,
        first_layer = None,
        second_layer= None,
        dropout = None,
        activation=None,
        batch_size=None, 
        optimizer=None,
        epochs = 500, 
        verbose = 0, 
        validation_split = 0.2, 
        callbacks = [EarlyStopping(monitor = 'val_loss', patience = 30)],
        random_state=123
    )

    # define the search
    searchMLP = RandomizedSearchCV(estimator=modelMLP, param_distributions=param_vals, 
                                    n_jobs=-1, cv=5, verbose=3, n_iter=60,
                                    scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchMLP.fit(X_train_scaled, y_train)

    # Make predictions from X_test datas
    y_pred = searchMLP.best_estimator_.predict(X_test_scaled)

    mlpCV5results.append(pd.DataFrame(searchMLP.cv_results_).iloc[searchMLP.best_index_])

    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchMLP.best_estimator_))
    medidas.append(searchMLP.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas.append(medidas)

all_medidas_mlp = pd.DataFrame(all_medidas, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_mlp

['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['HRMAX', 'HRMIN']",KerasRegressor(\n\tmodel=<function get_model a...,-1.247192,0.302903,1.332872,71.778177,1.668011
1,"['TMAX', 'TMIN']",KerasRegressor(\n\tmodel=<function get_model a...,-0.835169,0.764223,0.81864,45.212211,1.082695
2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.170965,0.989972,0.153631,7.433992,0.202036
3,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.417982,0.920088,0.434815,17.097319,0.555174
4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '...",KerasRegressor(\n\tmodel=<function get_model a...,-0.135892,0.989086,0.14971,7.592785,0.209874
5,['RADMED'],KerasRegressor(\n\tmodel=<function get_model a...,-0.536547,0.903586,0.547268,25.229557,0.703841
6,"['RADMED', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.445806,0.922279,0.420656,17.407109,0.54587
7,['VVMED'],KerasRegressor(\n\tmodel=<function get_model a...,-1.275159,0.299295,1.401888,47.935625,1.847958
8,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.245623,0.97369,0.294625,15.056935,0.385191
9,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']",KerasRegressor(\n\tmodel=<function get_model a...,-0.622679,0.7887,0.695361,28.278537,0.903107


CA91 Results

In [11]:
from keras.models import Sequential 
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping 
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import RandomizedSearchCV

# define search space
param_vals = {    
    "first_layer": range(12, 1024, 12),
    "second_layer": range(4, 256, 4),
    "dropout": [0, 0.2, 0.4, 0.6, 0.8],
    "loss": ["mse", "mae"],
    "optimizer": ["adam", "sgd", "RMSprop"],
    "optimizer__learning_rate": [0.0001, 0.001, 0.01, 0.1],
    "activation": ["relu", "tanh"],
    "batch_size": [32, 64, 128]
    }

def get_model(first_layer, second_layer, dropout, activation, meta):

    model = Sequential() 
    model.add(Dense(first_layer, activation = activation, input_shape = (X_train_scaled.shape[1], ))) 
    model.add(Dropout(dropout))
    model.add(Dense(second_layer, activation = activation)) 
    model.add(Dense(1))
    return model

scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

all_medidas = []
mlpCV5results = []
for comb in all_combinations:

    print(comb)
    scaler = StandardScaler().fit(estacionDatas[comb])

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']
    
    # Fit and transform the training features
    X_train_scaled = scaler.transform(X_train)

    # Fit and transform the test features
    X_test_scaled = scaler.transform(test[comb])
    y_test = test['ETO']

    modelMLP = KerasRegressor(
        get_model,
        metrics = ['mean_absolute_error'],
        loss=None,
        first_layer = None,
        second_layer= None,
        dropout = None,
        activation=None,
        batch_size=None, 
        optimizer=None,
        epochs = 500, 
        verbose = 0, 
        validation_split = 0.2, 
        callbacks = [EarlyStopping(monitor = 'val_loss', patience = 30)],
        random_state=123
    )

    # define the search
    searchMLP = RandomizedSearchCV(estimator=modelMLP, param_distributions=param_vals, 
                                    n_jobs=-1, cv=5, verbose=3, n_iter=60,
                                    scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchMLP.fit(X_train_scaled, y_train)

    # Make predictions from X_test datas
    y_pred = searchMLP.best_estimator_.predict(X_test_scaled)

    mlpCV5results.append(pd.DataFrame(searchMLP.cv_results_).iloc[searchMLP.best_index_])

    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchMLP.best_estimator_))
    medidas.append(searchMLP.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas.append(medidas)

all_medidas_mlp = pd.DataFrame(all_medidas, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_mlp

['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']",KerasRegressor(\n\tmodel=<function get_model a...,-0.668136,0.694299,0.764785,32.958155,1.036495
1,"['TMAX', 'TMIN', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.535845,0.797531,0.597395,21.98853,0.816247
2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.140663,0.98609,0.144088,7.042955,0.216476
3,"['HRMAX', 'HRMIN', 'RADMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.41065,0.92197,0.388857,16.941023,0.509247
4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.246152,0.975108,0.233152,11.613631,0.293814
5,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '...",KerasRegressor(\n\tmodel=<function get_model a...,-0.131916,0.986687,0.138163,6.506379,0.207617
6,['RADMED'],KerasRegressor(\n\tmodel=<function get_model a...,-0.435398,0.907496,0.4333,19.485051,0.553441
7,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.368559,0.920329,0.373819,14.676515,0.507941
8,"['HRMAX', 'HRMIN', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-1.145667,0.413682,1.084009,46.827688,1.365304
9,"['TMAX', 'TMIN']",KerasRegressor(\n\tmodel=<function get_model a...,-0.784174,0.707339,0.75082,35.810153,0.979704


CR12 Results

In [20]:
from keras.models import Sequential 
from keras.layers import Dense, Dropout
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping 
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from scikeras.wrappers import KerasRegressor

# define search space
param_vals = {    
    "first_layer": range(12, 1024, 12),
    "second_layer": range(4, 256, 4),
    "dropout": [0, 0.2, 0.4, 0.6, 0.8],
    "loss": ["mse", "mae"],
    "optimizer": ["adam", "sgd", "RMSprop"],
    "optimizer__learning_rate": [0.0001, 0.001, 0.01, 0.1],
    "activation": ["relu", "tanh"],
    "batch_size": [32, 64, 128]
    }

def get_model(first_layer, second_layer, dropout, activation, meta):

    model = Sequential() 
    model.add(Dense(first_layer, activation = activation, input_shape = (X_train_scaled.shape[1], ))) 
    model.add(Dropout(dropout))
    model.add(Dense(second_layer, activation = activation)) 
    model.add(Dense(1))
    return model

scoring_metrics = ['neg_mean_absolute_percentage_error', 'neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

all_medidas = []
mlpCV5results = []
for comb in all_combinations:

    print(comb)
    scaler = StandardScaler().fit(estacionDatas[comb])

    X_train = train[comb]
    if len(comb) == 1:
        X_train = np.array(X_train).reshape(-1,1)

    y_train = train['ETO']
    
    # Fit and transform the training features
    X_train_scaled = scaler.transform(X_train)

    # Fit and transform the test features
    X_test_scaled = scaler.transform(test[comb])
    y_test = test['ETO']

    modelMLP = KerasRegressor(
        get_model,
        metrics = ['mean_absolute_error'],
        loss=None,
        first_layer = None,
        second_layer= None,
        dropout = None,
        activation=None,
        batch_size=None, 
        optimizer=None,
        epochs = 500, 
        verbose = 0, 
        validation_split = 0.2, 
        callbacks = [EarlyStopping(monitor = 'val_loss', patience = 30)],
        random_state=123
    )

    # define the search
    searchMLP = RandomizedSearchCV(estimator=modelMLP, param_distributions=param_vals, 
                                    n_jobs=-1, cv=5, verbose=3, n_iter=60,
                                    scoring=scoring_metrics, refit='neg_mean_absolute_error', random_state=123)
    
    # perform the search
    searchMLP.fit(X_train_scaled, y_train)

    # Make predictions from X_test datas
    y_pred = searchMLP.best_estimator_.predict(X_test_scaled)

    mlpCV5results.append(pd.DataFrame(searchMLP.cv_results_).iloc[searchMLP.best_index_])

    medidas = []

    medidas.append(str(comb))
    medidas.append(str(searchMLP.best_estimator_))
    medidas.append(searchMLP.best_score_)
    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    all_medidas.append(medidas)

all_medidas_mlp = pd.DataFrame(all_medidas, columns=['combination', 'hyperparameters', 'mean_train_mae', 'test_R2', 'test_MAE', 'test_MAPE', 'test_RMSE'])
all_medidas_mlp

['TMAX', 'TMIN', 'HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits
['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN', 'RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['HRMAX', 'HRMIN']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'RADMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



X does not have valid feature names, but StandardScaler was fitted with feature names


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['RADMED', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'VVMED']
Fitting 5 folds for each of 60 candidates, totalling 300 fits



A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



Unnamed: 0,combination,hyperparameters,mean_train_mae,test_R2,test_MAE,test_MAPE,test_RMSE
0,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']",KerasRegressor(\n\tmodel=<function get_model a...,-0.543779,0.813954,0.674885,25.702076,0.904181
1,"['TMAX', 'TMIN', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.497595,0.845203,0.542333,19.674781,0.708712
2,"['TMAX', 'TMIN', 'RADMED', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.169467,0.983716,0.154021,7.636471,0.218733
3,"['HRMAX', 'HRMIN', 'RADMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.458071,0.900664,0.46463,17.995848,0.613155
4,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.247399,0.972127,0.330095,13.104562,0.41309
5,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '...",KerasRegressor(\n\tmodel=<function get_model a...,-0.122395,0.989855,0.124469,5.845172,0.177415
6,['RADMED'],KerasRegressor(\n\tmodel=<function get_model a...,-0.540652,0.858226,0.533538,23.93933,0.682409
7,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.428567,0.903499,0.414188,16.957663,0.558997
8,"['HRMAX', 'HRMIN', 'VVMED']",KerasRegressor(\n\tmodel=<function get_model a...,-0.971173,0.444761,1.061281,45.164199,1.348795
9,"['TMAX', 'TMIN']",KerasRegressor(\n\tmodel=<function get_model a...,-0.698565,0.797533,0.682813,30.288106,0.886365


In [85]:
mlpCV5results = pd.DataFrame(mlpCV5results)
mlpCV5results['Combination'] = all_medidas_mlp['combination'].values
mlpCV5results = mlpCV5results.reset_index().drop(columns=['index', 'params'])
mlpCV5results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_second_layer,param_optimizer__learning_rate,param_optimizer,param_loss,param_first_layer,param_dropout,...,rank_test_r2,split0_test_neg_mean_squared_error,split1_test_neg_mean_squared_error,split2_test_neg_mean_squared_error,split3_test_neg_mean_squared_error,split4_test_neg_mean_squared_error,mean_test_neg_mean_squared_error,std_test_neg_mean_squared_error,rank_test_neg_mean_squared_error,Combination
0,34.031348,10.81405,0.538664,0.123017,236,0.0001,adam,mae,228,0.0,...,19,-2.384202,-2.36338,-2.262656,-2.649901,-2.667592,-2.465546,0.163112,19,"['HRMAX', 'HRMIN']"
1,47.320136,11.377696,0.476486,0.093072,236,0.0001,adam,mae,228,0.0,...,10,-0.783203,-1.193836,-1.169248,-1.167164,-1.173102,-1.097311,0.15734,10,"['TMAX', 'TMIN']"
2,22.877214,6.325655,0.535446,0.273589,164,0.001,adam,mse,84,0.0,...,1,-0.040228,-0.049068,-0.051457,-0.063865,-0.052424,-0.051408,0.007572,1,"['TMAX', 'TMIN', 'RADMED', 'VVMED']"
3,31.374771,10.386174,0.37496,0.097173,236,0.0001,adam,mae,228,0.0,...,10,-0.234491,-0.323786,-0.302206,-0.337288,-0.342837,-0.308122,0.039389,9,"['HRMAX', 'HRMIN', 'RADMED', 'VVMED']"
4,48.54496,18.31738,0.36392,0.078214,236,0.0001,adam,mae,228,0.0,...,2,-0.029664,-0.028662,-0.031057,-0.034678,-0.046785,-0.034169,0.006629,2,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', '..."
5,32.535491,9.886897,0.36394,0.126811,216,0.001,adam,mae,720,0.2,...,22,-0.35461,-0.433219,-0.473927,-0.56383,-0.556565,-0.47643,0.078452,22,['RADMED']
6,18.066767,2.399243,0.309991,0.052687,200,0.01,sgd,mae,108,0.0,...,5,-0.278803,-0.335182,-0.324591,-0.368265,-0.371139,-0.335596,0.033696,5,"['RADMED', 'VVMED']"
7,22.893804,8.903449,0.44626,0.089759,216,0.001,RMSprop,mae,684,0.6,...,21,-2.788717,-2.348182,-2.870661,-2.529172,-2.67687,-2.64272,0.186593,21,['VVMED']
8,19.71893,6.193582,0.334465,0.082296,164,0.001,adam,mse,84,0.0,...,1,-0.083663,-0.093804,-0.1066,-0.11716,-0.140566,-0.108359,0.019695,1,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED']"
9,29.135124,10.837237,0.333725,0.062118,236,0.0001,adam,mae,228,0.0,...,6,-0.548852,-0.769934,-0.656298,-0.671063,-0.714054,-0.67204,0.073175,5,"['TMAX', 'TMIN', 'HRMAX', 'HRMIN']"


# Train the final models

In [9]:
# Define the best combination of each number of variables
param4 = ['TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'VVMED']
param3 = ['TMAX', 'TMIN', 'RADMED', 'VVMED']
param2 = ['TMAX', 'TMIN', 'RADMED']
param1 = ['RADMED']

all_params = [param4, param3, param2, param1]

In [10]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Train with all observations of the station
X_train = estacionDatas.drop(columns='ETO')
y_train = estacionDatas['ETO']
X_train_scaled = pd.DataFrame(StationScaler.transform(X_train), index=X_train.index, columns=X_train.columns)
X_train_scaled

Unnamed: 0_level_0,TMAX,TMIN,HRMAX,HRMIN,RADMED,VVMED
FECHA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-01,-1.354227,-0.712006,0.027038,0.758373,-0.840810,3.941095
2010-01-02,-0.895870,-0.700295,0.616886,0.464314,-0.917580,-0.216367
2010-01-03,-1.040284,-1.130288,0.832101,0.869991,-1.297883,-0.505750
2010-01-04,-1.096794,-0.501192,1.127025,2.111388,-1.655698,-0.621503
2010-01-05,-0.968705,-0.253570,0.329933,1.159518,-1.195745,0.709657
...,...,...,...,...,...,...
2023-06-13,0.568359,0.937696,1.213111,-0.199729,0.824526,-0.544334
2023-06-14,0.653751,0.880810,1.589339,-0.033151,1.208707,-0.602211
2023-06-15,0.984019,0.875791,1.294415,-0.195763,1.698902,-0.708317
2023-06-16,1.105829,0.964466,1.589339,-0.481323,1.537276,-0.534688


### NOTE: The best models for each station are different, the code will execute the corresponding bloc depending the value of stationCode.

In [8]:
from sklearn.svm import SVR
from keras.models import Sequential 
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping 
from keras.optimizers import Adam
from scikeras.wrappers import KerasRegressor

In [11]:
print('Training final models for station: ' + stationCode)
if stationCode == 'CI42':
    ############## BEST MODELS FOR CI42 ##############
    # Model 4 - SVR
    modelo4 = SVR(C=10, epsilon=0.01, gamma=0.1)

    # Model 3 - MLP
    def get_model(first_layer, second_layer, dropout, activation, meta):

        model = Sequential() 
        model.add(Dense(first_layer, activation = activation, input_shape = (X_train_scaled[param3].shape[1], ))) 
        model.add(Dropout(dropout))
        model.add(Dense(second_layer, activation = activation)) 
        model.add(Dense(1))
        return model

    modelo3 =  KerasRegressor(
            get_model,
            metrics = ['mean_absolute_error'],
            loss='mse',
            first_layer = 84,
            second_layer= 164,
            dropout = 0,
            activation='relu',
            batch_size=128, 
            optimizer=Adam(learning_rate=0.001),
            epochs = 500, 
            verbose = 0, 
            validation_split = 0.2, 
            callbacks = [EarlyStopping(monitor = 'val_loss', patience = 30)],
            random_state=123
    )

    # Model 2 - SVR
    modelo2 = SVR(C=10, epsilon=0.01, gamma=0.1)

    # Model 1 - SVR
    modelo1 = SVR(C=50, gamma=1)

    # Train all models 
    def entrenarLosModelos(modelos):
        for idx, m in enumerate(modelos):
            m.fit(X_train_scaled[all_params[idx]], estacionDatas['ETO']) 
        return modelos

    modelos = entrenarLosModelos([modelo4, modelo3, modelo2, modelo1])


elif stationCode == 'CA91':

    ############## BEST MODELS FOR CA91 ##############
    # Model 4 - MLP
    def get_model(first_layer, second_layer, dropout, activation, meta):

        model = Sequential() 
        model.add(Dense(first_layer, activation = activation, input_shape = (X_train_scaled[param4].shape[1], ))) 
        model.add(Dropout(dropout))
        model.add(Dense(second_layer, activation = activation)) 
        model.add(Dense(1))
        return model

    modelo4 =  KerasRegressor(
            get_model,
            metrics = ['mean_absolute_error'],
            loss='mae',
            first_layer = 228,
            second_layer= 236,
            dropout = 0,
            activation='relu',
            batch_size=32, 
            optimizer=Adam(learning_rate=0.0001),
            epochs = 500, 
            verbose = 0, 
            validation_split = 0.2, 
            callbacks = [EarlyStopping(monitor = 'val_loss', patience = 30)],
            random_state=123
    )

    # Model 3 - SVR
    modelo3 = SVR(C=100, gamma=0.1)

    # Model 2 - SVR
    modelo2 = SVR(C=1000, gamma=0.1)

    # Model 1 - SVR
    modelo1 = SVR(C=1, epsilon=0.01, gamma=10)

    # Train all models 
    def entrenarLosModelos(modelos):
        for idx, m in enumerate(modelos):
            m.fit(X_train_scaled[all_params[idx]], estacionDatas['ETO']) 
        return modelos

    modelos = entrenarLosModelos([modelo4, modelo3, modelo2, modelo1])

elif stationCode == 'CR12':

    ############## BEST MODELS FOR CR12 ##############
    # Modelo 4 - SVR
    modelo4 = SVR(C=10, epsilon=0.01, gamma=0.1)

    # Modelo 3 - SVR
    modelo3 = SVR(C=100, gamma=0.1)

    # Modelo 2 - MLP
    def get_model(first_layer, second_layer, dropout, activation, meta):

        model = Sequential() 
        model.add(Dense(first_layer, activation = activation, input_shape = (X_train_scaled[param2].shape[1], ))) 
        model.add(Dropout(dropout))
        model.add(Dense(second_layer, activation = activation)) 
        model.add(Dense(1))
        return model

    modelo2 =  KerasRegressor(
            get_model,
            metrics = ['mean_absolute_error'],
            loss='mae',
            first_layer = 288,
            second_layer= 144,
            dropout = 0.4,
            activation='relu',
            batch_size=64, 
            optimizer=Adam(learning_rate=0.001),
            epochs = 500, 
            verbose = 0, 
            validation_split = 0.2, 
            callbacks = [EarlyStopping(monitor = 'val_loss', patience = 30)],
            random_state=123
    )


    # Modelo 1 - SVR
    modelo1 = SVR(C=1, epsilon=0.01, gamma=1)

    # Entrenar todos los modelos con los datos de la estacion
    def entrenarLosModelos(modelos):
        for idx, m in enumerate(modelos):
            #m.fit(X_train_scaled[all_params[idx]], y_train) # para test local
            m.fit(X_train_scaled[all_params[idx]], estacionDatas['ETO']) 
        return modelos

    modelos = entrenarLosModelos([modelo4, modelo3, modelo2, modelo1])

else:
    print('Error in station code -> ', stationCode, ' no match any of three training stations.')

Training final models for station: CR12


# Estimation test at different scale

## Read stations from other stations of Murcia and SIAR

In [12]:
import os

# Read stations from SIAR
def leerEstacionSiar(path):
       estacion = pd.read_csv(path, encoding="utf-16", sep=';', na_values='0')
       estacion.columns = ['IdProvincia', 'IdEstacion', 'Fecha', 'Año', 'Dia', 'Temp Max (ºC)',
              'Temp Mínima (ºC)', 'Humedad Max (%)', 'Humedad Min (%)',
              'Velviento (m/s)', 'Radiación (MJ/m2)', 'EtPMon']
       estacion = estacion[['Fecha', 'EtPMon', 'Temp Max (ºC)',
              'Temp Mínima (ºC)', 'Humedad Max (%)', 'Humedad Min (%)','Radiación (MJ/m2)', 'Velviento (m/s)'  ]]
       estacion['Fecha'] = pd.to_datetime(estacion['Fecha'], format='%d/%m/%Y')

       estacion.dropna(inplace=True)
       estacion.index = estacion['Fecha']
       estacion.drop(columns='Fecha', inplace=True)
       estacion.columns = estacionDatas.columns
       for i in estacion.columns:
              estacion[i] = pd.to_numeric(estacion[i].apply(lambda x : convertirComa(x)))
       estacion['RADMED'] = estacion['RADMED'].apply(lambda x: x / 0.0864)
       estacion = estacion[estacionDatas.columns]
       estacion = estacion[(estacion.index >= '2017-01-01') & (estacion.index < '2023-01-01')]

       if estacion.duplicated().sum() > 0:
              print(path)
              print('Fechas repetidas:', estacion.duplicated().sum())

       return estacion

# SIAR stations
dirSiar = './siar/'
ficheros = os.listdir(dirSiar)
estacionesSiar = []
nombreEstacionesSiar = []
for f in ficheros:
    if 'csv' not in f:
        continue
    estacionesSiar.append(leerEstacionSiar(dirSiar+f))
    nombreEstacionesSiar.append(f.split('.')[0])

# Stations from Murcia Region
murciaDir = './all data murcia/'
nombreEstacionesMurcia = []
estacionesMurcia = []
for f in os.listdir(murciaDir):
    if stationCode in f:
        continue
    df = leerEstacionDatos(murciaDir+f)
    df = df[df.index >= '2017-01-01']
    df = df[df.index <= '2023-06-17']
    estacionesMurcia.append(df)
    nombreEstacionesMurcia.append(f.split('.')[0])

## Evaluate the models

In [13]:
# Obtain all statistical indicators with y_test and y_pred
def obtenerMedidas(y_test, y_pred, graficas=False):    

    medidas = []

    medidas.append(np.corrcoef(y_test, y_pred)[0][1]**2)
    medidas.append(mean_absolute_error(y_true=y_test,y_pred=y_pred))
    medidas.append(mean_absolute_percentage_error(y_true=y_test,y_pred=y_pred)*100)
    medidas.append(mean_squared_error(y_true=y_test,y_pred=y_pred,squared=False))

    if graficas:
        fig = go.Figure()
        fig.add_trace(go.Scatter(x=y_test.index, y=y_test,
                            name='real', mode='lines'))
        fig.add_trace(go.Scatter(x=y_test.index, y=y_pred,
                            name='prediction', mode='lines'))
        fig.show()

        fig = px.scatter(x=y_test, y=y_pred, labels={'x': 'real', 'y':'pred'}, trendline='ols')
        fig.show()
    return medidas

# Evaluate all 4 models 
def evaluarModelos(modelos, estacion, graficas=False):

    medidas_modelos = []

    X_test = estacion.drop(columns='ETO')
    y_test = estacion['ETO']
    X_test_scaled = pd.DataFrame(StationScaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    for idx, modelo in enumerate(modelos):
        y_pred = modelo.predict(X_test_scaled[all_params[idx]]) 
        medidas = obtenerMedidas(y_test, y_pred, graficas)
        medidas.append('modelo'+str(4-idx))
        medidas.append(all_params[idx])

        medidas_modelos.append(medidas)

    return pd.DataFrame(medidas_modelos, columns=['R2', 'MAE', 'MAPE', 'RMSE', 'Modelo', 'Variables'])

# Evaluate a station with all 4 models
def evaluarModelosEstaciones(modelos, estaciones, nombres):
    all_estaciones = []
    for idx, estacion in enumerate(estaciones):
        df = evaluarModelos(modelos, estacion)
        df['Estacion'] = nombres[idx]
        all_estaciones.append(df)

    return pd.concat(all_estaciones)


### Regional (Murcia)

In [14]:
pd.set_option('display.max_rows', None)
resultados = evaluarModelosEstaciones(modelos, estacionesMurcia, nombreEstacionesMurcia)[['R2', 'MAE', 'MAPE', 'RMSE', 'Modelo', 'Estacion']]
resultados

Unnamed: 0,R2,MAE,MAPE,RMSE,Modelo,Estacion
0,0.989131,0.168198,5.770792,0.223265,modelo4,AL41
1,0.977182,0.333448,10.41474,0.414234,modelo3,AL41
2,0.923784,0.482673,15.966257,0.64409,modelo2,AL41
3,0.828063,0.793285,22.237859,1.027126,modelo1,AL41
0,0.956587,0.342049,17.509309,0.419892,modelo4,CA73
1,0.958261,0.365178,17.251919,0.437761,modelo3,CA73
2,0.929193,0.569837,26.601023,0.650508,modelo2,CA73
3,0.870935,0.481328,19.887251,0.598705,modelo1,CA73
0,0.980936,0.262739,14.00878,0.33014,modelo4,CA91
1,0.979179,0.303887,14.444602,0.35267,modelo3,CA91


In [15]:
resultados['Modelo'] = resultados['Modelo'].apply(lambda x: x.replace('modelo', 'M'))
resultados.groupby(['Modelo']).describe()[[(    'R2',  'mean'),
            (   'MAE',  'mean'),
            (  'MAPE',  'mean'),
            (  'RMSE',  'mean'),
            ]].iloc[::-1]

Unnamed: 0_level_0,R2,MAE,MAPE,RMSE
Unnamed: 0_level_1,mean,mean,mean,mean
Modelo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
M4,0.980155,0.233769,10.384614,0.296103
M3,0.972465,0.30593,12.413886,0.372819
M2,0.917456,0.522353,22.023978,0.654224
M1,0.852543,0.611513,21.842455,0.793234


### National

In [16]:
pd.set_option('display.max_rows', None)

In [17]:
resultados = evaluarModelosEstaciones(modelos, estacionesSiar, nombreEstacionesSiar)[['R2', 'MAE', 'MAPE', 'RMSE', 'Modelo', 'Estacion']]
resultados['Modelo'] = resultados['Modelo'].apply(lambda x: x.replace('modelo', 'M'))
resultados['Estacion'] = resultados['Estacion'].apply(lambda x: x.split(' ')[0])
resultados

Unnamed: 0,R2,MAE,MAPE,RMSE,Modelo,Estacion
0,0.991885,0.138919,5.764658,0.186809,M4,A19
1,0.978751,0.371498,14.774788,0.457415,M3,A19
2,0.95141,0.378975,15.757721,0.489191,M2,A19
3,0.884516,0.578049,19.429732,0.761472,M1,A19
0,0.994032,0.140156,6.917188,0.183206,M4,AB05
1,0.978947,0.410398,19.907743,0.513242,M3,AB05
2,0.945459,0.456594,20.975442,0.604997,M2,AB05
3,0.833944,0.739742,28.155046,0.99076,M1,AB05
0,0.978729,0.304115,14.679088,0.372164,M4,AL02
1,0.973239,0.378845,15.516052,0.438799,M3,AL02


In [18]:
pd.set_option('display.max_rows', 20)
resultados.groupby(['Modelo']).describe()[[(    'R2',  'mean'),
            (   'MAE',  'mean'),
            (  'MAPE',  'mean'),
            (  'RMSE',  'mean'),
            ]].iloc[::-1]

Unnamed: 0_level_0,R2,MAE,MAPE,RMSE
Unnamed: 0_level_1,mean,mean,mean,mean
Modelo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
M4,0.975426,0.204046,10.955874,0.268076
M3,0.9578,0.354959,18.878056,0.442386
M2,0.917267,0.50191,26.891593,0.628906
M1,0.845528,0.58132,27.188626,0.763679


# Forecasting test

## Read real and forecast data

In [19]:
# Function that read the forecast data (hourly) of a specific station and convert it into daily values according each variable:
# T, Hr -> max and min
# U2 and Rs -> mean
def leerPredicciones(path):
    df = pd.read_csv(path)
    df['dates'] = pd.to_datetime(df['dates'])
    df.drop(columns=['Estacion', 'Servicio'], inplace=True)
    # Sacar el DF de WB agrupado por dias y con las variables calculadas 
    punto = [l[1] for l in list(df.groupby([df['dates'].dt.date]))]
    FECHA = []
    TMAX = []
    TMIN = []
    HRMAX = []
    HRMIN = []
    VVMED = []
    RADMED = []
    for p in punto:
        FECHA.append(pd.to_datetime(p['dates']).dt.date.iloc[0])
        TMAX.append(p['temp'].max())
        TMIN.append(p['temp'].min())
        HRMAX.append(p['rh'].max())
        HRMIN.append(p['rh'].min())
        VVMED.append(p['wind'].mean())
        RADMED.append(p['solar_rad'].mean())

    return pd.DataFrame({
        "FECHA": pd.to_datetime(FECHA),
        "TMAX": TMAX,
        "TMIN": TMIN,
        "HRMAX": HRMAX,
        "HRMIN": HRMIN,
        "VVMED": VVMED,
        "RADMED": RADMED
    }
    )

In [20]:
def leerPredictionTest():
    dir = './forecastTest/'
    subdirs = os.listdir(dir)

    dfs_wb = []
    dfs_vc = []
    for subdir in subdirs:

        # Fichero ETo real
        station = subdir.split('-')[0]
        eto = leerEstacionDatos('./all data murcia/' + station + '.csv')
        eto = eto[eto.index >= '2023-06-18']
        eto.reset_index(inplace=True)

        loc = dir+subdir+'/'

        # Ficheros de WB y VC
        df_wb = leerPredicciones(loc + 'WB-'+ subdir + '.csv')
        df_wb = pd.merge(df_wb, eto[['FECHA', 'ETO']], on='FECHA')
        df_wb['VVMED'] = df_wb['VVMED'].apply(lambda x: x*4.87/np.log(67.8*10-5.42))

        df_vc = leerPredicciones(loc + 'VC-'+ subdir + '.csv')
        df_vc = pd.merge(df_vc, eto[['FECHA', 'ETO']], on='FECHA')
        df_vc['VVMED'] = df_vc['VVMED'].apply(lambda x: x*4.87/np.log(67.8*10-5.42))


        dfs_wb.append(df_wb)
        dfs_vc.append(df_vc)

    return dfs_wb, dfs_vc, subdirs

dfs_wb, dfs_vc, locs = leerPredictionTest()

## Test using 2 weather services (WeatherBit and VisualCrossing)

### TEST WB

In [21]:
pd.set_option('display.max_rows', None)
resultadosPredicciones = evaluarModelosEstaciones(modelos=modelos, estaciones=[df[estacionDatas.columns] for df in dfs_wb]  , nombres=locs)[['R2', 'MAE', 'MAPE', 'RMSE', 'Modelo', 'Estacion']]
resultadosPredicciones['Modelo'] = resultadosPredicciones['Modelo'].apply(lambda x: x.replace('modelo', 'M'))
resultadosPredicciones['Estacion'] = resultadosPredicciones['Estacion'].apply(lambda x: x.split('-')[0])
resultadosPredicciones

Unnamed: 0,R2,MAE,MAPE,RMSE,Modelo,Estacion
0,0.943649,0.532368,34.941536,0.667424,M4,AL41
1,0.933912,0.585007,32.843029,0.719343,M3,AL41
2,0.900442,0.613593,39.432167,0.779165,M2,AL41
3,0.877071,0.748499,34.085609,0.942381,M1,AL41
0,0.924794,0.722977,33.092748,0.826258,M4,CA73
1,0.875616,1.226613,43.964621,1.459792,M3,CA73
2,0.90758,0.888555,36.071464,1.009869,M2,CA73
3,0.898239,0.504046,19.896885,0.642851,M1,CA73
0,0.928847,0.812035,40.540078,0.94539,M4,CA91
1,0.914746,1.028104,41.205275,1.20599,M3,CA91


In [22]:
resultadosPredicciones.groupby(['Modelo']).describe()[[(    'R2',  'mean'),
            (   'MAE',  'mean'),
            (  'MAPE',  'mean'),
            (  'RMSE',  'mean'),
            ]].iloc[::-1]

Unnamed: 0_level_0,R2,MAE,MAPE,RMSE
Unnamed: 0_level_1,mean,mean,mean,mean
Modelo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
M4,0.931483,0.617747,26.262821,0.750168
M3,0.916161,0.797751,28.968883,0.974875
M2,0.899742,0.670102,28.218268,0.826667
M1,0.880499,0.609413,22.544391,0.791256


### Test VC

In [23]:
resultadosPredicciones = evaluarModelosEstaciones(modelos=modelos, estaciones=[df[estacionDatas.columns] for df in dfs_vc]  , nombres=locs)[['R2', 'MAE', 'MAPE', 'RMSE', 'Modelo', 'Estacion']]
resultadosPredicciones['Modelo'] = resultadosPredicciones['Modelo'].apply(lambda x: x.replace('modelo', 'M'))
resultadosPredicciones['Estacion'] = resultadosPredicciones['Estacion'].apply(lambda x: x.split('-')[0])
resultadosPredicciones

Unnamed: 0,R2,MAE,MAPE,RMSE,Modelo,Estacion
0,0.956526,0.522932,35.407918,0.645031,M4,AL41
1,0.945808,0.658089,36.343959,0.783533,M3,AL41
2,0.909159,0.631414,39.649125,0.773921,M2,AL41
3,0.868906,0.700483,33.889964,0.914942,M1,AL41
0,0.850855,0.667492,32.66915,0.797096,M4,CA73
1,0.826335,1.329144,46.417631,1.664966,M3,CA73
2,0.902598,0.901262,35.353596,1.043393,M2,CA73
3,0.888656,0.569747,21.648278,0.719449,M1,CA73
0,0.899106,0.825937,42.875592,0.989027,M4,CA91
1,0.887638,1.107416,44.657864,1.329922,M3,CA91


In [24]:
pd.set_option('display.max_rows', 20)
resultadosPredicciones.groupby(['Modelo']).describe()[[(    'R2',  'mean'),
            (   'MAE',  'mean'),
            (  'MAPE',  'mean'),
            (  'RMSE',  'mean'),
            ]].iloc[::-1]

Unnamed: 0_level_0,R2,MAE,MAPE,RMSE
Unnamed: 0_level_1,mean,mean,mean,mean
Modelo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
M4,0.917989,0.617354,26.882796,0.753397
M3,0.905785,0.838346,30.556267,1.029542
M2,0.900682,0.698372,28.617882,0.84828
M1,0.877231,0.605356,22.668581,0.789731


## Normalize the forecast U2

In [25]:
# Normalize the given values into custom min-max range.
def scale_to_custom_range(values, custom_min, custom_max):
    min_value = min(values)
    max_value = max(values)

    scaled_values = [(x - min_value) / (max_value - min_value) * (custom_max - custom_min) + custom_min for x in values]
    return scaled_values

In [26]:
def leerPredictionTestNormalizeU2():
    dir = './forecastTest/'
    subdirs = os.listdir(dir)

    dfs_wb = []
    dfs_vc = []
    for subdir in subdirs:

        # Read the real eto data
        loc = dir+subdir+'/'
        eto = leerEstacionDatos(loc + subdir + '.csv')
        eto.reset_index(inplace=True)
        eto.columns = ['FECHA', 'ETO', 'TMAX', 'TMIN', 'HRMAX', 'HRMIN', 'RADMED', 'U2']

        # Read WB data
        df_wb = leerPredicciones(loc + 'WB-'+ subdir + '.csv')
        df_wb = pd.merge(df_wb, eto[['FECHA', 'ETO', 'U2']], on='FECHA')
        df_wb['VVMED'] = df_wb['VVMED'].apply(lambda x: x*4.87/np.log(67.8*30-5.42))

        # Split randomly the forecast WB data into train (X) and test set (y)
        X_wb,y_wb = train_test_split(df_wb, test_size = 0.5, shuffle=True, random_state=123)
        # Using the train set to extract the max and min values and normalize the test set
        y_wb['VVMED'] = scale_to_custom_range(y_wb['VVMED'], X_wb['U2'].min(), X_wb['U2'].max())
        # Delete the original U2 column
        y_wb.drop(columns='U2', inplace=True)

        # Read VC data
        df_vc = leerPredicciones(loc + 'VC-'+ subdir + '.csv')
        df_vc = pd.merge(df_vc, eto[['FECHA', 'ETO', 'U2']], on='FECHA')
        df_vc['VVMED'] = df_vc['VVMED'].apply(lambda x: x*4.87/np.log(67.8*30-5.42))

        # Same process as WB
        X_vc,y_vc = train_test_split(df_vc, test_size = 0.5, shuffle=True, random_state=123)
        y_vc['VVMED'] = scale_to_custom_range(y_vc['VVMED'], X_vc['U2'].min(), X_vc['U2'].max())
        y_vc.drop(columns='U2', inplace=True)

        dfs_wb.append(y_wb)
        dfs_vc.append(y_vc)

    return dfs_wb, dfs_vc, subdirs

dfs_wb, dfs_vc, locs = leerPredictionTestNormalizeU2()

### Test WB

In [27]:
pd.set_option('display.max_rows', None)
resultadosPredicciones = evaluarModelosEstaciones(modelos=modelos, estaciones=[df[estacionDatas.columns] for df in dfs_wb]  , nombres=locs)[['R2', 'MAE', 'MAPE', 'RMSE', 'Modelo', 'Estacion']]
resultadosPredicciones['Modelo'] = resultadosPredicciones['Modelo'].apply(lambda x: x.replace('modelo', 'M'))
resultadosPredicciones['Estacion'] = resultadosPredicciones['Estacion'].apply(lambda x: x.split('-')[0])
resultadosPredicciones

Unnamed: 0,R2,MAE,MAPE,RMSE,Modelo,Estacion
0,0.963284,0.509721,29.314137,0.606332,M4,AL41
1,0.953809,0.432051,26.478821,0.57138,M3,AL41
2,0.911163,0.636533,43.994075,0.832746,M2,AL41
3,0.8976,0.755376,34.304193,0.943342,M1,AL41
0,0.943865,0.40528,20.824233,0.523644,M4,CA73
1,0.934303,0.632511,23.798502,0.752421,M3,CA73
2,0.924486,0.854501,34.631003,0.975218,M2,CA73
3,0.917028,0.46246,18.597361,0.60553,M1,CA73
0,0.956665,0.436142,24.211404,0.539953,M4,CA91
1,0.957897,0.447843,21.074069,0.580551,M3,CA91


In [28]:
resultadosPredicciones.groupby(['Modelo']).describe()[[(    'R2',  'mean'),
            (   'MAE',  'mean'),
            (  'MAPE',  'mean'),
            (  'RMSE',  'mean'),
            ]].iloc[::-1]

Unnamed: 0_level_0,R2,MAE,MAPE,RMSE
Unnamed: 0_level_1,mean,mean,mean,mean
Modelo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
M4,0.946639,0.416708,18.276707,0.540901
M3,0.939692,0.492551,19.363155,0.636614
M2,0.908822,0.673225,29.74072,0.838678
M1,0.892297,0.622316,23.388262,0.799922


### Test VC

In [29]:
pd.set_option('display.max_rows', None)
resultadosPredicciones = evaluarModelosEstaciones(modelos=modelos, estaciones=[df[estacionDatas.columns] for df in dfs_vc]  , nombres=locs)[['R2', 'MAE', 'MAPE', 'RMSE', 'Modelo', 'Estacion']]
resultadosPredicciones['Modelo'] = resultadosPredicciones['Modelo'].apply(lambda x: x.replace('modelo', 'M'))
resultadosPredicciones['Estacion'] = resultadosPredicciones['Estacion'].apply(lambda x: x.split('-')[0])
resultadosPredicciones

Unnamed: 0,R2,MAE,MAPE,RMSE,Modelo,Estacion
0,0.970796,0.485724,23.627485,0.588027,M4,AL41
1,0.961024,0.384464,22.703367,0.504728,M3,AL41
2,0.921637,0.658707,43.70696,0.816994,M2,AL41
3,0.890217,0.700839,33.710072,0.91667,M1,AL41
0,0.901245,0.534808,24.522918,0.628605,M4,CA73
1,0.921521,0.588557,23.257096,0.712552,M3,CA73
2,0.92327,0.862872,33.378362,1.002348,M2,CA73
3,0.90502,0.53923,20.683206,0.695669,M1,CA73
0,0.948777,0.514272,27.912464,0.59543,M4,CA91
1,0.955181,0.533233,23.995938,0.648417,M3,CA91


In [30]:
pd.set_option('display.max_rows', 20)
resultadosPredicciones.groupby(['Modelo']).describe()[[(    'R2',  'mean'),
            (   'MAE',  'mean'),
            (  'MAPE',  'mean'),
            (  'RMSE',  'mean'),
            ]].iloc[::-1]

Unnamed: 0_level_0,R2,MAE,MAPE,RMSE
Unnamed: 0_level_1,mean,mean,mean,mean
Modelo,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
M4,0.93831,0.466001,19.387338,0.579236
M3,0.935514,0.52888,19.992367,0.667824
M2,0.913836,0.694085,29.602159,0.84526
M1,0.891275,0.613032,23.330662,0.794443
