# Pré processamento
* Leia o arquivo Bias_correction_ucl.csv

In [1]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00514/Bias_correction_ucl.csv"

original_ko_temp_forecast_df = pd.read_csv(url)

* Remova a coluna "Next_Tmin".
* Remova a coluna Date

In [2]:
ko_temp_forecast_df = original_ko_temp_forecast_df.drop(['Next_Tmin', 'Date'], axis=1)

* Remova as linhas que tem valor faltante. Das 7752 linhas originais sobram 7588

In [3]:
ko_temp_forecast_df.shape

(7752, 23)

In [4]:
ko_temp_forecast_df.dropna(inplace=True)

In [5]:
ko_temp_forecast_df.shape

(7588, 23)

* O atributo de saída é Next_Tmax (a temperatura máxima no próximo dia). Vamos removê-lo do conjunto de dados para evitar contaminação durante o processo de normalização.


In [6]:
next_tmax = ko_temp_forecast_df['Next_Tmax']
ko_temp_forecast_df.drop('Next_Tmax', axis=1, inplace=True)

* Centralize e normalize cada atributo de entrada

In [7]:
from sklearn import preprocessing

prep_ko_temp_forecast_np = preprocessing.scale(ko_temp_forecast_df)
prep_ko_temp_forecast_np.shape

(7588, 22)

# Busca de hiperparâmetros e modelos utilizando:
* 5 fold cross validation como técnica de resampling separando os dados em conjuntos de 70% de treino e 30% de teste;
* RMSE como medida de erro;
* Busca aleatória de hiperparametros (nos modelos aplicáveis)

In [34]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
import numpy as np
import random

# Numero de vezes em que serão gerados valores aleatórios de hiperparâmetros
HYPERPARAM_SAMPLING_N = 10

def rmse_positive(model):
    # Calcula o valor de RMSE no conjunto de dados pré processado dado o modelo
    # especificado
    scores = cross_val_score(model, \
                           prep_ko_temp_forecast_np,
                           next_tmax, \
                           cv = ShuffleSplit(n_splits=5, test_size=0.3), \
                           scoring=('neg_root_mean_squared_error'))
    return np.mean(- scores)

## Linear

In [32]:
from sklearn.linear_model import LinearRegression

traditional_lr_rmse = rmse_positive(LinearRegression())

## Linear com regularização L2
Sendo alpha escolhido com: 10 números aleatórios entre 10<sup>-3</sup> e 10<sup>3</sup>

In [39]:
from sklearn.linear_model import Ridge

#--------------------------------------------------------------------------
# Default hyperparameters
default_ridge_lr_RMSE = rmse_positive(Ridge())

#--------------------------------------------------------------------------
# Tunning hyperparams
best_ridge_lr_RMSE = 10**3

for i in range(HYPERPARAM_SAMPLING_N):
    ridge = Ridge(alpha=random.uniform(10**-3, 10**3))
    
    mean_iter_scores = rmse_positive(ridge)

    if best_ridge_lr_RMSE > mean_iter_scores:
        best_ridge_lr_RMSE = mean_iter_scores  

## 2.3 Linear com regularização L1
Sendo alpha escolhido com: 10 números aleatórios entre 10-3 e 103

In [16]:
from sklearn.linear_model import Lasso

#--------------------------------------------------------------------------
# Default hyperparameters
default_lasso_lr_RMSE = rmse_positive(Lasso())

#--------------------------------------------------------------------------
# Tunning hyperparams

best_lasso_lr_RMSE = 10**3

for i in range(HYPERPARAM_SAMPLING_N):
    lasso = Lasso(alpha=random.uniform(10**-3, 10**3))
    
    mean_iter_scores = rmse_positive(lasso)
    
    if best_lasso_lr_RMSE > mean_iter_scores:
        best_lasso_lr_RMSE = mean_iter_scores  

## 2.4 SVM Linear
Sendo:
* epsilon = 0.1 ou 0.3
* 2 <sup>-5</sup> >= C >= 2 <sup>15</sup>


In [18]:
from sklearn.svm import LinearSVR

#--------------------------------------------------------------------------
# Default hyperparameters
default_svr_linear_RMSE = rmse_positive(LinearSVR())

#--------------------------------------------------------------------------
# Tunning hyperparams

best_svr_linear_RMSE = 10**3

for i in range(HYPERPARAM_SAMPLING_N):
    svr = LinearSVR(C = random.uniform(2**-5, 2**15), \
                    epsilon = random.choice([0.1, 0.3]))
    
    mean_iter_scores = rmse_positive(svr)
    
    if best_svr_linear_RMSE > mean_iter_scores:
        best_svr_linear_RMSE = mean_iter_scores  

## 2.5 SVM com kernel RBF
Sendo:
* epsilon = 0.1 ou 0.3
* 2 <sup>-5</sup> >= C >= 2 <sup>15</sup>
* 2 <sup>-9</sup> >= gamma >= 2 <sup>3</sup>

In [21]:
from sklearn.svm import SVR

#--------------------------------------------------------------------------
# Default hyperparameters
default_svr_rbf_RMSE = rmse_positive(SVR(kernel='rbf'))

#--------------------------------------------------------------------------
# Tunning hyperparams

best_svr_rbf_RMSE = 10**3

for i in range(HYPERPARAM_SAMPLING_N):
    
    svr = SVR(epsilon = random.choice([0.1, 0.3]), \
              C = random.uniform(2**-5, 2**15), \
              gamma = random.uniform(2**-9, 2**3), \
              kernel='rbf')
    
    mean_iter_scores = rmse_positive(svr)
    
    if best_svr_rbf_RMSE > mean_iter_scores:
        best_svr_rbf_RMSE = mean_iter_scores

## 2.6 KNN
Sendo K: 10 números aleatórios entre 1 e 1000

In [23]:
from sklearn.neighbors import KNeighborsRegressor

#--------------------------------------------------------------------------
# Default hyperparameters
default_knn_regr_RMSE = rmse_positive(KNeighborsRegressor())

#--------------------------------------------------------------------------
# Tunning hyperparams

best_knn_regr_RMSE = 10**3

for i in range(HYPERPARAM_SAMPLING_N):
    knn_regressor = KNeighborsRegressor(n_neighbors=random.randint(1, 1000))
    
    mean_iter_scores = rmse_positive(knn_regressor)
    
    if best_knn_regr_RMSE > mean_iter_scores:
        best_knn_regr_RMSE = mean_iter_scores

## 2.7 MLP
Neurônios na camada do meio: de 5 a 20, de três em três

In [24]:
from sklearn.neural_network import MLPRegressor

#--------------------------------------------------------------------------
# Default hyperparameters
default_mlp_regr_RMSE = rmse_positive(MLPRegressor())

#--------------------------------------------------------------------------
# Tunning hyperparams

best_mlp_regr_RMSE = 10**3

neurons = 5

while neurons <= 20:
    mlp_regressor = MLPRegressor(hidden_layer_sizes=neurons)
    
    mean_iter_scores = rmse_positive(mlp_regressor)
    
    if best_mlp_regr_RMSE > mean_iter_scores:
        best_mlp_regr_RMSE = mean_iter_scores
    
    neurons +=3

## 2.8 Arvore de decisão
* Usando prunning com ccp_alpha sendo 10 números aleatórios entre 0.0 e 0.04.


In [26]:
from sklearn.tree import DecisionTreeRegressor

#--------------------------------------------------------------------------
# Default hyperparameters
default_dec_tree_RMSE = rmse_positive(DecisionTreeRegressor())

#--------------------------------------------------------------------------
# Tunning hyperparams

best_dec_tree_RMSE = 10**3

for i in range(HYPERPARAM_SAMPLING_N):
    dec_tree_regressor = DecisionTreeRegressor(ccp_alpha=random. \
                                               uniform(0.0, 0.04))
    
    mean_iter_scores = rmse_positive(dec_tree_regressor)
    
    if best_dec_tree_RMSE > mean_iter_scores:
        best_dec_tree_RMSE = mean_iter_scores

## 2.9 Random Forest
Usando todas as combinações dos valores abaixo.
* n_estimators: use os valores: 10, 100 e 1000;
* max_features: use os valores 5, 10, e 22.

In [27]:
from sklearn.ensemble import RandomForestRegressor

#--------------------------------------------------------------------------
# Default hyperparameters
default_rand_for_RMSE = rmse_positive(RandomForestRegressor())

#--------------------------------------------------------------------------
# Tunning hyperparams

best_rand_for_RMSE = 10**3
n_estimators = [10, 100, 1000]
max_features = [5, 10, 22]

for estimator in n_estimators:
    for max_feature in max_features:
        dec_tree_regressor = RandomForestRegressor(n_estimators = estimator, \
                                                   max_features = max_feature)
        mean_iter_scores = rmse_positive(dec_tree_regressor)
        
        if best_rand_for_RMSE > mean_iter_scores:
            best_rand_for_RMSE = mean_iter_scores

## 2.10 GBM
Selecionando 10 trinca aleatórias ente:
* n_estimators: de 5 a 100;
* learning_rate: de 0.01 a 0.3;
* max_depth: 2 ou 3.

In [28]:
from sklearn.ensemble import GradientBoostingRegressor

#--------------------------------------------------------------------------
# Default hyperparameters
default_gbm_RMSE = rmse_positive(GradientBoostingRegressor())

#--------------------------------------------------------------------------
# Tunning hyperparams

best_gbm_RMSE = 10**3

for i in range(HYPERPARAM_SAMPLING_N):
    gbm = GradientBoostingRegressor(\
            max_depth = random.choice([2,3]), \
            learning_rate = random.uniform(0.01, 0.3), \
            n_estimators = random.randint(5, 100)
            )
    
    mean_iter_scores = rmse_positive(gbm)
    
    if best_gbm_RMSE > mean_iter_scores:
        best_gbm_RMSE = mean_iter_scores

# 3. Tabela final
Tabela final com cada classificador, os valores do RMSE com valore default para os hiperparametros, e o valor do RMSE com o melhor valor dos hiperparametros.

In [40]:
columns = ['Regressor', 'RMSE default', 'RMSE hiperparametros']
data = [
        ['Linear',            traditional_lr_rmse,    'N/A'], \
        ['Ridge',             default_ridge_lr_RMSE,   best_ridge_lr_RMSE], \
        ['Lasso',             default_lasso_lr_RMSE,   best_lasso_lr_RMSE], \
        ['SVR Linear',        default_svr_linear_RMSE, best_svr_linear_RMSE], \
        ['SVR RBF',           default_svr_rbf_RMSE,    best_svr_rbf_RMSE], \
        ['KNN',               default_knn_regr_RMSE,   best_knn_regr_RMSE], \
        ['MLP',               default_mlp_regr_RMSE,   best_mlp_regr_RMSE], \
        ['Arvore de decisao', default_dec_tree_RMSE,   best_dec_tree_RMSE], \
        ['Random Forest',     default_rand_for_RMSE,   best_rand_for_RMSE], \
        ['GBM',               default_gbm_RMSE,        best_gbm_RMSE], \
       ]

final_table = pd.DataFrame(data, columns = columns)

In [41]:
final_table

Unnamed: 0,Regressor,RMSE default,RMSE hiperparametros
0,Linear,1.458478,
1,Ridge,1.449174,
2,Lasso,1.98833,3.08591
3,SVR Linear,1.465918,1.946544
4,SVR RBF,1.183737,2.746445
5,KNN,1.275011,1.63229
6,MLP,1.280703,1.954181
7,Arvore de decisao,1.50088,1.407038
8,Random Forest,1.00011,0.940968
9,GBM,1.219926,1.095765
