Trabalho 2

Thiago Bruschi Martins RA: 120212

Objetivo do trabalho: aplicar os vários regressores visto em aula. Busca de hiperparametros.

In [None]:
import io
import random
import requests
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.svm import SVR
from sklearn.svm import LinearSVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from scipy.stats import loguniform, uniform
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV


In [None]:
import warnings
warnings.filterwarnings("ignore")

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00514/Bias_correction_ucl.csv"
s = requests.get(url).content
df = pd.read_csv(io.StringIO(s.decode('utf-8')))


# Pré-processamento

In [None]:
random.seed(41)

pre_df = df.dropna()
X = pre_df.drop(['Next_Tmin','Date','Next_Tmax'], axis=1)
y = pre_df['Next_Tmax']

scaler = StandardScaler()
X = scaler.fit_transform(X)

print('X:', X.shape)
print('y:', y.shape)

X: (7588, 22)
y: (7588,)


# Funções

In [None]:
# Testa 10 valores de parâmetros aleatórios no estimator recebido
def random_search(estimator, param_distributions):
  rscv = RandomizedSearchCV(estimator, param_distributions=param_distributions, scoring='neg_root_mean_squared_error', n_iter=10, n_jobs=-1)
  r = rscv.fit(X,y)
  return -r.best_score_, r.best_params_ 

# Realiza o cross_validation com os valores padrões do estimator recebido
def cross_validation(estimator):
  r = cross_val_score(estimator, X, y, cv=5, scoring='neg_root_mean_squared_error')
  return np.min(-r)

def list_dict(d):
    for x in d:
        print (f'\t{x}: {round(d[x],5)}')     

def print_dict(d):
    for x in d:
        print (x)
        for y in d[x]:
            print ('\t',y,':', round(d[x][y], 4))

# Testa um modelo com os parâmetros recebidos e com os valores default
def model_test(estimator, params, name):
  best_score, best_params = random_search(estimator, params)
  default_score =  cross_validation(estimator)

  results[name] = {}
  results[name]['best_score'] = best_score
  results[name]['default_score'] = default_score
  return best_params

def model_report(name, best_params):
  print(name)
  list_dict(best_params)
  list_dict(results[name])

results = {}

# Modelos de Regressão

## Modelos Lineares

In [None]:
results['LR'] = {}
validation = cross_val_score(LinearRegression(), X, y, scoring='neg_root_mean_squared_error')
results['LR']['default_score'] = np.round(np.min(-validation),3) # melhor resultado
results['LR']['best_score'] = results['LR']['default_score']

print('Linear Regression:', results['LR']['default_score'])

Linear Regression: 1.454


### L1

In [None]:
lasso_param = {
    'alpha':loguniform(10e-3, 10e3)
}

best_params = model_test(Lasso(), lasso_param, 'L1')
model_report('L1', best_params)

L1
	alpha: 0.09272
	best_score: 1.58284
	default_score: 1.73604


### L2

In [None]:

ridge_param = {
    'alpha':loguniform(10e-3, 10e3)
}

best_params = model_test(Ridge(), ridge_param, 'L2')
model_report('L2', best_params)

L2
	alpha: 7.32624
	best_score: 1.57706
	default_score: 1.45418


### SVM Linear

In [None]:
svm_param = {
    'epsilon':[0.1, 0.3],
    'C': loguniform(2e-15,2e15)
    }

best_params = model_test(LinearSVR(), svm_param, 'SVM_Linear')
model_report('SVM_Linear', best_params)

SVM_Linear
	C: 351491174671.98975
	epsilon: 0.3
	best_score: 1.99365
	default_score: 1.42496


## SVM com kernel RBF

In [None]:
svm_param = {
    'epsilon':[0.1, 0.3],
    'C':loguniform(2e-15,2e15),
    'gamma':loguniform(2e-9,2e3)
    }
 
best_params = model_test(SVR('rbf', max_iter=1000), svm_param, 'SVM_RBF')
model_report('SVM_RBF', best_params)

SVM_RBF
	C: 25.78587
	epsilon: 0.3
	gamma: 7e-05
	best_score: 1.8865
	default_score: 1.44551


## KNN

In [None]:
knn_params = {
    'n_neighbors':np.random.randint(1, 1000, 10)
    }

best_params = model_test(KNeighborsRegressor(), knn_params, 'KNN')
model_report('KNN', best_params)

KNN
	n_neighbors: 213
	best_score: 1.908
	default_score: 1.63376


## MLP

In [None]:
MLP_params = {
    'hidden_layer_sizes':(5,8,11,14,17,20,23)
    }

best_params = model_test(MLPRegressor(), MLP_params, 'MLP')
model_report('MLP', best_params)

MLP
	hidden_layer_sizes: 20
	best_score: 2.33829
	default_score: 1.89351


## Decision Tree

In [None]:
dtree_params = {
    'ccp_alpha':[random.uniform(0.0, 0.4) for i in range(10)]
    }

best_params = model_test(DecisionTreeRegressor(), dtree_params, 'D-Tree')
model_report('D-Tree', best_params)

D-Tree
	ccp_alpha: 0.06641
	best_score: 1.83701
	default_score: 2.16191


## Random Forest

In [None]:
rf_params = {
    'n_estimators':[10, 100, 1000],
    'max_features':[5, 10, 22]
    }

best_params = model_test(RandomForestRegressor(), rf_params, 'RF')
model_report('RF', best_params)

RF
	n_estimators: 1000
	max_features: 10
	best_score: 1.62588
	default_score: 1.42496


## GBM

In [None]:
gbm_params = {
    'n_estimators': np.random.randint(5, 100, 10),
    'max_features':[random.uniform(0.01, 0.3) for i in range(10)],
    'max_depth':[2, 3]
    }

best_params = model_test(GradientBoostingRegressor(), gbm_params, 'GBM')
model_report('GBM', best_params)

GBM
	n_estimators: 57
	max_features: 0.19002
	max_depth: 2
	best_score: 1.70968
	default_score: 1.43851


# Final Report

In [None]:
# Ordena e exibe os modelos pelo melhor resultado alcançado de cada um
final_report = pd.DataFrame.from_dict(results, orient='index')
final_report.columns=['best_params','default_params']
final_report['best_score'] = final_report.min(axis=1)
print(final_report.sort_values(by='best_score'))

            best_params  default_params  best_score
SVM_Linear     1.993645        1.424957    1.424957
RF             1.625883        1.424964    1.424964
GBM            1.709675        1.438511    1.438511
SVM_RBF        1.886496        1.445507    1.445507
LR             1.454000        1.454000    1.454000
L2             1.577059        1.454183    1.454183
L1             1.582843        1.736037    1.582843
KNN            1.908000        1.633757    1.633757
D-Tree         1.837005        2.161914    1.837005
MLP            2.338287        1.893508    1.893508
