In [1]:
import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

cm = plt.cm.RdBu
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import GridSearchCV


pd.set_option('display.max_rows', None) # para mostrar todas las filas
pd.set_option('display.max_columns', None) # para mostrar todas las columnas

In [2]:
# Importamos los datos
X_train = pd.read_csv("./DatosPisaSinNan/X_train.csv", sep=',', decimal='.')
y_train = pd.read_csv("./DatosPisaSinNan/y_train.csv", sep=',', decimal='.')
X_val = pd.read_csv("./DatosPisaSinNan/X_val.csv", sep=',', decimal='.')
y_val = pd.read_csv("./DatosPisaSinNan/y_val.csv", sep=',', decimal='.')
X_test = pd.read_csv("./DatosPisaSinNan/X_test.csv", sep=',', decimal='.')
y_test = pd.read_csv("./DatosPisaSinNan/y_test.csv", sep=',', decimal='.')

Pasamos los dataframes a numpy array

In [3]:
feature_names = X_train.columns[:]

X_train = X_train.values
y_train = y_train.values



### Estandarización de datos

Escalamos con los datos de train

In [4]:
from sklearn import preprocessing

scaler = preprocessing.StandardScaler().fit(X_train)
#scaler = preprocessing.Normalizer().fit(X_train)
XtrainScaled = scaler.transform(X_train)
XvalScaled = scaler.transform(X_val)
XtestScaled = scaler.transform(X_test)

### Random Forest

In [5]:
# Otro método que da mejores resultados: Random Forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import median_absolute_error

rf = RandomForestRegressor(oob_score = True)
rf.fit(XtrainScaled, y_train)

y_predict = rf.predict(XtestScaled)
rf_testing_set_score = rf.score(XtestScaled, y_test)
rf_median_abs_error = median_absolute_error(y_test, y_predict)
rf_mean_abs_error = mean_absolute_error(y_test, y_predict)
print('R^2 en datos de test: ' + str(round(rf_testing_set_score,3)))
print('Mediana del error en datos de test: ' + str(round(rf_median_abs_error,3)))
print('Media del error en datos de test: ' + str(round(rf_mean_abs_error,3)))

R^2 en datos de test: 0.561
Mediana del error en datos de test: 35.005
Media del error en datos de test: 41.31


In [9]:
# Estos parámetros producian overfitting, el MAE de train baja a 15 mientras que el de test se queda en 41
# tuned_parameters = {
#     "n_estimators": [100, 500, 1000, 2000]
# }
# rf_tuned = GridSearchCV(RandomForestRegressor(), cv = 3, param_grid = tuned_parameters)

param_grid = {"max_depth": range(2,6), "min_samples_split": range(4,8,2), "n_estimators": range(100, 500, 200)}
# Mejores parámetros: {'max_depth': 5, 'min_samples_split': 6, 'n_estimators': 100}

rf_tuned = GridSearchCV(RandomForestRegressor(), cv = 5, param_grid = param_grid)

preds = rf_tuned.fit(XtrainScaled, y_train)
best = rf_tuned.best_estimator_ 
y_predict = rf_tuned.predict(XtestScaled)
rft_testing_set_score = rf_tuned.score(XtestScaled, y_test)
rft_median_abs_error = median_absolute_error(y_test, y_predict)
rf_mean_abs_error = mean_absolute_error(y_test, y_predict)

print('Mejores parámetros:', rf_tuned.best_params_)
print('R^2 en datos de test: ' + str(round(rf_testing_set_score,3)))
print('Mediana del error en datos de test: ' + str(round(rf_median_abs_error,3)))
print('Media del error en datos de test: ' + str(round(rf_mean_abs_error,3)))

# tarda 2 minutos en entrenar

Mejores parámetros: {'max_depth': 5, 'min_samples_split': 6, 'n_estimators': 100}
R^2 en datos de test: 0.561
Mediana del error en datos de test: 35.005
Media del error en datos de test: 43.296


In [11]:
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error


ytrainrf_tuned = rf_tuned.predict(XtrainScaled)
ytestrf_tuned  = rf_tuned.predict(XtestScaled)
mseTrainModelrf_tuned = mean_squared_error(y_train,ytrainrf_tuned)
mseTestModelrf_tuned = mean_squared_error(y_test,ytestrf_tuned)
print('MSE Modelo RandomForestRegressor StandardScaler (train): %0.5g' % mseTrainModelrf_tuned)
print('MSE Modelo RandomForestRegressor StandardScaler (test) : %0.5g' % mseTestModelrf_tuned)

print()

maeTrainModelrf_tuned = mean_absolute_error(y_train,ytrainrf_tuned)
maeTestModelrf_tuned  = mean_absolute_error(y_test,ytestrf_tuned)

print('MAE Modelo RandomForestRegressor StandardScaler (train): %0.5g' % maeTrainModelrf_tuned)
print('MAE Modelo RandomForestRegressor StandardScaler (test) : %0.5g' % maeTestModelrf_tuned)

MSE Modelo RandomForestRegressor (train): 2781.8
MSE Modelo RandomForestRegressor (test) : 2946

MAE Modelo RandomForestRegressor (train): 41.826
MAE Modelo RandomForestRegressor (test) : 43.296


In [10]:
# Salvamos el modelo en formato joblib

from joblib import dump, load

dump(rf_tuned, 'modeloRandomForest_st.joblib') 

clf = load('modeloRandomForest_st.joblib') 


### Conclusión

Controlando los parámetros 'max_depth' y 'min_samples_split' conseguimos controlar el overfitting y por medio del parámetro 'n_estimators' mejoramos el ajuste del algoritmo.

Los resultados obtenidos son:
- Un MAE relativamente bueno de 43.296 
- Un control del overfitting mediante los parámetros encontrados.