In [1]:
# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Training utils
import utils_xgboost

# Optuna
import optuna
import pickle

# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


# VALIDACION

In [2]:
# Load the training and validation datasets. Both files share the FECHAHORA
# column, which is parsed into datetimes at read time.
columnas_fecha = ['FECHAHORA']
datos = pd.read_csv('datos/230127_train_ESTACIONES.csv', parse_dates = columnas_fecha)
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates = columnas_fecha)

In [13]:
# Rows of the validation set that belong to station 1.
e1 = validacion.loc[validacion['ESTACION'] == 1]

# Earliest timestamp available for that station (displayed as the cell output).
e1['FECHAHORA'].min()

Timestamp('2021-03-30 13:05:00')

In [14]:
%%time

predicciones = {}
metricas = {}

for i in range(1, 11):
    
    estacion = i

    variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 
                 'TEMPERATURA', 'HUMEDAD', 'PRESION', 'TEMPERATURA_PRONOSTICO', 
                 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO', 'DIA_SEM', 'TRAFICO' , 'AQI_MP10', 'AQI_MP2_5']

    dependent = ['AQI_MP2_5']

    number_of_features = len(variables)

    training_days = 7 
    forecast_days = 1 
    samples_per_day = 288
    step = 288/4

    # Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
    train_months = relativedelta(months = 12)
    test_months = relativedelta(months = 2, days = 20)

    input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
    output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
    train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



    X_train, y_train, X_test, y_test = utils_xgboost.get_everything(datos, 
                                                                    estacion,
                                                                    train_months, 
                                                                    variables, 
                                                                    dependent, 
                                                                    train_test_samples, 
                                                                    input_samples, 
                                                                    output_samples, 
                                                                    number_of_features,
                                                                    step)

    X_train_val, y_train_val, X_test_val, y_test_val = utils_xgboost.get_everything(validacion, 
                                                                    estacion,
                                                                    test_months, 
                                                                    variables, 
                                                                    dependent, 
                                                                    train_test_samples, 
                                                                    input_samples, 
                                                                    output_samples, 
                                                                    number_of_features,
                                                                    step)

    params = {'max_depth': 6, 
    'learning_rate': 0.016170340622682584, 
    'n_estimators': 282, 
    'min_child_weight': 10, 
    'gamma': 0.006843610407559761, 
    'subsample': 0.3168966517747982, 
    'colsample_bytree': 0.6780093701705895}

    xgb_model = xgb.XGBRegressor(** params)

    trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

    prediction = trained_xgb_model.predict(X_train_val)
    
    # guardamos los valores predecidos vs reales en un diccionario
    
    predicciones[i] = {'real' : y_train_val, 'prediccion': prediction}


    pickle.dump(trained_xgb_model, open('models/7_dias/xgboost_7dias_estacion_' + str(i) + '.pkl', 'wb'))

    mean_real = y_train_val.mean()
    mean_prediction = prediction.mean()

    MAPE = mean_absolute_percentage_error(prediction, y_train_val)
    MAE = mean_absolute_error(prediction, y_train_val)
    RMSE = mean_squared_error(prediction, y_train_val, squared = False)
    
    # guardamos las metricas en un diccionario
    
    metricas[i] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}
    
    print('ESTACION '+ str(i) + ':')
    print('prediction shape: ', prediction.shape)
    print('test shape: ', y_train_val.shape)
    print('MAE :', MAE)
    print('MAPE: ', MAPE)
    print('RMSE: ', RMSE)
    print('\n')
    print('media real: ', mean_real)
    print('media predecida: ', mean_prediction)
    print('\n')


ESTACION 1:
prediction shape:  (293, 288)
test shape:  (293, 288)
MAE : 9.191548609562927
MAPE:  0.2577943573135997
RMSE:  12.146746140994281


media real:  38.92774696624953
media predecida:  38.540104


ESTACION 2:
prediction shape:  (297, 288)
test shape:  (297, 288)
MAE : 7.519554674920856
MAPE:  0.23970350284091996
RMSE:  10.651840327688603


media real:  34.75077160493827
media predecida:  30.803968


ESTACION 3:
prediction shape:  (297, 288)
test shape:  (297, 288)
MAE : 9.295321877367078
MAPE:  0.20249658600082074
RMSE:  13.689124075344148


media real:  47.019582398054624
media predecida:  44.758045


ESTACION 4:
prediction shape:  (293, 288)
test shape:  (293, 288)
MAE : 9.033276160122954
MAPE:  0.2878752987647348
RMSE:  12.733938673310673


media real:  35.244738339021616
media predecida:  32.272278


ESTACION 5:
prediction shape:  (297, 288)
test shape:  (297, 288)
MAE : 8.008050429511062
MAPE:  0.2573353400343936
RMSE:  12.29787288024107


media real:  32.78381032547699
me

In [21]:
# Print the average of every per-station metric, one value per line, in the
# order MAPE, MAE, RMSE, real mean, predicted mean.
# The divisor is derived from the station list so that excluding a station
# only requires editing `estaciones`.
estaciones = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

for nombre_metrica in ['MAPE', 'MAE', 'RMSE', 'Media real', 'Media predecida']:
    total = 0

    for i in estaciones:
        total = total + metricas[i][nombre_metrica]

    print(total/len(estaciones))

0.27181903072927965
9.012471100605062
13.795469505352779
36.52938434241288
33.229662132263186


# Promedios sin la estación 9

MAPE: 0.2342610437801902

MAE: 8.344890506029175

RMSE: 12.87075390577948

MEDIA REAL: 34.661186579803854

MEDIA PREDECIDA: 31.304756736755373

# Promedios con la estación 9 (las 10 estaciones)


MAPE: 0.27181903072927965

MAE: 9.012471100605062

RMSE: 13.795469505352779

MEDIA REAL: 36.52938434241288

MEDIA PREDECIDA: 33.229662132263186


In [25]:
# Tabulate the metrics dictionary (outer keys become columns, inner keys
# become the row index) and persist it as CSV.
df_metricas = pd.DataFrame(metricas)

df_metricas.to_csv('metricas_10estaciones_7dias.csv')

In [27]:
# Tabulate the predictions dictionary (outer keys become columns, inner keys
# 'real' / 'prediccion' become the row index) and persist it as CSV.
# NOTE(review): each cell holds a whole 2-D array; writing such cells to CSV
# presumably stores numpy's abbreviated text form ("...") for large arrays, so
# the full values may not be recoverable from this file -- confirm, and
# consider np.savez or a long-format table if the file must be read back.
df_predicciones = pd.DataFrame(predicciones)

df_predicciones.to_csv('predicciones_10estaciones_7dias.csv')

In [37]:



%%time


# Hyperparameters from the interrupted (incomplete) Optuna run

# params = {'max_depth': 6, 
#           'learning_rate': 0.010049185067138871, 
#           'n_estimators': 294, 
#           'min_child_weight': 6, 
#           'gamma': 0.0012980577270314173, 
#           'subsample': 0.18828660906502742, 
#           'colsample_bytree': 0.9488588949410688}

# Example hyperparameters

# params = {'learning_rate' : 0.025,
#           'n_estimators' : 250,
#           'max_depth': 2,
#           'min_child_weight' : 1,
#           'gamma': 0.0,
#           'subsample': 0.98,
#           'colsample_bytree': 0.98,
#           'scale_pos_weight': 0.8,
#           'seed': 42,
#           'verbosity' : 0}


# Hyperparameters from the completed Optuna run



prediction (213, 288)
test (213, 288)
RMSE:  10.866205543857895
MAE:  7.2546269617465615
CPU times: total: 1h 9min 33s
Wall time: 6min 31s


28.150202138758477
27.673008


0.27459897048248555

# Valores para parametros de optuna incompleto

* media real: 28.150202138758477
* media predecida: 27.714634


* RMSE:  10.886802850195714
* MAE:  7.321197804384202
* MAPE: 0.2760916690053607

# Valores para parametros de optuna terminado

* Media real: 28.150202138758477
* Media predecida: 27.673008

* RMSE:  10.866205543857895
* MAE:  7.2546269617465615
* MAPE: 0.27459897048248555

# Valores para parametros del paper

* Media real: 28.150202138758477
* Media predecida: 29.366629

* RMSE:  10.724885901102555
* MAE:  7.297981035549254

* MAPE: 0.2604501639385927

# Arreglos para graficar

In [41]:
# Forecast and target windows of station 4, as stored by the training loop.
valores_estacion = predicciones[4]
prediction = valores_estacion['prediccion']
y_real = valores_estacion['real']

# Keep every 4th window, starting at index 3.
# NOTE(review): presumably chosen because windows are generated every quarter
# of a day (step = 288/4), so every 4th window covers a distinct day --
# confirm against utils_xgboost.
pred_1 = np.asarray([prediction[k] for k in range(3, len(prediction), 4)])
test_1 = np.asarray([y_real[k] for k in range(3, len(y_real), 4)])

print(pred_1.shape)
print(test_1.shape)

(73, 288)
(73, 288)


In [31]:
# Flatten the selected (window, sample) matrices into single 1-D series so
# they can be drawn as continuous lines.
y_pred = pred_1.reshape(pred_1.shape[0] * pred_1.shape[1])
y_test_1 = test_1.reshape(test_1.shape[0] * test_1.shape[1])

print(y_pred)
print(y_test_1)

# Overlay the real values and the forecasts on one interactive figure.
fig_val = go.Figure()

for serie, etiqueta in ((y_test_1, 'Target'), (y_pred, 'Forecasts')):
    fig_val.add_trace(go.Scatter( y = list(serie), name = etiqueta))

# Last expression of the cell: returns the figure so the notebook renders it.
fig_val.update_layout( title_text = "Validation - Forecasts vs Targets")

[ 6.3325353  6.6212983  6.451699  ... 49.09152   48.08271   52.00315  ]
[ 4.  4.  4. ... 61. 61. 61.]
