In [1]:
# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Training utils
import utils_xgboost

# Optuna
import optuna
import pickle

# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


# VALIDACION

In [2]:
# Load the training and validation station datasets.
# FECHAHORA is parsed into datetimes in both files so time-based filtering works later.
ruta_entrenamiento = 'datos/230127_train_ESTACIONES.csv'
ruta_validacion = 'datos/230127_test_ESTACIONES.csv'
columnas_fecha = ['FECHAHORA']

datos = pd.read_csv(ruta_entrenamiento, parse_dates=columnas_fecha)
validacion = pd.read_csv(ruta_validacion, parse_dates=columnas_fecha)

In [13]:
# Keep only the validation rows that belong to station 1.
es_estacion_1 = validacion['ESTACION'] == 1
e1 = validacion.loc[es_estacion_1]

# Earliest timestamp available for station 1 (shown as the cell output).
e1['FECHAHORA'].min()

Timestamp('2021-03-30 13:05:00')

In [5]:
%%time

predicciones = {}
metricas = {}

for i in range(1, 11):
    
    estacion = i

    variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 
                 'TEMPERATURA', 'HUMEDAD', 'PRESION', 'TEMPERATURA_PRONOSTICO', 
                 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO', 'DIA_SEM', 'TRAFICO' , 'AQI_MP10', 'AQI_MP2_5']

    dependent = ['AQI_MP2_5']

    number_of_features = len(variables)

    training_days = 15 
    forecast_days = 1 
    samples_per_day = 288
    step = 288/4

    # Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
    train_months = relativedelta(months = 12)
    test_months = relativedelta(months = 2)

    input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
    output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
    train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



    X_train, y_train, X_test, y_test = utils_xgboost.get_everything(datos, 
                                                                    estacion,
                                                                    train_months, 
                                                                    variables, 
                                                                    dependent, 
                                                                    train_test_samples, 
                                                                    input_samples, 
                                                                    output_samples, 
                                                                    number_of_features,
                                                                    step)

    X_train_val, y_train_val, X_test_val, y_test_val = utils_xgboost.get_everything(validacion, 
                                                                    estacion,
                                                                    test_months, 
                                                                    variables, 
                                                                    dependent, 
                                                                    train_test_samples, 
                                                                    input_samples, 
                                                                    output_samples, 
                                                                    number_of_features,
                                                                    step)

    params = {'max_depth': 5,
        'learning_rate': 0.03501312736977673,
        'n_estimators': 78,
        'min_child_weight': 8,
        'gamma': 0.00590507825523195,
        'subsample': 0.577093202211148,
        'colsample_bytree': 0.7936061869990182} 

    xgb_model = xgb.XGBRegressor(** params)

    trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

    prediction = trained_xgb_model.predict(X_train_val)
    
    # guardamos los valores predecidos vs reales en un diccionario
    
    predicciones[i] = {'real' : y_train_val, 'prediccion': prediction}


    pickle.dump(trained_xgb_model, open('models/15_dias/xgboost_15dias_estacion_' + str(i) + '.pkl', 'wb'))

    mean_real = y_train_val.mean()
    mean_prediction = prediction.mean()

    MAPE = mean_absolute_percentage_error(prediction, y_train_val)
    MAE = mean_absolute_error(prediction, y_train_val)
    RMSE = mean_squared_error(prediction, y_train_val, squared = False)
    
    # guardamos las metricas en un diccionario
    
    metricas[i] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}
    
    print('ESTACION '+ str(i) + ':')
    print('prediction shape: ', prediction.shape)
    print('test shape: ', y_train_val.shape)
    print('MAE :', MAE)
    print('MAPE: ', MAPE)
    print('RMSE: ', RMSE)
    print('\n')
    print('media real: ', mean_real)
    print('media predecida: ', mean_prediction)
    print('\n')


ESTACION 1:
prediction shape:  (181, 288)
test shape:  (181, 288)
MAE : 9.07459094199284
MAPE:  0.24920526103978435
RMSE:  12.081161151954964


media real:  39.731545426642114
media predecida:  38.084217


ESTACION 2:
prediction shape:  (185, 288)
test shape:  (185, 288)
MAE : 8.004491046751227
MAPE:  0.24847663169883047
RMSE:  10.984763901645387


media real:  36.719744744744744
media predecida:  32.007977


ESTACION 3:
prediction shape:  (185, 288)
test shape:  (185, 288)
MAE : 8.542350107628303
MAPE:  0.2001734279809314
RMSE:  13.297459320963764


media real:  44.344162912912914
media predecida:  42.414368


ESTACION 4:
prediction shape:  (181, 288)
test shape:  (181, 288)
MAE : 7.5682478102714175
MAPE:  0.2712542333653583
RMSE:  10.437881993920819


media real:  30.157957335788826
media predecida:  28.695885


ESTACION 5:
prediction shape:  (185, 288)
test shape:  (185, 288)
MAE : 6.857626936833063
MAPE:  0.24144541954360724
RMSE:  11.121637309073655


media real:  29.7620120120120

In [7]:
# Print the mean of each per-station metric, in this order:
# MAPE, MAE, RMSE, mean of the real values, mean of the predicted values.
#
# The stations are taken from `metricas` itself rather than from a hard-coded
# list and divisor, so the averages stay correct if the set of stations changes.
estaciones = sorted(metricas)
assert estaciones, "metricas is empty: run the training/validation cell first"

for nombre_metrica in ('MAPE', 'MAE', 'RMSE', 'Media real', 'Media predecida'):
    total = sum(metricas[estacion][nombre_metrica] for estacion in estaciones)
    print(total / len(estaciones))

0.25622622663311423
7.730734901717618
11.310733923177288
33.42030836776003
31.311512756347657


# Promedios sin la estación 9

MAPE: 0.21795875942627668

MAE: 7.105604567160746

RMSE: 10.385880611837369

MEDIA REAL: 31.856335071259103

MEDIA PREDECIDA: 29.634017372131346


# Promedios con la estación 9


MAPE: 0.25622622663311423

MAE: 7.730734901717618

RMSE: 11.310733923177288

MEDIA REAL: 33.42030836776003

MEDIA PREDECIDA: 31.311512756347657




In [9]:
# Tabulate the per-station metrics: the outer keys of `metricas` (stations)
# become columns and the metric names become the row index, then save to CSV.
archivo_metricas = 'metricas_10estaciones_15dias.csv'

df_metricas = pd.DataFrame(metricas)
df_metricas.to_csv(archivo_metricas)

In [10]:
# Export every real/predicted value as a long-format table.
#
# Building the frame directly from `predicciones` puts a whole 2-D array in
# each cell; to_csv then writes the array's string repr, which NumPy elides
# with "..." for large arrays, so the saved file would not contain the values.
# Flattening to one row per (station, window, step) keeps every number.
tablas_por_estacion = []

for estacion, valores in predicciones.items():
    real = np.asarray(valores['real'])
    prediccion = np.asarray(valores['prediccion'])
    assert real.shape == prediccion.shape, (
        f"station {estacion}: real {real.shape} vs prediccion {prediccion.shape}"
    )

    n_ventanas, n_pasos = prediccion.shape
    tablas_por_estacion.append(pd.DataFrame({
        'ESTACION': estacion,
        # Row index of the forecast window within the station's arrays.
        'VENTANA': np.repeat(np.arange(n_ventanas), n_pasos),
        # Position of the sample inside its forecast window.
        'PASO': np.tile(np.arange(n_pasos), n_ventanas),
        'real': real.reshape(-1),
        'prediccion': prediccion.reshape(-1),
    }))

df_predicciones = pd.concat(tablas_por_estacion, ignore_index=True)

df_predicciones.to_csv('predicciones_10estaciones_15dias.csv', index=False)

In [37]:



%%time


# Hyperparameters from the interrupted (incomplete) Optuna study

# params = {'max_depth': 6, 
#           'learning_rate': 0.010049185067138871, 
#           'n_estimators': 294, 
#           'min_child_weight': 6, 
#           'gamma': 0.0012980577270314173, 
#           'subsample': 0.18828660906502742, 
#           'colsample_bytree': 0.9488588949410688}

# Example hyperparameters

# params = {'learning_rate' : 0.025,
#           'n_estimators' : 250,
#           'max_depth': 2,
#           'min_child_weight' : 1,
#           'gamma': 0.0,
#           'subsample': 0.98,
#           'colsample_bytree': 0.98,
#           'scale_pos_weight': 0.8,
#           'seed': 42,
#           'verbosity' : 0}


# Hyperparameters from the completed Optuna study



prediction (213, 288)
test (213, 288)
RMSE:  10.866205543857895
MAE:  7.2546269617465615
CPU times: total: 1h 9min 33s
Wall time: 6min 31s


In [21]:
# Inspect the predicted values stored for station 3 (shown as the cell output).
prediccion_estacion_3 = predicciones[3]['prediccion']
prediccion_estacion_3

array([[23.4666  , 23.559698, 23.425415, ..., 32.73254 , 33.927082,
        32.362026],
       [23.4666  , 23.559698, 23.425415, ..., 26.478579, 26.60877 ,
        27.700651],
       [30.943655, 30.909687, 30.864601, ..., 33.85464 , 34.198826,
        35.768955],
       ...,
       [51.581924, 51.57514 , 51.54586 , ..., 49.42132 , 53.542816,
        54.48776 ],
       [43.11948 , 42.895557, 42.75983 , ..., 40.667854, 42.47364 ,
        39.279293],
       [35.642155, 35.64154 , 35.42578 , ..., 44.24609 , 42.982586,
        42.39735 ]], dtype=float32)

# Valores con los parámetros del estudio de Optuna incompleto

* media real: 28.150202138758477
* media predecida: 27.714634


* RMSE:  10.886802850195714
* MAE:  7.321197804384202
* MAPE: 0.2760916690053607

# Valores con los parámetros del estudio de Optuna terminado

* media real: 28.150202138758477
* media predecida: 27.673008

* RMSE:  10.866205543857895
* MAE:  7.2546269617465615
* MAPE: 0.27459897048248555

# Valores para parametros del paper

* Media real: 28.150202138758477
* Media predecida: 29.366629

* RMSE:  10.724885901102555
* MAE:  7.297981035549254

* MAPE: 0.2604501639385927

# Arreglos para graficar

In [50]:
# Select the forecast windows of one station that will be plotted.
ESTACION_GRAFICO = 7   # station whose forecasts are plotted
SALTO_VENTANAS = 4     # keep every 4th window, starting at window index 4
# NOTE(review): presumably every 4th window is taken so consecutive rows do not
# overlap in time, and window 0 is skipped as in the original loop — confirm
# against how utils_xgboost builds the windows.

prediction = predicciones[ESTACION_GRAFICO]['prediccion']
y_real = predicciones[ESTACION_GRAFICO]['real']

# Slicing selects the same rows as appending prediction[i] for
# i in range(4, len(prediction), 4), and keeps a 2-D shape even when
# no row is selected.
pred_1 = np.asarray(prediction[SALTO_VENTANAS::SALTO_VENTANAS])
test_1 = np.asarray(y_real[SALTO_VENTANAS::SALTO_VENTANAS])

print(pred_1.shape)
print(test_1.shape)

(46, 288)
(46, 288)


In [51]:
# Concatenate the selected windows into one continuous series each, so that
# forecasts and targets can be drawn as two aligned lines.
y_pred = pred_1.reshape(-1)
y_test_1 = test_1.reshape(-1)

print(y_pred)
print(y_test_1)

fig_val = go.Figure()

fig_val.add_trace(go.Scatter(y=y_test_1, name='Target'))
fig_val.add_trace(go.Scatter(y=y_pred, name='Forecasts'))

# Axis titles are set so the figure can be read on its own; the plotted
# quantity is the forecast target column, AQI_MP2_5.
fig_val.update_layout(
    title_text="Validation - Forecasts vs Targets",
    xaxis_title="Sample index",
    yaxis_title="AQI_MP2_5",
)

[30.962622 31.09025  30.959364 ... 32.392662 33.269173 30.861792]
[33. 33. 33. ... 12. 12. 12.]
