In [1]:
# Directories
import os

new_directory = r'c://Users//Fer//TESIS_ARCHIVOS//TESIS_AIRE//MP_Forecasting//aqi_forecasting//notebooks'
os.chdir(new_directory)

# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Training utils
from training_code.utils import utils_xgboost

# Optuna
import optuna
import pickle

# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
datos = pd.read_csv('datos/230325_train_resampled_1h.csv', parse_dates= ['FECHAHORA'])
validacion = pd.read_csv('datos/230325_test_resampled_1h.csv', parse_dates=['FECHAHORA'])
validacion.head()

Unnamed: 0,FECHAHORA,ESTACION,MP1,MP2_5,MP10,TEMPERATURA,HUMEDAD,PRESION,TRAFICO,TIPO,DIA_SEM,AQI_MP2_5,AQI_MP10,HORA,ANHO,MES,MINUTO,FECHAHORA.1,DIA
0,2021-03-30 13:00:00,1.0,0.892273,1.095545,2.162,28.190909,50.354545,1000.818182,1.0,1.0,1.0,53.0,14.0,13,2021,3,0,2021-03-30 13:00:00,30
1,2021-03-30 14:00:00,1.0,0.9125,1.162333,1.69275,28.1,50.083333,1000.225,1.0,1.0,1.0,53.0,14.0,14,2021,3,0,2021-03-30 14:00:00,30
2,2021-03-30 15:00:00,1.0,0.7595,0.94575,1.310917,27.808333,51.083333,999.841667,1.0,1.0,1.0,53.0,14.0,15,2021,3,0,2021-03-30 15:00:00,30
3,2021-03-30 16:00:00,1.0,1.027667,1.301083,2.559167,26.766667,54.433333,999.658333,2.0,2.0,1.0,53.0,13.5,16,2021,3,0,2021-03-30 16:00:00,30
4,2021-03-30 17:00:00,1.0,2.579917,3.206083,5.1625,24.808333,65.391667,999.641667,2.0,2.0,1.0,53.0,13.0,17,2021,3,0,2021-03-30 17:00:00,30


In [16]:
%%time

predicciones = {}
metricas = {}

for i in range(1, 11):
    
    estacion = i

    variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 
                 'TEMPERATURA', 'HUMEDAD', 'PRESION', 'DIA_SEM', 'TRAFICO' , 'AQI_MP10', 'AQI_MP2_5']

    dependent = ['AQI_MP2_5']

    number_of_features = len(variables)

    training_days = 7 
    forecast_days = 1/4
    samples_per_day = 24
    step = 24/4

    # Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
    train_months = relativedelta(months = 12)

    input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
    output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
    train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



    X_train, y_train, X_test, y_test = utils_xgboost.get_everything(datos, 
                                                                    estacion,
                                                                    train_months, 
                                                                    variables, 
                                                                    dependent, 
                                                                    train_test_samples, 
                                                                    input_samples, 
                                                                    output_samples, 
                                                                    number_of_features,
                                                                    step)

    X_train_val, y_train_val = utils_xgboost.get_validation(validacion, 
                                                                    estacion,
                                                                    variables, 
                                                                    dependent, 
                                                                    train_test_samples, 
                                                                    input_samples, 
                                                                    output_samples, 
                                                                    number_of_features,
                                                                    step)

    params={'max_depth': 1, 
    'learning_rate': 0.14282255288480264, 
    'n_estimators': 119, 
    'min_child_weight': 10, 
    'gamma': 4.6765240256450094e-05, 
    'subsample': 0.7918596491329176, 
    'colsample_bytree': 0.995987995143913}

    xgb_model = xgb.XGBRegressor(** params)

    trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

    prediction = trained_xgb_model.predict(X_train_val)
    
    # guardamos los valores predecidos vs reales en un diccionario
    
    predicciones[i] = {'real' : y_train_val, 'prediccion': prediction}


    pickle.dump(trained_xgb_model, open('models/models_xgboost/resample_1h/xgboost_NUEVOSPARAM_6hs_estacion_' + str(i) + '.pkl', 'wb'))

    mean_real = y_train_val.mean()
    mean_prediction = prediction.mean()

    MAPE = mean_absolute_percentage_error(prediction, y_train_val)
    MAE = mean_absolute_error(prediction, y_train_val)
    RMSE = mean_squared_error(prediction, y_train_val, squared = False)
    
    # guardamos las metricas en un diccionario
    
    metricas[i] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}
    
    print('ESTACION '+ str(i) + ':')
    print('prediction shape: ', prediction.shape)
    print('test shape: ', y_train_val.shape)
    print('MAE :', MAE)
    print('MAPE: ', MAPE)
    print('RMSE: ', RMSE)
    print('\n')
    print('media real: ', mean_real)
    print('media predecida: ', mean_prediction)
    print('\n')

ESTACION 1:
prediction shape:  (340, 6)
test shape:  (340, 6)
MAE : 3.976003367682687
MAPE:  0.12177724558047398
RMSE:  6.023007830616088


media real:  37.458047385620915
media predecida:  38.15673


ESTACION 2:
prediction shape:  (340, 6)
test shape:  (340, 6)
MAE : 2.5330345615062844
MAPE:  0.08253347238549345
RMSE:  3.7357649939645423


media real:  34.08039215686274
media predecida:  33.870647


ESTACION 3:
prediction shape:  (340, 6)
test shape:  (340, 6)
MAE : 3.150089027523215
MAPE:  0.07234146633590131
RMSE:  4.891610241099946


media real:  46.45196078431373
media predecida:  46.485218


ESTACION 4:
prediction shape:  (340, 6)
test shape:  (340, 6)
MAE : 3.1561335071239593
MAPE:  0.10395902252317601
RMSE:  4.500030167879873


media real:  35.51629901960784
media predecida:  34.81025


ESTACION 5:
prediction shape:  (340, 6)
test shape:  (340, 6)
MAE : 2.768331974472096
MAPE:  0.09016732142947519
RMSE:  4.743890277261255


media real:  32.33631535947711
media predecida:  31.97

In [17]:
total = 0

for i in [1,2,3,4,5,6,7,8,9,10]:
    total = total + metricas[i]['MAPE']

print(total/9)

total = 0

for i in [1,2,3,4,5,6,7,8,9,10]:
    total = total + metricas[i]['MAE']

print(total/9)
total = 0
    
for i in [1,2,3,4,5,6,7,8,9,10]:
    total = total + metricas[i]['RMSE']

print(total/9)

total = 0
    
for i in [1,2,3,4,5,6,7,8,9,10]:
    total = total + metricas[i]['Media real']

print(total/9)

total = 0
    
for i in [1,2,3,4,5,6,7,8,9,10]:
    total = total + metricas[i]['Media predecida']

print(total/10)

0.11558865252791531
3.8043006512507542
6.611742746595949
40.52059731299928
35.93151721954346


In [18]:
df_metricas = pd.DataFrame.from_dict(metricas)

df_metricas.to_csv('metrics/XGBOOST/metricas_10estaciones_nuevosparam_RESAMPLED_6hs.csv')

In [19]:
list_dfs = []

for i in range(1,11):
    d = {'TARGET': predicciones[i]['real'].flatten(), 'FORECAST': predicciones[i]['prediccion'].flatten()}
    df_aux = pd.DataFrame(data = d)
    df_aux['ESTACION'] = i
    list_dfs.append(df_aux)

df_predicciones = pd.concat(list_dfs)

df_predicciones.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20400 entries, 0 to 2039
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   TARGET    20400 non-null  float64
 1   FORECAST  20400 non-null  float32
 2   ESTACION  20400 non-null  int64  
dtypes: float32(1), float64(1), int64(1)
memory usage: 557.8 KB


In [20]:
df_predicciones.to_csv('datos/predicciones_10estaciones_7dias_nuevosparam_resampled_6hs.csv')

In [21]:
for i in range(1,11):

    fig_val = go.Figure()

    df_grafica = df_predicciones[df_predicciones['ESTACION'] == i]

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.TARGET), name = 'Target'))

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST), name = 'Forecasts'))

    fig_val.update_layout( title_text = "Validation - Forecasts vs Targets")

    fig_val.write_html('graphs/XGBOOST/RESAMPLED/XGBOOST_nuevosparam_6hs_estacion_'+str(i)+'_targets_vs_forecasts.html')