In [1]:
# Directories
import os

new_directory = r'c://Users//Fer//TESIS_ARCHIVOS//TESIS_AIRE//MP_Forecasting//aqi_forecasting//notebooks'
os.chdir(new_directory)

# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation


# Training utils
from training_code.utils import utils_xgboost

# Optuna
import optuna
import joblib
import pickle

# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
datos = pd.read_csv('datos/230127_train_ESTACIONES.csv', parse_dates=['FECHAHORA'])
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates=['FECHAHORA'])

In [3]:
lista = [datos, validacion]

df = pd.concat(lista)

df.reset_index(inplace = True, drop = True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1580842 entries, 0 to 1580841
Data columns (total 28 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   ESTACION                1580842 non-null  int64         
 1   FECHAHORA               1580842 non-null  datetime64[ns]
 2   ANHO                    1580842 non-null  int64         
 3   DIA                     1580842 non-null  int64         
 4   MES                     1580842 non-null  int64         
 5   HORA                    1580842 non-null  int64         
 6   MINUTO                  1580842 non-null  int64         
 7   MP1                     1580842 non-null  float64       
 8   MP2_5                   1580842 non-null  float64       
 9   MP10                    1580842 non-null  float64       
 10  TEMPERATURA             1580842 non-null  float64       
 11  HUMEDAD                 1580842 non-null  float64       
 12  PRESION       

# 6 horas - Mejores parámetros

In [5]:
predicciones = {}
metricas = {}

estacion = 4

variables = ['MP1', 'MP2_5','AQI_MP10', 'AQI_MP2_5']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 2 
forecast_days = 1/4
samples_per_day = 288
step = 288/4

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 15)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



X_train, y_train, X_test, y_test = utils_xgboost.get_everything(df, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)


study = joblib.load('optuna_studies/XGBOOST/experimento_full/study_XGBOOST_mejoresparam40_resample_5_horizon_0.25_previous_2.pkl')

params = study.best_params

xgb_model = xgb.XGBRegressor(** params)

trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

prediction = trained_xgb_model.predict(X_test)

# guardamos los valores predecidos vs reales en un diccionario

predicciones[4] = {'real' : y_test, 'prediccion': prediction}

mean_real = y_test.mean()
mean_prediction = prediction.mean()

MAPE = mean_absolute_percentage_error(prediction, y_test) * 100
MAE = mean_absolute_error(prediction, y_test)
RMSE = mean_squared_error(prediction, y_test, squared = False)

# guardamos las metricas en un diccionario

metricas[4] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

df_metricas = pd.DataFrame.from_dict(metricas)

df_metricas.to_csv('metrics/XGBOOST/experimento_full/validation_xgboost_station4_5_0.25_2.csv')

list_dfs = []


d = {'TARGET': predicciones[4]['real'].flatten(), 'FORECAST': predicciones[4]['prediccion'].flatten()}
df_predicciones = pd.DataFrame(data = d)
df_predicciones['ESTACION'] = 4

df_predicciones.to_csv('datos/experimento_full/predicciones_validation_xgboost_station4_5_0.25_2.csv')

print('ESTACION 4:')
print('prediction shape: ', prediction.shape)
print('test shape: ', y_test.shape)
print('MAE :', MAE)
print('MAPE: ', MAPE)
print('RMSE: ', RMSE)
print('\n')
print('media real: ', mean_real)
print('media predecida: ', mean_prediction)
print('\n')


ESTACION 4:
prediction shape:  (360, 72)
test shape:  (360, 72)
MAE : 2.4966296529696312
MAPE:  8.621785901854393
RMSE:  3.7186393866868936


media real:  34.82507716049383
media predecida:  34.54925




# 12 horas - mejores parametros

In [6]:
predicciones = {}
metricas = {}

estacion = 4

variables = ['MP1', 'MP2_5','AQI_MP10', 'AQI_MP2_5']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 2 
forecast_days = 1/2
samples_per_day = 288
step = 288/2

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 15)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



X_train, y_train, X_test, y_test = utils_xgboost.get_everything(df, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)


study = joblib.load('optuna_studies/XGBOOST/experimento_full/study_XGBOOST_mejoresparam40_resample_5_horizon_0.5_previous_2.pkl')

params = study.best_params

xgb_model = xgb.XGBRegressor(** params)

trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

prediction = trained_xgb_model.predict(X_test)

# guardamos los valores predecidos vs reales en un diccionario

predicciones[4] = {'real' : y_test, 'prediccion': prediction}

mean_real = y_test.mean()
mean_prediction = prediction.mean()

MAPE = mean_absolute_percentage_error(prediction, y_test) * 100
MAE = mean_absolute_error(prediction, y_test)
RMSE = mean_squared_error(prediction, y_test, squared = False)

# guardamos las metricas en un diccionario

metricas[4] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

df_metricas = pd.DataFrame.from_dict(metricas)

df_metricas.to_csv('metrics/XGBOOST/experimento_full/validation_xgboost_station4_5_0.5_2.csv')

list_dfs = []


d = {'TARGET': predicciones[4]['real'].flatten(), 'FORECAST': predicciones[4]['prediccion'].flatten()}
df_predicciones = pd.DataFrame(data = d)
df_predicciones['ESTACION'] = 4

df_predicciones.to_csv('datos/experimento_full/predicciones_validation_xgboost_station4_5_0.5_2.csv')

print('ESTACION 4:')
print('prediction shape: ', prediction.shape)
print('test shape: ', y_test.shape)
print('MAE :', MAE)
print('MAPE: ', MAPE)
print('RMSE: ', RMSE)
print('\n')
print('media real: ', mean_real)
print('media predecida: ', mean_prediction)
print('\n')

ESTACION 4:
prediction shape:  (180, 144)
test shape:  (180, 144)
MAE : 4.3810871707436485
MAPE:  14.148590899303212
RMSE:  6.290828814369543


media real:  34.82507716049383
media predecida:  34.19886




# 24 horas - mejores parametros

In [7]:
df_train = df.copy()

df_train.drop(['ANHO', 'DIA', 'MES', 'HORA', 'MINUTO', 'DIA_TRAF_COD', 'TRAFICO_COD', 'TIPO_COD',
            'MEDICION_DIA', 'MP1_ANTERIOR', 'MP2_5_ANTERIOR', 'MP10_ANTERIOR',
            'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO'], axis = 1, inplace = True)

df_train = df_train.set_index('FECHAHORA', drop = True)

lista_resample = []

r_i = '30T'

for station in range(1,11):
    df_aux = df_train[df_train['ESTACION'] == station]

    df_aux = df_aux.resample(r_i).mean()
    lista_resample.append(df_aux)

df_train = pd.concat(lista_resample)

df_train['ANHO'] = df_train.index.year
df_train['MES'] = df_train.index.month
df_train['HORA'] = df_train.index.hour
df_train['MINUTO'] = df_train.index.minute
df_train['DIA'] = df_train.index.day
df_train['FECHAHORA'] = df_train.index

df_train.reset_index(inplace = True, drop = True)

predicciones = {}
metricas = {}

estacion = 4

variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 'AQI_MP10', 'AQI_MP2_5', 'TIPO', 'TRAFICO', 'HUMEDAD', 
                        'PRESION', 'TEMPERATURA','DIA_SEM']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 2 
forecast_days = 1
samples_per_day = 48
step = 48

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 15)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



X_train, y_train, X_test, y_test = utils_xgboost.get_everything(df_train, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)


study = joblib.load('optuna_studies/XGBOOST/experimento_full/study_XGBOOST_resample_30_horizon_1_previous_2.pkl')

params = study.best_params

xgb_model = xgb.XGBRegressor(** params)

trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

prediction = trained_xgb_model.predict(X_test)

# guardamos los valores predecidos vs reales en un diccionario

predicciones[4] = {'real' : y_test, 'prediccion': prediction}

mean_real = y_test.mean()
mean_prediction = prediction.mean()

MAPE = mean_absolute_percentage_error(prediction, y_test) * 100
MAE = mean_absolute_error(prediction, y_test)
RMSE = mean_squared_error(prediction, y_test, squared = False)

# guardamos las metricas en un diccionario

metricas[4] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

df_metricas = pd.DataFrame.from_dict(metricas)

df_metricas.to_csv('metrics/XGBOOST/experimento_full/validation_xgboost_station4_5_1_2.csv')

list_dfs = []


d = {'TARGET': predicciones[4]['real'].flatten(), 'FORECAST': predicciones[4]['prediccion'].flatten()}
df_predicciones = pd.DataFrame(data = d)
df_predicciones['ESTACION'] = 4

df_predicciones.to_csv('datos/experimento_full/predicciones_validation_xgboost_station4_5_1_2.csv')

print('ESTACION 4:')
print('prediction shape: ', prediction.shape)
print('test shape: ', y_test.shape)
print('MAE :', MAE)
print('MAPE: ', MAPE)
print('RMSE: ', RMSE)
print('\n')
print('media real: ', mean_real)
print('media predecida: ', mean_prediction)
print('\n')

ESTACION 4:
prediction shape:  (90, 48)
test shape:  (90, 48)
MAE : 7.645258430561718
MAPE:  26.59192908977706
RMSE:  10.002274223742468


media real:  34.826080246913584
media predecida:  33.688305


