In [1]:
# Directories
import os

new_directory = r'c://Users//Fer//TESIS_ARCHIVOS//TESIS_AIRE//MP_Forecasting//aqi_forecasting//notebooks'
os.chdir(new_directory)

# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation


# Training utils
from training_code.utils import utils_svr

# Optuna
import optuna
import joblib
import pickle

# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
datos = pd.read_csv('datos/230127_train_ESTACIONES.csv', parse_dates=['FECHAHORA'])
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates=['FECHAHORA'])

In [3]:
lista = [datos, validacion]

df = pd.concat(lista)

df.reset_index(inplace = True, drop = True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1580842 entries, 0 to 1580841
Data columns (total 28 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   ESTACION                1580842 non-null  int64         
 1   FECHAHORA               1580842 non-null  datetime64[ns]
 2   ANHO                    1580842 non-null  int64         
 3   DIA                     1580842 non-null  int64         
 4   MES                     1580842 non-null  int64         
 5   HORA                    1580842 non-null  int64         
 6   MINUTO                  1580842 non-null  int64         
 7   MP1                     1580842 non-null  float64       
 8   MP2_5                   1580842 non-null  float64       
 9   MP10                    1580842 non-null  float64       
 10  TEMPERATURA             1580842 non-null  float64       
 11  HUMEDAD                 1580842 non-null  float64       
 12  PRESION       

# 6 horas - Mejores parámetros

In [8]:
predicciones = {}
metricas = {}

estacion = 4

variables = ['MP1', 'MP2_5','AQI_MP10', 'AQI_MP2_5']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 2 
forecast_days = 1/4
samples_per_day = 288
step = 288/4

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 15)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



X_train, y_train, X_test, y_test = utils_svr.get_everything(df, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)


# study = joblib.load('optuna_studies/SVR/experimento_full/study_SVR_resample_5_horizon_025_previous_2.pkl')

params = {'kernel': 'rbf',
          'gamma': 1.4367019475033274e-06, 
          'C': 98.81176561561226, 
          'epsilon' : 1}

svr_model = SVR(** params)

trained_svr_model = MultiOutputRegressor(svr_model).fit(X_train , y_train)

prediction = trained_svr_model.predict(X_test)

# guardamos los valores predecidos vs reales en un diccionario

predicciones[4] = {'real' : y_test, 'prediccion': prediction}

mean_real = y_test.mean()
mean_prediction = prediction.mean()

MAPE = mean_absolute_percentage_error(prediction, y_test) * 100
MAE = mean_absolute_error(prediction, y_test)
RMSE = mean_squared_error(prediction, y_test, squared = False)

# guardamos las metricas en un diccionario

metricas[4] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

df_metricas = pd.DataFrame.from_dict(metricas)

df_metricas.to_csv('metrics/SVR/experimento_full/validation_svr_station4_5_0.25_2.csv')

list_dfs = []


d = {'TARGET': predicciones[4]['real'].flatten(), 'FORECAST': predicciones[4]['prediccion'].flatten()}
df_predicciones = pd.DataFrame(data = d)
df_predicciones['ESTACION'] = 4

df_predicciones.to_csv('datos/experimento_full/predicciones_validation_svr_station4_5_0.25_2.csv')

print('ESTACION 4:')
print('prediction shape: ', prediction.shape)
print('test shape: ', y_test.shape)
print('MAE :', MAE)
print('MAPE: ', MAPE)
print('RMSE: ', RMSE)
print('\n')
print('media real: ', mean_real)
print('media predecida: ', mean_prediction)
print('\n')


ESTACION 4:
prediction shape:  (360, 72)
test shape:  (360, 72)
MAE : 3.16904402531528
MAPE:  10.644147853373832
RMSE:  5.346249221431659


media real:  34.82507716049383
media predecida:  33.6095889632412




# 12 horas - mejores parametros

In [9]:
predicciones = {}
metricas = {}

estacion = 4

variables = ['MP1', 'MP2_5','AQI_MP10', 'AQI_MP2_5']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 2 
forecast_days = 1/2
samples_per_day = 288
step = 288/2

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 15)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



X_train, y_train, X_test, y_test = utils_svr.get_everything(df, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)


# study = joblib.load('optuna_studies/SVR/experimento_full/study_SVR_resample_5_horizon_025_previous_2.pkl')

params = {'kernel': 'rbf',
          'gamma': 1.7008658358882006e-06, 
          'C': 99.57187861209694, 
          'epsilon' : 1}

svr_model = SVR(** params)

trained_svr_model = MultiOutputRegressor(svr_model).fit(X_train , y_train)

prediction = trained_svr_model.predict(X_test)

# guardamos los valores predecidos vs reales en un diccionario

predicciones[4] = {'real' : y_test, 'prediccion': prediction}

mean_real = y_test.mean()
mean_prediction = prediction.mean()

MAPE = mean_absolute_percentage_error(prediction, y_test) * 100
MAE = mean_absolute_error(prediction, y_test)
RMSE = mean_squared_error(prediction, y_test, squared = False)

# guardamos las metricas en un diccionario

metricas[4] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

df_metricas = pd.DataFrame.from_dict(metricas)

df_metricas.to_csv('metrics/SVR/experimento_full/validation_svr_station4_5_0.5_2.csv')

list_dfs = []


d = {'TARGET': predicciones[4]['real'].flatten(), 'FORECAST': predicciones[4]['prediccion'].flatten()}
df_predicciones = pd.DataFrame(data = d)
df_predicciones['ESTACION'] = 4

df_predicciones.to_csv('datos/experimento_full/predicciones_validation_svr_station4_5_0.5_2.csv')

print('ESTACION 4:')
print('prediction shape: ', prediction.shape)
print('test shape: ', y_test.shape)
print('MAE :', MAE)
print('MAPE: ', MAPE)
print('RMSE: ', RMSE)
print('\n')
print('media real: ', mean_real)
print('media predecida: ', mean_prediction)
print('\n')

ESTACION 4:
prediction shape:  (180, 144)
test shape:  (180, 144)
MAE : 5.3626675499828425
MAPE:  17.86087993472039
RMSE:  8.900464092097671


media real:  34.82507716049383
media predecida:  32.43849528785123




# 24 horas - mejores parametros

In [10]:
predicciones = {}
metricas = {}

estacion = 4

variables = ['MP1', 'MP2_5','AQI_MP10', 'AQI_MP2_5']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 2 
forecast_days = 1
samples_per_day = 288
step = 288

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 15)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



X_train, y_train, X_test, y_test = utils_svr.get_everything(df, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)


# study = joblib.load('optuna_studies/SVR/experimento_full/study_SVR_resample_5_horizon_025_previous_2.pkl')

params = {'kernel': 'rbf',
          'gamma': 3.177013786723415e-07, 
          'C': 98.70644069617066, 
          'epsilon' : 1}

svr_model = SVR(** params)

trained_svr_model = MultiOutputRegressor(svr_model).fit(X_train , y_train)

prediction = trained_svr_model.predict(X_test)

# guardamos los valores predecidos vs reales en un diccionario

predicciones[4] = {'real' : y_test, 'prediccion': prediction}

mean_real = y_test.mean()
mean_prediction = prediction.mean()

MAPE = mean_absolute_percentage_error(prediction, y_test) * 100
MAE = mean_absolute_error(prediction, y_test)
RMSE = mean_squared_error(prediction, y_test, squared = False)

# guardamos las metricas en un diccionario

metricas[4] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

df_metricas = pd.DataFrame.from_dict(metricas)

df_metricas.to_csv('metrics/SVR/experimento_full/validation_svr_station4_5_1_2.csv')

list_dfs = []


d = {'TARGET': predicciones[4]['real'].flatten(), 'FORECAST': predicciones[4]['prediccion'].flatten()}
df_predicciones = pd.DataFrame(data = d)
df_predicciones['ESTACION'] = 4

df_predicciones.to_csv('datos/experimento_full/predicciones_validation_svr_station4_5_1_2.csv')

print('ESTACION 4:')
print('prediction shape: ', prediction.shape)
print('test shape: ', y_test.shape)
print('MAE :', MAE)
print('MAPE: ', MAPE)
print('RMSE: ', RMSE)
print('\n')
print('media real: ', mean_real)
print('media predecida: ', mean_prediction)
print('\n')

ESTACION 4:
prediction shape:  (90, 288)
test shape:  (90, 288)
MAE : 8.935468232391887
MAPE:  31.864538685678962
RMSE:  12.788091813397951


media real:  34.82507716049383
media predecida:  31.0094811218111


