In [1]:
# Directories
import os

# Working directory that every relative path in this notebook is resolved
# against (datos/, models/, metrics/, graphs/, optuna_studies/).
# The original machine-specific location is kept as the default; set the
# AQI_NOTEBOOKS_DIR environment variable to run the notebook from elsewhere.
new_directory = os.environ.get(
    'AQI_NOTEBOOKS_DIR',
    r'c://Users//Fer//TESIS_ARCHIVOS//TESIS_AIRE//MP_Forecasting//aqi_forecasting//notebooks',
)
os.chdir(new_directory)

# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation

# Training utils for XGBoost
from training_code.utils import utils_xgboost

# Optuna
import optuna
import joblib
import pickle

# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the validation CSV; the FECHAHORA column is parsed as datetimes on read.
validacion = pd.read_csv(
    'datos/230127_test_ESTACIONES.csv',
    parse_dates=['FECHAHORA'],
)

In [5]:
# Restore the persisted Optuna study and keep its best hyper-parameters.
# NOTE(review): the file name says "7antes" while training_days below is 15,
# and `params` is not referenced by the cells visible here - confirm this is
# the intended study.
study = joblib.load(
    "optuna_studies/XGBOOST/study_XGBOOST_230128_e4_7antes_1prediccion_cada6h.pkl"
)

params = study.best_params

# Input feature columns, in the order handed to the windowing helper.
variables = [
    "ANHO", 'DIA', 'MES', 'HORA', 'MINUTO',
    'MP1', 'MP2_5', 'MP10',
    'TEMPERATURA', 'HUMEDAD', 'PRESION',
    'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO',
    'DIA_SEM', 'TRAFICO',
    'AQI_MP10', 'AQI_MP2_5',
]

# Target column to forecast.
dependent = ['AQI_MP2_5']

number_of_features = len(variables)

# Window configuration.
training_days = 15
forecast_days = 1
samples_per_day = 288
# A quarter of a day between window starts; true division keeps this a float (72.0).
step = samples_per_day / 4

# Number of samples in the input window (training_days worth of data).
input_samples = int(training_days * samples_per_day)
# Number of samples in the forecast window (forecast_days worth of data).
output_samples = int(forecast_days * samples_per_day)
# Total samples covered by one input + forecast window.
train_test_samples = int(input_samples + output_samples)

In [8]:
%%time

predicciones = {}
metricas = {}

for i in range(1, 11):

    estacion = i

    X_val, y_val = utils_xgboost.get_validation(validacion, 
                                            estacion, 
                                            variables, 
                                            dependent, 
                                            train_test_samples, 
                                            input_samples, 
                                            output_samples, 
                                            number_of_features,
                                            step)


    trained_svr_model = joblib.load('models/models_xgboost/15_dias/xgboost_15dias_estacion_'+str(i)+'.pkl')

    # trained_svr_model = MultiOutputRegressor(svr_model).fit(X_train , y_train)

    prediction = trained_svr_model.predict(X_val)
    
    # guardamos los valores predecidos vs reales en un diccionario
    
    predicciones[i] = {'real' : y_val, 'prediccion': prediction}

    # df_predicciones = pd.DataFrame.from_dict(predicciones)

    # df_predicciones.to_csv('predicciones_10estaciones_'+str(training_days)+'dias.csv')

    # guardamos los modelos

    #pickle.dump(trained_svr_model, open('models/models_svr/'+str(training_days)+'_dias/svr_'+str(training_days)+'dias_estacion_'+str(i) + '.pkl', 'wb'))

    mean_real = y_val.mean()
    mean_prediction = prediction.mean()

    MAPE = mean_absolute_percentage_error(prediction, y_val)
    MAE = mean_absolute_error(prediction, y_val)
    RMSE = mean_squared_error(prediction, y_val, squared = False)
    
    # guardamos las metricas en un diccionario
    
    metricas[i] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

    df_metricas = pd.DataFrame.from_dict(metricas)

    # guardamos las metricas en un csv
    df_metricas.to_csv('metrics/SVR/metricas_10estaciones_'+str(training_days)+'dias.csv')
    
    print('ESTACION '+ str(i) + ':')
    print('prediction shape: ', prediction.shape)
    print('test shape: ', y_val.shape)
    print('MAE :', MAE)
    print('MAPE: ', MAPE)
    print('RMSE: ', RMSE)
    print('\n')
    print('media real: ', mean_real)
    print('media predecida: ', mean_prediction)
    print('\n')

ESTACION 1:
prediction shape:  (305, 288)
test shape:  (305, 288)
MAE : 9.425672606941998
MAPE:  0.2708307468244015
RMSE:  12.377314749491678


media real:  39.07479508196721
media predecida:  38.229546


ESTACION 2:
prediction shape:  (305, 288)
test shape:  (305, 288)
MAE : 7.614708678117649
MAPE:  0.23479939109766745
RMSE:  10.397093812688185


media real:  35.7981102003643
media predecida:  32.155457


ESTACION 3:
prediction shape:  (305, 288)
test shape:  (305, 288)
MAE : 9.326683235885016
MAPE:  0.20266034323034957
RMSE:  13.924338350384291


media real:  48.071140710382515
media predecida:  45.09805


ESTACION 4:
prediction shape:  (305, 288)
test shape:  (305, 288)
MAE : 9.253779352102121
MAPE:  0.27604785148734606
RMSE:  12.750270805181659


media real:  37.49153005464481
media predecida:  34.015545


ESTACION 5:
prediction shape:  (305, 288)
test shape:  (305, 288)
MAE : 8.426608568723085
MAPE:  0.2637769874420535
RMSE:  12.608650235139924


media real:  33.846948998178505
me

In [9]:
# For every station, pick every 4th validation window starting at window 1,
# flatten the selected windows into one continuous series, and save an
# interactive "real vs predicted" Plotly chart as HTML.
for i in range(1,11):

    prediction = np.asarray(predicciones[i]['prediccion'])
    y_val = np.asarray(predicciones[i]['real'])

    # Keep one window out of every `window_stride`, beginning at index 1.
    # presumably 4 == samples_per_day / step, so the selected windows do not
    # overlap - TODO confirm against utils_xgboost.get_validation.
    window_stride = 4
    pred_1 = prediction[1::window_stride]
    test_1 = y_val[1::window_stride]

    print(pred_1.shape)
    print(test_1.shape)

    # Concatenate the selected windows row after row into a 1-D series.
    y_pred = pred_1.ravel()
    y_test_1 = test_1.ravel()

    print(y_pred)
    print(y_test_1)

    fig_val = go.Figure()

    fig_val.add_trace(
        go.Scatter( y = list(y_test_1), name = 'Valor Real'))

    fig_val.add_trace(
        go.Scatter( y = list(y_pred), name = 'Valor Predecido'))

    fig_val.update_layout( title_text = "Validación - Estación " + str(i)+ " - Forecasts vs Targets")

    fig_val.write_html('graphs/XGBOOST/xgboost_estacion_'+str(i)+'_targets_vs_forecasts.html')

(76, 288)
(76, 288)
[31.013838 31.321808 31.330666 ... 32.768585 34.540348 32.0016  ]
[33. 33. 33. ... 25. 25. 25.]
(76, 288)
(76, 288)
[11.022332 11.025125 11.058061 ... 30.912966 29.791367 30.697643]
[12. 12. 12. ... 25. 25. 25.]
(76, 288)
(76, 288)
[23.4666   23.559698 23.425415 ... 29.3059   31.376476 30.557854]
[25. 25. 25. ... 38. 38. 38.]
(76, 288)
(76, 288)
[27.203442 27.1453   27.26753  ... 25.033615 29.000113 27.17623 ]
[29. 29. 29. ... 29. 29. 29.]
(76, 288)
(76, 288)
[15.925361 15.936917 15.792242 ... 24.701653 25.502048 25.36472 ]
[17. 17. 17. ... 21. 21. 21.]
(76, 288)
(76, 288)
[27.200151 27.135536 27.150341 ... 36.91676  36.608337 37.636044]
[29. 29. 29. ... 42. 42. 42.]
(76, 288)
(76, 288)
[15.968189 16.084053 16.212093 ... 25.119694 25.137224 25.934935]
[17. 17. 17. ... 21. 21. 21.]
(76, 288)
(76, 288)
[43.194565 43.36156  43.92866  ... 43.104443 46.21648  45.53411 ]
[46. 46. 46. ... 33. 33. 33.]
(76, 288)
(76, 288)
[ 7.530393   7.5327682  7.5598707 ... 21.000198  22.

In [None]:
# Work in progress: collect, per station, every 4th validation window
# starting at window 0. The results are not stored in `dframe` yet.
dframe = pd.DataFrame()

for i in range(1,11):

    prediction = predicciones[i]['prediccion']
    y_val = predicciones[i]['real']

    # Indices of the selected windows: 0, 4, 8, ...
    selected = range(0, len(prediction), 4)

    pred_0 = [prediction[j] for j in selected]
    y_val_0 = [y_val[j] for j in selected]

    # TODO: the same selection for start offsets 1, 2 and 3 (pred_1..pred_3 /
    # y_val_1..y_val_3) and combining everything with pd.concat were sketched
    # here but never finished.

    