In [2]:
# Directories
import os

new_directory = r'c://Users//Fer//TESIS_ARCHIVOS//TESIS_AIRE//MP_Forecasting//aqi_forecasting//notebooks'
os.chdir(new_directory)

# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation


# Training utils
from training_code.XGBOOST import utils_xgboost

# Optuna
import optuna

# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


# Cargamos los datos

In [4]:
datos = pd.read_csv('datos/230325_train_resampled_1h.csv', parse_dates = ['FECHAHORA'])

In [11]:
datos.FECHAHORA.max()

Timestamp('2019-03-31 09:00:00')

# Variables de entrenamiento

In [6]:
estacion = 4

variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 'AQI_MP10', 'AQI_MP2_5', 'TIPO', 'TRAFICO', 'HUMEDAD', 
             'PRESION', 'TEMPERATURA','DIA_SEM']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 7 
forecast_days = 1
samples_per_day = 24
step = 24

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 12)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test



# Procesamiento previo a entrenamiento

In [7]:
%%time

X_train, y_train, X_test, y_test = utils_xgboost.get_everything(datos, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)

CPU times: total: 500 ms
Wall time: 473 ms


In [8]:
print(X_train.shape)
print(y_train.shape)

print(X_test.shape)
print(y_test.shape)


(359, 183)
(359, 24)
(84, 183)
(84, 24)


# Prueba antes de usar optuna

In [8]:
%%time

params = {'learning_rate' : 0.025,
          'n_estimators' : 250,
          'max_depth': 2,
          'min_child_weight' : 1,
          'gamma': 0.0,
          'subsample': 0.98,
          'colsample_bytree': 0.98,
          'scale_pos_weight': 0.8,
          'seed': 42,
          'verbosity' : 0}


xgb_model = xgb.XGBRegressor(** params)

trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

prediction = trained_xgb_model.predict(X_test)
print('prediction', prediction.shape)
print('test', y_test.shape)

MSE = np.mean((prediction - y_test)**2)
print('RMSE: ', MSE**0.5)
MAE = np.mean(np.abs(prediction-y_test))
print('MAE: ', MAE)

prediction (333, 288)
test (333, 288)
RMSE:  9.781115562652332
MAE:  6.694983114898304
CPU times: total: 1h 56min 35s
Wall time: 10min 16s


In [None]:
%%time

def objective(trial):
    
    params = {
        'max_depth': trial.suggest_int('max_depth', 1, 9),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 10, 300),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-5, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
    }
    
    optuna_xgb_model = xgb.XGBRegressor(** params)
    
    trained_xgb_model = MultiOutputRegressor(optuna_xgb_model).fit(X_train , y_train)

    prediction = trained_xgb_model.predict(X_test)

    MAPE = mean_absolute_percentage_error(prediction, y_test)
    print('MAPE: ', MAPE)
    MAE = np.mean(np.abs(prediction - y_test))
    print('MAE: ', MAE)

    MSE = mean_squared_error(prediction, y_test)
    
    return MSE

study = optuna.create_study(direction='minimize')

study.optimize(objective, timeout= 3600 )

trial = study.best_trial

In [15]:
import joblib

joblib.dump(study, "optuna_studies/study_XGBOOST_230311_RESAMPLE_e4_7antes_1prediccion_cada24h_MSE.pkl")
for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

    max_depth: 1
    learning_rate: 0.14282255288480264
    n_estimators: 119
    min_child_weight: 10
    gamma: 4.6765240256450094e-05
    subsample: 0.7918596491329176
    colsample_bytree: 0.995987995143913


In [1]:
study.best_trial

NameError: name 'study' is not defined