In [1]:
# Standard library
import datetime as dt
import warnings

# Data manipulation
import numpy as np # for data manipulation
import pandas as pd # for data manipulation
from dateutil.relativedelta import relativedelta, MO

# Models, hyperparameter search and persistence
import joblib
import optuna
import xgboost as xgb
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR # for building SVR model

# Metrics
from sklearn.metrics import mean_absolute_error # MAE
from sklearn.metrics import mean_absolute_percentage_error # MAPE
from sklearn.metrics import mean_squared_error # MSE; RMSE is its square root

# Visualizations
import matplotlib.pyplot as plt
import plotly.express as px # for data visualization
import plotly.graph_objects as go # for data visualization

# Local training utilities
import utils_svr

# Silence library warnings for the rest of the notebook
warnings.filterwarnings("ignore")

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the station measurements; FECHAHORA is parsed into datetimes on read.
datos = pd.read_csv(
    'datos/230127_train_ESTACIONES.csv',
    parse_dates=['FECHAHORA'],
)

In [3]:
# Station identifier forwarded to utils_svr.get_everything.
estacion = 4

# Columns forwarded to utils_svr.get_everything as `variables`
# (list order is preserved exactly).
variables = [
    'ANHO', 'DIA', 'MES', 'HORA', 'MINUTO',
    'MP1', 'MP2_5', 'MP10',
    'TEMPERATURA', 'HUMEDAD', 'PRESION',
    'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO',
    'DIA_SEM', 'TRAFICO', 'AQI_MP10', 'AQI_MP2_5',
]

# Forwarded as `dependent`; presumably the forecast target column — confirm
# against utils_svr.
dependent = ['AQI_MP2_5']

number_of_features = len(variables)

# Window geometry, given in days and converted to sample counts further down.
samples_per_day = 288
training_days = 7
forecast_days = 1

# A quarter of a day's samples. True division keeps this a float (72.0).
# NOTE(review): confirm utils_svr.get_everything accepts a non-integer step.
step = samples_per_day / 4

# How many months of history to use when building X_train.
train_months = relativedelta(months = 12)

input_samples = int(training_days * samples_per_day)      # samples in the 7-day input window
output_samples = int(forecast_days * samples_per_day)     # samples in the 1-day forecast window
train_test_samples = int(input_samples + output_samples)  # samples in one full input + output window

In [4]:
# Quick sanity check of the AQI_MP2_5 column: dtype and non-null count.
aqi_mp2_5 = datos['AQI_MP2_5']
aqi_mp2_5.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1315882 entries, 0 to 1315881
Series name: AQI_MP2_5
Non-Null Count    Dtype  
--------------    -----  
1315882 non-null  float64
dtypes: float64(1)
memory usage: 10.0 MB


In [5]:
%%time

# Build the train/test arrays with the project helper. Arguments are passed
# positionally, in the order the helper is called with throughout this notebook.
split_arrays = utils_svr.get_everything(
    datos,
    estacion,
    train_months,
    variables,
    dependent,
    train_test_samples,
    input_samples,
    output_samples,
    number_of_features,
    step,
)

# The helper returns the four arrays in this order.
X_train, y_train, X_test, y_test = split_arrays

CPU times: total: 15.7 s
Wall time: 15.8 s


In [6]:
# Baseline (untuned) hyperparameters for the RBF support vector regressor.
params = dict(
    kernel='rbf',
    gamma='auto',
    C=0.1,
    epsilon=1,
    cache_size=4000,
)

optuna_svr_model = SVR(**params)

# SVR predicts a single target, so it is wrapped in MultiOutputRegressor to
# handle the multi-column y_train. fit() returns the wrapper itself, so
# fitting in place leaves `trained_svr` bound to the fitted estimator.
trained_svr = MultiOutputRegressor(optuna_svr_model)
trained_svr.fit(X_train, y_train)



In [7]:
# Score the baseline model on the test arrays.
y_pred = trained_svr.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. MAE and RMSE are
# symmetric, so their values do not depend on it, but MAPE does; all three
# calls use the same order for consistency.
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# RMSE is computed per output column and then averaged, which is what
# `mean_squared_error(..., squared=False)` returned. The `squared` argument is
# deprecated in recent scikit-learn releases, so the square root is taken
# explicitly to keep the cell working across versions.
rmse = np.sqrt(mean_squared_error(y_test, y_pred, multioutput='raw_values')).mean()

print('MAPE')
print(mape)
print('MAE')
print(mae)
print('RMSE')
print(rmse)

MAPE
0.7045955073827199
MAE
11.454504941648004
RMSE
15.156805696288947


In [8]:
def objective(trial):
    """Optuna objective: fit a multi-output RBF SVR and return its MAPE.

    `gamma` and `C` are sampled log-uniformly from the trial; kernel, epsilon
    and cache size are fixed. The model is fitted on the notebook-level
    `X_train` / `y_train` and scored on `X_test` / `y_test`. MAPE, MAE and
    RMSE are printed for every trial.

    NOTE(review): the search is scored on the same test arrays used to report
    final performance, so the best value is likely optimistic as a held-out
    estimate. Consider scoring on a separate validation split.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Mean absolute percentage error on the test arrays (lower is better).
    """
    params = {
        'kernel'    : 'rbf',
        # suggest_float(..., log=True) is the supported replacement for the
        # deprecated suggest_loguniform and samples from the same distribution.
        'gamma'     : trial.suggest_float('gamma', 1e-8, 1.0, log=True),
        'C'         : trial.suggest_float('C', 1e-5, 100.0, log=True),
        'epsilon'   : 1,
        'cache_size': 4000
    }

    optuna_svr_model = SVR(**params)

    trained_svr = MultiOutputRegressor(optuna_svr_model).fit(X_train, y_train)

    y_pred = trained_svr.predict(X_test)

    # Metrics are called as (y_true, y_pred), the order scikit-learn documents.
    mape = mean_absolute_percentage_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    # Per-output RMSE averaged over outputs: same value `squared=False` gave,
    # without relying on that deprecated argument.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred, multioutput='raw_values')).mean()

    print('MAPE')
    print(mape)
    print('MAE')
    print(mae)
    print('RMSE')
    print(rmse)

    return mape

# Run the hyperparameter search. TPE is Optuna's default sampler for a
# single-objective study; seeding it makes the sequence of sampled
# hyperparameters repeatable across kernel restarts.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

study.optimize(objective, n_trials = 30)

trial = study.best_trial

# Persist the finished study. The file name is derived from the configuration
# so it cannot drift from the station / window length actually used; with
# estacion = 4 and training_days = 7 it is the same name as before.
joblib.dump(study, f"study_SVR_e{estacion}_MP2.5_correcto_{training_days}_days.pkl")

# Report the best hyperparameters found.
for key, value in trial.params.items():
    print(f'    {key}: {value}')

[32m[I 2023-02-05 15:13:01,371][0m A new study created in memory with name: no-name-722c8244-db08-45e0-992b-5c70354369f6[0m


MAPE
0.6800266775492424
MAE
11.158592580039867
RMSE
14.888419810842898


[32m[I 2023-02-05 15:24:49,980][0m Trial 0 finished with value: 0.6800266775492424 and parameters: {'gamma': 3.0876887454597824e-06, 'C': 0.02966993854745562}. Best is trial 0 with value: 0.6800266775492424.[0m


MAPE
0.4721396841880735
MAE
8.655067712709338
RMSE
12.443032773539796


[32m[I 2023-02-05 15:37:19,753][0m Trial 1 finished with value: 0.4721396841880735 and parameters: {'gamma': 3.345841696928003e-07, 'C': 1.2548791130945498}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.9363316316175658
MAE
13.884133362640133
RMSE
16.715447080030483


[32m[I 2023-02-05 16:44:51,023][0m Trial 2 finished with value: 0.9363316316175658 and parameters: {'gamma': 0.0003295617516395543, 'C': 50.341064615502816}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.7017817641509634
MAE
11.419007028258632
RMSE
15.11871942369433


[32m[I 2023-02-05 16:57:49,276][0m Trial 3 finished with value: 0.7017817641509634 and parameters: {'gamma': 2.7543786511225395e-08, 'C': 0.03420507498961019}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.7022594760055733
MAE
11.433129352826795
RMSE
15.148801432073899


[32m[I 2023-02-05 17:13:57,948][0m Trial 4 finished with value: 0.7022594760055733 and parameters: {'gamma': 0.38416707524472693, 'C': 0.02075402232587441}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.6952098653874214
MAE
11.334705610515465
RMSE
15.032551094965735


[32m[I 2023-02-05 17:26:37,353][0m Trial 5 finished with value: 0.6952098653874214 and parameters: {'gamma': 1.1451615221489398e-07, 'C': 0.032488764782667226}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.7025516261462995
MAE
11.435802644282482
RMSE
15.149791321180443


[32m[I 2023-02-05 17:42:49,707][0m Trial 6 finished with value: 0.7025516261462995 and parameters: {'gamma': 0.03422883633680851, 'C': 0.030665593490975914}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.7142587679593124
MAE
11.501145809005282
RMSE
15.107894757197926


[32m[I 2023-02-05 17:55:17,150][0m Trial 7 finished with value: 0.7142587679593124 and parameters: {'gamma': 9.205635116898263e-06, 'C': 0.039446843313150716}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.7016543207722342
MAE
11.427591898980616
RMSE
15.146760829345165


[32m[I 2023-02-05 18:07:25,971][0m Trial 8 finished with value: 0.7016543207722342 and parameters: {'gamma': 0.00015157150793211705, 'C': 0.0002708626324730408}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.7016499595106881
MAE
11.427552040347924
RMSE
15.146746492138776


[32m[I 2023-02-05 18:20:06,840][0m Trial 9 finished with value: 0.7016499595106881 and parameters: {'gamma': 0.2660574056391291, 'C': 0.00010669861752768448}. Best is trial 1 with value: 0.4721396841880735.[0m


MAPE
0.3989573553152085
MAE
7.387503712631195
RMSE
10.667212065656859


[32m[I 2023-02-05 19:16:04,159][0m Trial 10 finished with value: 0.3989573553152085 and parameters: {'gamma': 7.500986230137021e-07, 'C': 21.928029685823795}. Best is trial 10 with value: 0.3989573553152085.[0m


MAPE
0.39367620141229687
MAE
7.290620331781917
RMSE
10.5017848731427


[32m[I 2023-02-05 20:39:51,942][0m Trial 11 finished with value: 0.39367620141229687 and parameters: {'gamma': 6.339821168872186e-07, 'C': 50.02887579362023}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.5176452302181668
MAE
9.276402276389534
RMSE
13.325637843087936


[32m[I 2023-02-05 23:19:17,264][0m Trial 12 finished with value: 0.5176452302181668 and parameters: {'gamma': 2.105563406665748e-06, 'C': 88.94565893688917}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.7369081435091661
MAE
11.75019520780382
RMSE
15.288296708025282


[32m[I 2023-02-05 23:55:28,354][0m Trial 13 finished with value: 0.7369081435091661 and parameters: {'gamma': 0.001800668091567147, 'C': 5.4323057936766785}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.5737011635134421
MAE
10.075251429894799
RMSE
14.134810104949018


[32m[I 2023-02-06 00:07:40,621][0m Trial 14 finished with value: 0.5737011635134421 and parameters: {'gamma': 1.5577418032354837e-08, 'C': 2.3864166693370668}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.7020001915345174
MAE
11.42268045174488
RMSE
14.845935189439311


[32m[I 2023-02-06 00:49:48,955][0m Trial 15 finished with value: 0.7020001915345174 and parameters: {'gamma': 1.7481800822455516e-05, 'C': 10.449473259678584}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.5184802069359506
MAE
9.354535854372394
RMSE
13.332337829067457


[32m[I 2023-02-06 01:01:43,380][0m Trial 16 finished with value: 0.5184802069359506 and parameters: {'gamma': 5.774975153883573e-07, 'C': 0.41314246459121473}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.8187723885002116
MAE
12.75972911543499
RMSE
15.88082636488462


[32m[I 2023-02-06 01:48:48,060][0m Trial 17 finished with value: 0.8187723885002116 and parameters: {'gamma': 2.784711287563172e-05, 'C': 18.672736329258072}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.7017128832005919
MAE
11.426046815823948
RMSE
15.140653928284982


[32m[I 2023-02-06 02:01:09,458][0m Trial 18 finished with value: 0.7017128832005919 and parameters: {'gamma': 9.763601479801188e-08, 'C': 0.002286312484305708}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.7124555574253573
MAE
11.526427604449623
RMSE
15.18523224697726


[32m[I 2023-02-06 02:20:01,554][0m Trial 19 finished with value: 0.7124555574253573 and parameters: {'gamma': 0.004113679008244478, 'C': 0.36666926421366997}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.514503824904063
MAE
9.244157201888328
RMSE
13.102544951082765


[32m[I 2023-02-06 02:31:59,600][0m Trial 20 finished with value: 0.514503824904063 and parameters: {'gamma': 1.3077904433031761e-06, 'C': 0.5921694843753711}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.45210251279881575
MAE
8.367554935019703
RMSE
11.99245511432489


[32m[I 2023-02-06 02:44:57,500][0m Trial 21 finished with value: 0.45210251279881575 and parameters: {'gamma': 7.146399397447548e-07, 'C': 1.8045842555589255}. Best is trial 11 with value: 0.39367620141229687.[0m


MAPE
0.37773577154871507
MAE
7.158344092650577
RMSE
10.409729181222614


[32m[I 2023-02-06 03:10:23,381][0m Trial 22 finished with value: 0.37773577154871507 and parameters: {'gamma': 1.6819976058393788e-07, 'C': 23.657017475552806}. Best is trial 22 with value: 0.37773577154871507.[0m


MAPE
0.38864831978151937
MAE
7.289168852998484
RMSE
10.653979343196506


[32m[I 2023-02-06 03:30:08,825][0m Trial 23 finished with value: 0.38864831978151937 and parameters: {'gamma': 9.333411532177208e-08, 'C': 23.476814021068023}. Best is trial 22 with value: 0.37773577154871507.[0m


MAPE
0.37306769273602924
MAE
7.048744177840799
RMSE
10.321432643746364


[32m[I 2023-02-06 03:56:30,875][0m Trial 24 finished with value: 0.37306769273602924 and parameters: {'gamma': 8.770724678350123e-08, 'C': 40.8966184876212}. Best is trial 24 with value: 0.37306769273602924.[0m


MAPE
0.526884920744612
MAE
9.445948475363716
RMSE
13.487293661525472


[32m[I 2023-02-06 04:08:25,902][0m Trial 25 finished with value: 0.526884920744612 and parameters: {'gamma': 1.0521649131470732e-08, 'C': 7.952556315763799}. Best is trial 24 with value: 0.37306769273602924.[0m


MAPE
0.45292895454739424
MAE
8.292276186506987
RMSE
12.008145305231599


[32m[I 2023-02-06 04:21:00,556][0m Trial 26 finished with value: 0.45292895454739424 and parameters: {'gamma': 8.959041547040204e-08, 'C': 4.6822663998941785}. Best is trial 24 with value: 0.37306769273602924.[0m


MAPE
0.6502905914145706
MAE
10.881634161088956
RMSE
14.752502046347335


[32m[I 2023-02-06 04:32:57,336][0m Trial 27 finished with value: 0.6502905914145706 and parameters: {'gamma': 5.6119239910923975e-08, 'C': 0.18553015579049437}. Best is trial 24 with value: 0.37306769273602924.[0m


MAPE
0.3588288525566349
MAE
6.845006700409575
RMSE
9.881475249689663


[32m[I 2023-02-06 05:39:56,140][0m Trial 28 finished with value: 0.3588288525566349 and parameters: {'gamma': 1.7498972177030344e-07, 'C': 88.76229687780621}. Best is trial 28 with value: 0.3588288525566349.[0m


MAPE
0.5460966812892696
MAE
9.578190298455317
RMSE
13.460289956188525


[32m[I 2023-02-06 07:36:34,765][0m Trial 29 finished with value: 0.5460966812892696 and parameters: {'gamma': 3.949427233085567e-06, 'C': 59.681821819798884}. Best is trial 28 with value: 0.3588288525566349.[0m


    gamma: 1.7498972177030344e-07
    C: 88.76229687780621
