In [1]:
# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation


# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

import utils_lr

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.multioutput import MultiOutputRegressor

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the multi-station training set; FECHAHORA is handed to pandas' date
# parser so it is not left as a plain string column.
datos = pd.read_csv(
    'datos/230127_train_ESTACIONES.csv',
    parse_dates=['FECHAHORA'],
)

# Preview the first rows as a sanity check on the loaded columns.
datos.head()

Unnamed: 0,ESTACION,FECHAHORA,ANHO,DIA,MES,HORA,MINUTO,MP1,MP2_5,MP10,...,DIA_SEM,AQI_MP2_5,AQI_MP10,MEDICION_DIA,MP1_ANTERIOR,MP2_5_ANTERIOR,MP10_ANTERIOR,TEMPERATURA_PRONOSTICO,HUMEDAD_PRONOSTICO,PRESION_PRONOSTICO
0,1,2019-12-30 13:00:00,2019,30,12,13,0,2.839,3.989,4.599,...,0,72.0,25.0,157,2.609,3.72,4.519,36.5,42.8,995.1
1,1,2019-12-30 13:05:00,2019,30,12,13,5,9.0,13.079,21.319,...,0,72.0,25.0,158,1.919,2.659,10.34,36.7,42.1,994.9
2,1,2019-12-30 13:10:00,2019,30,12,13,10,1.969,2.369,13.17,...,0,72.0,25.0,159,1.769,2.609,3.419,36.6,43.5,994.8
3,1,2019-12-30 13:15:00,2019,30,12,13,15,1.74,2.379,2.429,...,0,72.0,25.0,160,1.48,2.009,2.159,37.0,41.4,994.7
4,1,2019-12-30 13:20:00,2019,30,12,13,20,2.71,4.119,7.71,...,0,72.0,25.0,161,1.25,1.909,2.809,37.1,40.6,994.6


# 7 dias antes

In [3]:
# --- Experiment setup: 7-day input window, 1-day forecast horizon ---

estacion = 4  # station id handed to utils_lr.get_everything

# Columns passed to the helper as the feature set.
variables = [
    "ANHO", 'DIA', 'MES', 'HORA', 'MINUTO',
    'MP1', 'MP2_5', 'MP10',
    'TEMPERATURA', 'HUMEDAD', 'PRESION',
    'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO',
    'DIA_SEM', 'TRAFICO', 'AQI_MP10', 'AQI_MP2_5',
]
dependent = ['AQI_MP2_5']  # target column (it is also listed in `variables`)
number_of_features = len(variables)

samples_per_day = 288  # the data preview shows 5-minute spacing: 24 * 60 / 5 = 288
training_days = 7
forecast_days = 1
# A quarter of a day, kept as a float (72.0) exactly as before.
# NOTE(review): presumably the stride between consecutive windows -- confirm in utils_lr.
step = samples_per_day / 4

# Number of months of history we want available for X_train.
train_months = relativedelta(months=12)

# Window sizes expressed in samples; the products are already ints.
input_samples = samples_per_day * training_days    # samples in the 7-day input window
output_samples = samples_per_day * forecast_days   # samples in the 1-day forecast window
train_test_samples = input_samples + output_samples  # input + output samples combined

X_train, y_train, X_test, y_test = utils_lr.get_everything(
    datos,
    estacion,
    train_months,
    variables,
    dependent,
    train_test_samples,
    input_samples,
    output_samples,
    number_of_features,
    step,
)

In [4]:
%%time

lr_model = LinearRegression()

trained_lr = MultiOutputRegressor(lr_model).fit(X_train, y_train)


CPU times: total: 33min 46s
Wall time: 5min 40s


In [6]:
# Score the 7-day-window model on the held-out split.
y_pred = trained_lr.predict(X_test)

# scikit-learn metrics are documented as metric(y_true, y_pred). MAE and MSE are
# symmetric in their arguments, so using the documented order leaves the numbers
# unchanged while matching the MAPE call (where the order does matter).
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

# `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6.
# Compute RMSE explicitly instead: take the square root of each output's MSE and
# then average uniformly, which is how scikit-learn defines multi-output RMSE.
rmse = np.sqrt(mean_squared_error(y_test, y_pred, multioutput='raw_values')).mean()

print('MEAN')
print(y_test.mean())
print('MAPE')
print(mape)
print('MAE')
print(mae)
print('RMSE')
print(rmse)

MEAN
23.87969219219219
MAPE
0.7923640708091709
MAE
14.51371586708023
RMSE
18.260068590502726


In [7]:
import pickle

# Persist the fitted 7-day model. The context manager closes (and therefore
# flushes) the file deterministically instead of leaving the handle to the
# garbage collector.
# NOTE(review): only unpickle this file from a trusted location -- pickle.load
# can execute arbitrary code.
with open('model_lr_7_dias.pkl', 'wb') as model_file:
    pickle.dump(trained_lr, model_file)

# 1 dia antes

In [11]:
%%time

estacion = 4

variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 
             'TEMPERATURA', 'HUMEDAD', 'PRESION', 'TEMPERATURA_PRONOSTICO', 
             'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO', 'DIA_SEM', 'TRAFICO' , 'AQI_MP10', 'AQI_MP2_5']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 1 
forecast_days = 1 
samples_per_day = 288
step = 288/4

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 12)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test

X_train, y_train, X_test, y_test = utils_lr.get_everything(datos, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)

print(X_train.shape)

lr_model_2 = LinearRegression()

trained_lr_2 = MultiOutputRegressor(lr_model_2).fit(X_train, y_train)

y_pred_2 = trained_lr_2.predict(X_test)

mape = mean_absolute_percentage_error(y_test, y_pred_2)
print('MEAN')
print(y_test.mean())
print('MAPE')
print(mape)
print('MAE')
print(mean_absolute_error(y_pred_2, y_test))
print('RMSE')
print(mean_squared_error(y_pred_2, y_test, squared= False))

pickle.dump(trained_lr_2, open('model_lr_1_dia.pkl', 'wb'))

(1457, 305)
MEAN
23.834656084656086
MAPE
0.42102347679246144
MAE
7.4599548839730465
RMSE
9.617395826301415
CPU times: total: 44.5 s
Wall time: 9.81 s


# 15 dias antes

In [14]:
%%time

estacion = 4

variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 
             'TEMPERATURA', 'HUMEDAD', 'PRESION', 'TEMPERATURA_PRONOSTICO', 
             'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO', 'DIA_SEM', 'TRAFICO' , 'AQI_MP10', 'AQI_MP2_5']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 15
forecast_days = 1 
samples_per_day = 288
step = 288/4

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 12)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test

X_train, y_train, X_test, y_test = utils_lr.get_everything(datos, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)

print(X_train.shape)

lr_model_3 = LinearRegression()

trained_lr_3 = MultiOutputRegressor(lr_model_3).fit(X_train, y_train)

y_pred_3 = trained_lr_3.predict(X_test)

mape = mean_absolute_percentage_error(y_test, y_pred_3)

print('MEAN')
print(y_test.mean())
print('MAPE')
print(mape)
print('MAE')
print(mean_absolute_error(y_pred_3, y_test))
print('RMSE')
print(mean_squared_error(y_pred_3, y_test, squared= False))

pickle.dump(trained_lr_3, open('model_lr_15_dias.pkl', 'wb'))

(1401, 4337)
MEAN
23.53864433370247
MAPE
0.6206976455258716
MAE
10.327044483587377
RMSE
13.124266519219105
CPU times: total: 28min 57s
Wall time: 5min 18s


# 30 dias antes

In [15]:
%%time

estacion = 4

variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 
             'TEMPERATURA', 'HUMEDAD', 'PRESION', 'TEMPERATURA_PRONOSTICO', 
             'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO', 'DIA_SEM', 'TRAFICO' , 'AQI_MP10', 'AQI_MP2_5']

dependent = ['AQI_MP2_5']

number_of_features = len(variables)

training_days = 30
forecast_days = 1 
samples_per_day = 288
step = 288/4

# Creamos una variable que nos diga con cuantos meses de entrenamiento queremos contar para el X_train
train_months = relativedelta(months = 12)

input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test

X_train, y_train, X_test, y_test = utils_lr.get_everything(datos, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)

print(X_train.shape)

lr_model_4 = LinearRegression()

trained_lr_4 = MultiOutputRegressor(lr_model_4).fit(X_train, y_train)

y_pred_4 = trained_lr_4.predict(X_test)

mape = mean_absolute_percentage_error(y_test, y_pred_4)

print('MEAN')
print(y_test.mean())
print('MAPE')
print(mape)
print('MAE')
print(mean_absolute_error(y_pred_4, y_test))
print('RMSE')
print(mean_squared_error(y_pred_4, y_test, squared= False))

pickle.dump(trained_lr_4, open('model_lr_30_dias.pkl', 'wb'))

(1341, 8657)
MEAN
21.912157100046105
MAPE
0.6022122450130545
MAE
9.401233737217504
RMSE
11.775676101443658
CPU times: total: 43min 35s
Wall time: 8min 18s
