In [17]:
# Directories
import os

new_directory = r'c://Users//Fer//TESIS_ARCHIVOS//TESIS_AIRE//MP_Forecasting//aqi_forecasting//notebooks'
os.chdir(new_directory)

# Data Manipulation
import pandas as pd # for data manipulation
import numpy as np # for data manipulation


# Training utils
from training_code.utils import utils_xgboost

# Optuna
import optuna
import joblib
import pickle

# Tiempo
import datetime as dt
from dateutil.relativedelta import relativedelta, MO

# Modelos
from sklearn.linear_model import LinearRegression # for building a linear regression model
from sklearn.svm import SVR # for building SVR model
from sklearn.preprocessing import MinMaxScaler
from sklearn.multioutput import MultiOutputRegressor
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Metricas
from sklearn.metrics import mean_absolute_error #MAE
from sklearn.metrics import mean_absolute_percentage_error #MAPE
from sklearn.metrics import mean_squared_error #MSE, para RMSE: squared = False

# Visualizations
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization
import matplotlib.pyplot as plt

# Advertencias
import warnings
warnings.filterwarnings("ignore")

In [13]:
datos = pd.read_csv('datos/230127_train_ESTACIONES.csv', parse_dates=['FECHAHORA'])
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates=['FECHAHORA'])

In [49]:
horizon = [0.25, 0.5, 1]
resample = [5, 30, 60]
previous = [2, 7, 15]

# Start resampling

for resample_item in resample:

    df_test = validacion.copy()
    df_train = datos.copy()

    df_test.drop(['ANHO', 'DIA', 'MES', 'HORA', 'MINUTO', 'DIA_TRAF_COD', 'TRAFICO_COD', 'TIPO_COD',
                'MEDICION_DIA', 'MP1_ANTERIOR', 'MP2_5_ANTERIOR', 'MP10_ANTERIOR',
                'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO'], axis = 1, inplace = True)

    df_train.drop(['ANHO', 'DIA', 'MES', 'HORA', 'MINUTO', 'DIA_TRAF_COD', 'TRAFICO_COD', 'TIPO_COD',
                'MEDICION_DIA', 'MP1_ANTERIOR', 'MP2_5_ANTERIOR', 'MP10_ANTERIOR',
                'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO'], axis = 1, inplace = True)

    df_train = df_train.set_index('FECHAHORA', drop = True)
    df_test = df_test.set_index('FECHAHORA', drop = True)

    lista_resample = []

    if resample_item != 5: 

        r_i = str(resample_item) + 'T'

        for station in range(1,11):
            df_aux = df_train[df_train['ESTACION'] == station]

            df_aux = df_aux.resample(r_i).mean()
            lista_resample.append(df_aux)

        df_train = pd.concat(lista_resample)

        lista_resample = []

        for station in range(1,11):
            df_aux = df_test[df_test['ESTACION'] == station]

            df_aux = df_aux.resample(r_i).mean()
            lista_resample.append(df_aux)

        df_test = pd.concat(lista_resample)

    df_test['ANHO'] = df_test.index.year
    df_test['MES'] = df_test.index.month
    df_test['DIA'] = df_test.index.day
    df_test['HORA'] = df_test.index.hour
    df_test['MINUTO'] = df_test.index.minute
    df_test['FECHAHORA'] = df_test.index
    df_test.reset_index(inplace = True, drop = True)

    df_train['ANHO'] = df_train.index.year
    df_train['MES'] = df_train.index.month
    df_train['HORA'] = df_train.index.hour
    df_train['MINUTO'] = df_train.index.minute
    df_train['DIA'] = df_train.index.day
    df_train['FECHAHORA'] = df_train.index
    df_train.reset_index(inplace = True, drop = True)

    df_test.to_csv('datos/experimento_full/test_resample_'+str(resample_item)+'_minutes.csv')
    df_train.to_csv('datos/experimento_full/train_resample_'+str(resample_item)+'_minutes.csv')

    # OPTUNA

    for previous_item in previous:
        for horizon_item in horizon:
            
            estacion = 4

            variables = ["ANHO", 'DIA', 'MES', 'HORA', 'MINUTO', 'MP1', 'MP2_5', 'MP10', 'AQI_MP10', 'AQI_MP2_5', 'TIPO', 'TRAFICO', 'HUMEDAD', 
                        'PRESION', 'TEMPERATURA','DIA_SEM']

            dependent = ['AQI_MP2_5']

            number_of_features = len(variables)

            training_days = previous_item
            forecast_days = horizon_item

            if resample_item == 5:
                samples_per_day = 288
            elif resample_item == 30:
                samples_per_day = 48
            elif resample_item == 60:
                samples_per_day = 24

            step = forecast_days*samples_per_day

            train_months = relativedelta(months = 12)

            input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
            output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
            train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test

            X_train, y_train, X_test, y_test = utils_xgboost.get_everything(df_train, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)

            def objective(trial):
    
                params = {
                    'max_depth': trial.suggest_int('max_depth', 1, 9),
                    'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
                    'n_estimators': trial.suggest_int('n_estimators', 10, 300),
                    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                    'gamma': trial.suggest_loguniform('gamma', 1e-5, 1.0),
                    'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
                    'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
                }
                
                optuna_xgb_model = xgb.XGBRegressor(** params)
                
                trained_xgb_model = MultiOutputRegressor(optuna_xgb_model).fit(X_train , y_train)

                prediction = trained_xgb_model.predict(X_test)

                MAPE = mean_absolute_percentage_error(prediction, y_test)
                print('MAPE: ', MAPE)
                MAE = np.mean(np.abs(prediction - y_test))
                print('MAE: ', MAE)

                MSE = mean_squared_error(prediction, y_test)
                
                return MSE

            study = optuna.create_study(direction='minimize')

            study.optimize(objective, n_trials = 100 )

            trial = study.best_trial

            joblib.dump(study, "optuna_studies/XGBOOST/experimento_full/study_XGBOOST_resample_"+str(resample_item)+"_horizon_"+str(horizon_item)+"_previous_"+str(previous_item)+".pkl")

            # VALIDATION

            study = joblib.load("optuna_studies/XGBOOST/experimento_full/study_XGBOOST_resample_"+str(resample_item)+"_horizon_"+str(horizon_item)+"_previous_"+str(previous_item)+".pkl")
            params = study.best_params

            predicciones = {}
            metricas = {}
            
            for i in range(1,11):

                estacion = i

                X_train_val, y_train_val = utils_xgboost.get_validation(df_test, 
                                                                        estacion,
                                                                        variables, 
                                                                        dependent, 
                                                                        train_test_samples, 
                                                                        input_samples, 
                                                                        output_samples, 
                                                                        number_of_features,
                                                                        step)
                xgb_model = xgb.XGBRegressor(** params)

                trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

                prediction = trained_xgb_model.predict(X_train_val)
                
                # guardamos los valores predecidos vs reales en un diccionario
                
                predicciones[i] = {'real' : y_train_val, 'prediccion': prediction}


                pickle.dump(trained_xgb_model, open('models/models_xgboost/experimento_full/xgboost_estacion_' + str(i) + '_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.pkl', 'wb'))

                mean_real = y_train_val.mean()
                mean_prediction = prediction.mean()

                MAPE = mean_absolute_percentage_error(prediction, y_train_val)
                MAE = mean_absolute_error(prediction, y_train_val)
                RMSE = mean_squared_error(prediction, y_train_val, squared = False)
                
                # guardamos las metricas en un diccionario
                
                metricas[i] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

            df_metricas = pd.DataFrame.from_dict(metricas)

            df_metricas.to_csv('metrics/XGBOOST/experimento_full/xgboost_experimento_full_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.csv')

            list_dfs = []

            for station in range(1,11):
                d = {'TARGET': predicciones[station]['real'].flatten(), 'FORECAST': predicciones[station]['prediccion'].flatten()}
                df_aux = pd.DataFrame(data = d)
                df_aux['ESTACION'] = station
                list_dfs.append(df_aux)

            df_predicciones = pd.concat(list_dfs)

            df_predicciones.to_csv('datos/experimento_full/predicciones_10estaciones_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.csv')

            print('study_'+ str(resample_item)+'_'+str(horizon_item)+'_'+ str(previous_item)+' done!')

study_5_0.25_2 done!
study_5_0.5_2 done!
study_5_1_2 done!
study_5_0.25_7 done!
study_5_0.5_7 done!
study_5_1_7 done!
study_5_0.25_15 done!
study_5_0.5_15 done!
study_5_1_15 done!
study_30_0.25_2 done!
study_30_0.5_2 done!
study_30_1_2 done!
study_30_0.25_7 done!
study_30_0.5_7 done!
study_30_1_7 done!
study_30_0.25_15 done!
study_30_0.5_15 done!
study_30_1_15 done!
study_60_0.25_2 done!
study_60_0.5_2 done!
study_60_1_2 done!
study_60_0.25_7 done!
study_60_0.5_7 done!
study_60_1_7 done!
study_60_0.25_15 done!
study_60_0.5_15 done!
study_60_1_15 done!


# GRÁFICAS - 6 HORAS DE PREDICCIÓN - MEJORES RESULTADOS

* Resample = 5 minutos
* Mirando atras 2, 7 y 15 dias

In [95]:
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates=['FECHAHORA'])
dias_2 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.5_previous_2.csv')
dias_7 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.5_previous_7.csv')
dias_15 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.5_previous_15.csv')

In [66]:
#validacion.info() # 264960
#dias_2.info() # 259200
#dias_7.info() # 244800
dias_15.info() # 221760

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221760 entries, 0 to 221759
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Unnamed: 0  221760 non-null  int64  
 1   TARGET      221760 non-null  float64
 2   FORECAST    221760 non-null  float64
 3   ESTACION    221760 non-null  int64  
dtypes: float64(2), int64(2)
memory usage: 6.8 MB


In [125]:
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates=['FECHAHORA'])
dias_2 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.5_previous_2.csv')
dias_7 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.5_previous_7.csv')
dias_15 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.5_previous_15.csv')

dias_2.rename(columns={'FORECAST': 'FORECAST_2_ATRAS'}, inplace = True)
dias_7.rename(columns={'FORECAST': 'FORECAST_7_ATRAS'}, inplace = True)
dias_15.rename(columns={'FORECAST': 'FORECAST_15_ATRAS'}, inplace = True)

lista_2 = []

for i in range(1,11):
    df_aux = dias_2[dias_2['ESTACION'] == i]

    df_aux.reset_index(inplace = True)

    df_aux['indice'] = df_aux.index

    #print(df_aux.info())

    df_aux_2 = df_aux[df_aux['indice'] > 3743]

    #print(df_aux_2.info())

    lista_2.append(df_aux_2)

dias_2_c = pd.concat(lista_2)
dias_2_c.reset_index(inplace = True)

print(dias_2_c.info())

lista_7 = []

for i in range(1,11):
    df_aux = dias_7[dias_7['ESTACION'] == i]
    
    df_aux.reset_index(inplace = True)

    df_aux['indice'] = df_aux.index

    df_aux_7 = df_aux[df_aux['indice'] > 2303]

    lista_7.append(df_aux_7)

dias_7_c = pd.concat(lista_7)
dias_7_c.reset_index(inplace = True)



dias_15.reset_index(inplace = True)

lista = []

for i in range(1,11):
    df_aux = validacion[validacion['ESTACION'] == i]

    first_pred = df_aux.FECHAHORA.min() + relativedelta(days=15)

    df_aux = df_aux[df_aux['FECHAHORA'] >= first_pred]

    lista.append(df_aux)

df = pd.concat(lista)

df.reset_index(inplace=True, drop = True)

print(df.info())

df_2_c = dias_2_c.FORECAST_2_ATRAS
df_7_c = dias_7_c.FORECAST_7_ATRAS
df_15 = dias_15.FORECAST_15_ATRAS

df = pd.merge(df_2_c, df, right_index = True, left_index=True)
df = pd.merge(df_7_c, df, right_index = True,  left_index=True)
df = pd.merge(df_15, df, right_index=True, left_index=True)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221760 entries, 0 to 221759
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   level_0           221760 non-null  int64  
 1   index             221760 non-null  int64  
 2   Unnamed: 0        221760 non-null  int64  
 3   TARGET            221760 non-null  float64
 4   FORECAST_2_ATRAS  221760 non-null  float64
 5   ESTACION          221760 non-null  int64  
 6   indice            221760 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 11.8 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221760 entries, 0 to 221759
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   ESTACION                221760 non-null  int64         
 1   FECHAHORA               221760 non-null  datetime64[ns]
 2   ANHO                    221760 non-null  i

Unnamed: 0,FORECAST_15_ATRAS,FORECAST_7_ATRAS,FORECAST_2_ATRAS,ESTACION,FECHAHORA,ANHO,DIA,MES,HORA,MINUTO,...,DIA_SEM,AQI_MP2_5,AQI_MP10,MEDICION_DIA,MP1_ANTERIOR,MP2_5_ANTERIOR,MP10_ANTERIOR,TEMPERATURA_PRONOSTICO,HUMEDAD_PRONOSTICO,PRESION_PRONOSTICO
0,24.916775,24.746069,23.98048,1,2021-04-14 13:05:00,2021,14,4,13,5,...,2,25.0,7.0,158,1.449,2.039,2.269,30.7,83.9,997.8
1,24.855766,24.773897,24.085901,1,2021-04-14 13:10:00,2021,14,4,13,10,...,2,25.0,7.0,159,1.549,2.089,2.46,30.4,84.5,997.8
2,24.655935,24.527283,23.993942,1,2021-04-14 13:15:00,2021,14,4,13,15,...,2,25.0,7.0,160,1.379,2.089,3.259,30.4,85.7,997.6
3,24.762094,24.89628,23.874578,1,2021-04-14 13:20:00,2021,14,4,13,20,...,2,29.0,7.0,161,1.309,1.849,12.979,30.2,85.9,997.6
4,24.627117,24.736176,24.159529,1,2021-04-14 13:25:00,2021,14,4,13,25,...,2,29.0,7.0,162,1.079,1.529,1.63,30.6,83.8,997.6


In [129]:
for i in range(1,11):

    fig_val = go.Figure()

    df_grafica = df[df['ESTACION'] == i]

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.AQI_MP2_5), x = list(df_grafica.FECHAHORA), name = 'AQI Real'))
    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_2_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 2 días atrás'))

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_7_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 7 días atrás'))

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_15_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 15 días atrás'))

    layout = go.Layout(
        title="Title",
        xaxis=dict(
            title="X Label"
        ),
        yaxis=dict(
            title="Y label"
        ) ) 

    fig_val.update_layout( 
                            title={
                                'text': "Estación " + str(i) + ". AQI real vs Predicciones a 6 horas",
                                'y':0.9,
                                'x':0.5,
                                'xanchor': 'center',
                                'yanchor': 'top'}, 
                        yaxis_title = 'AQI MP2.5', 
                        xaxis_title = 'Fecha',
                        legend_title = 'Referencias',
                            font=dict(
                                    size=16,
                                        ),
                        )

    fig_val.write_html('graphs/XGBOOST/experimento_full/estacion_'+str(i)+'_AQI_vs_prediccion_6hs_2dias_7dias_15dias_resample5min.html')

# GRAFICAS - 12 HORAS DE PREDICCION - MEJORES RESULTADOS

In [130]:
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates=['FECHAHORA'])
dias_2 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.25_previous_2.csv')
dias_7 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.25_previous_7.csv')
dias_15 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_5_horizon_0.25_previous_15.csv')

dias_2.rename(columns={'FORECAST': 'FORECAST_2_ATRAS'}, inplace = True)
dias_7.rename(columns={'FORECAST': 'FORECAST_7_ATRAS'}, inplace = True)
dias_15.rename(columns={'FORECAST': 'FORECAST_15_ATRAS'}, inplace = True)

lista_2 = []

for i in range(1,11):
    df_aux = dias_2[dias_2['ESTACION'] == i]

    df_aux.reset_index(inplace = True)

    df_aux['indice'] = df_aux.index

    #print(df_aux.info())

    df_aux_2 = df_aux[df_aux['indice'] > 3743]

    #print(df_aux_2.info())

    lista_2.append(df_aux_2)

dias_2_c = pd.concat(lista_2)
dias_2_c.reset_index(inplace = True)

print(dias_2_c.info())

lista_7 = []

for i in range(1,11):
    df_aux = dias_7[dias_7['ESTACION'] == i]
    
    df_aux.reset_index(inplace = True)

    df_aux['indice'] = df_aux.index

    df_aux_7 = df_aux[df_aux['indice'] > 2303]

    lista_7.append(df_aux_7)

dias_7_c = pd.concat(lista_7)
dias_7_c.reset_index(inplace = True)



dias_15.reset_index(inplace = True)

lista = []

for i in range(1,11):
    df_aux = validacion[validacion['ESTACION'] == i]

    first_pred = df_aux.FECHAHORA.min() + relativedelta(days=15)

    df_aux = df_aux[df_aux['FECHAHORA'] >= first_pred]

    lista.append(df_aux)

df = pd.concat(lista)

df.reset_index(inplace=True, drop = True)

print(df.info())

df_2_c = dias_2_c.FORECAST_2_ATRAS
df_7_c = dias_7_c.FORECAST_7_ATRAS
df_15 = dias_15.FORECAST_15_ATRAS

df = pd.merge(df_2_c, df, right_index = True, left_index=True)
df = pd.merge(df_7_c, df, right_index = True,  left_index=True)
df = pd.merge(df_15, df, right_index=True, left_index=True)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221760 entries, 0 to 221759
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   level_0           221760 non-null  int64  
 1   index             221760 non-null  int64  
 2   Unnamed: 0        221760 non-null  int64  
 3   TARGET            221760 non-null  float64
 4   FORECAST_2_ATRAS  221760 non-null  float64
 5   ESTACION          221760 non-null  int64  
 6   indice            221760 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 11.8 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 221760 entries, 0 to 221759
Data columns (total 28 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   ESTACION                221760 non-null  int64         
 1   FECHAHORA               221760 non-null  datetime64[ns]
 2   ANHO                    221760 non-null  i

Unnamed: 0,FORECAST_15_ATRAS,FORECAST_7_ATRAS,FORECAST_2_ATRAS,ESTACION,FECHAHORA,ANHO,DIA,MES,HORA,MINUTO,...,DIA_SEM,AQI_MP2_5,AQI_MP10,MEDICION_DIA,MP1_ANTERIOR,MP2_5_ANTERIOR,MP10_ANTERIOR,TEMPERATURA_PRONOSTICO,HUMEDAD_PRONOSTICO,PRESION_PRONOSTICO
0,25.10482,25.031105,24.801626,1,2021-04-14 13:05:00,2021,14,4,13,5,...,2,25.0,7.0,158,1.449,2.039,2.269,30.7,83.9,997.8
1,24.99731,24.93653,24.801247,1,2021-04-14 13:10:00,2021,14,4,13,10,...,2,25.0,7.0,159,1.549,2.089,2.46,30.4,84.5,997.8
2,24.977419,24.728025,24.71863,1,2021-04-14 13:15:00,2021,14,4,13,15,...,2,25.0,7.0,160,1.379,2.089,3.259,30.4,85.7,997.6
3,24.91967,25.32549,24.706661,1,2021-04-14 13:20:00,2021,14,4,13,20,...,2,29.0,7.0,161,1.309,1.849,12.979,30.2,85.9,997.6
4,24.971413,25.10544,24.691515,1,2021-04-14 13:25:00,2021,14,4,13,25,...,2,29.0,7.0,162,1.079,1.529,1.63,30.6,83.8,997.6


In [132]:
for i in range(1,11):

    fig_val = go.Figure()

    df_grafica = df[df['ESTACION'] == i]

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.AQI_MP2_5), x = list(df_grafica.FECHAHORA), name = 'AQI Real'))
    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_2_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 2 días atrás'))

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_7_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 7 días atrás'))

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_15_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 15 días atrás'))

    layout = go.Layout(
        title="Title",
        xaxis=dict(
            title="X Label"
        ),
        yaxis=dict(
            title="Y label"
        ) ) 

    fig_val.update_layout( 
                            title={
                                'text': "Estación " + str(i) + ". AQI real vs Predicciones a 12 horas",
                                'y':0.9,
                                'x':0.5,
                                'xanchor': 'center',
                                'yanchor': 'top'}, 
                        yaxis_title = 'AQI MP2.5', 
                        xaxis_title = 'Fecha',
                        legend_title = 'Referencias',
                            font=dict(
                                    size=16,
                                        ),
                        )

    fig_val.write_html('graphs/XGBOOST/experimento_full/12hsestacion_'+str(i)+'_AQI_vs_prediccion_12hs_2dias_7dias_15dias_resample5min.html')

# GRAFICAS - 24 HORAS DE PREDICCION - MEJORES RESULTADOS

In [138]:
validacion = pd.read_csv('datos/experimento_full/test_resample_30_minutes.csv', parse_dates=['FECHAHORA'])
dias_2 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_30_horizon_0.25_previous_2.csv')
dias_7 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_30_horizon_0.25_previous_7.csv')
dias_15 = pd.read_csv('datos/experimento_full/predicciones_10estaciones_resample_30_horizon_0.25_previous_15.csv')

dias_2.rename(columns={'FORECAST': 'FORECAST_2_ATRAS'}, inplace = True)
dias_7.rename(columns={'FORECAST': 'FORECAST_7_ATRAS'}, inplace = True)
dias_15.rename(columns={'FORECAST': 'FORECAST_15_ATRAS'}, inplace = True)

lista_2 = []

for i in range(1,11):
    df_aux = dias_2[dias_2['ESTACION'] == i]

    df_aux.reset_index(inplace = True)

    df_aux['indice'] = df_aux.index

    #print(df_aux.info())

    df_aux_2 = df_aux[df_aux['indice'] > 623]

    #print(df_aux_2.info())

    lista_2.append(df_aux_2)

dias_2_c = pd.concat(lista_2)
dias_2_c.reset_index(inplace = True)

print(dias_2_c.info())

lista_7 = []

for i in range(1,11):
    df_aux = dias_7[dias_7['ESTACION'] == i]
    
    df_aux.reset_index(inplace = True)

    df_aux['indice'] = df_aux.index

    df_aux_7 = df_aux[df_aux['indice'] > 383]

    lista_7.append(df_aux_7)

dias_7_c = pd.concat(lista_7)
dias_7_c.reset_index(inplace = True)



dias_15.reset_index(inplace = True)

lista = []

for i in range(1,11):
    df_aux = validacion[validacion['ESTACION'] == i]

    first_pred = df_aux.FECHAHORA.min() + relativedelta(days=15)

    df_aux = df_aux[df_aux['FECHAHORA'] >= first_pred]

    lista.append(df_aux)

df = pd.concat(lista)

df.reset_index(inplace=True, drop = True)

print(df.info())

df_2_c = dias_2_c.FORECAST_2_ATRAS
df_7_c = dias_7_c.FORECAST_7_ATRAS
df_15 = dias_15.FORECAST_15_ATRAS

df = pd.merge(df_2_c, df, right_index = True, left_index=True)
df = pd.merge(df_7_c, df, right_index = True,  left_index=True)
df = pd.merge(df_15, df, right_index=True, left_index=True)
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36960 entries, 0 to 36959
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   level_0           36960 non-null  int64  
 1   index             36960 non-null  int64  
 2   Unnamed: 0        36960 non-null  int64  
 3   TARGET            36960 non-null  float64
 4   FORECAST_2_ATRAS  36960 non-null  float64
 5   ESTACION          36960 non-null  int64  
 6   indice            36960 non-null  int64  
dtypes: float64(2), int64(5)
memory usage: 2.0 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36970 entries, 0 to 36969
Data columns (total 19 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   Unnamed: 0   36970 non-null  int64         
 1   ESTACION     36970 non-null  float64       
 2   MP1          36970 non-null  float64       
 3   MP2_5        36970 non-null  float64       
 4   MP10  

Unnamed: 0.1,FORECAST_15_ATRAS,FORECAST_7_ATRAS,FORECAST_2_ATRAS,Unnamed: 0,ESTACION,MP1,MP2_5,MP10,TEMPERATURA,HUMEDAD,...,TIPO,DIA_SEM,AQI_MP2_5,AQI_MP10,ANHO,MES,DIA,HORA,MINUTO,FECHAHORA
0,24.643347,24.376911,24.596104,720,1.0,2.069333,2.705667,4.812667,32.35,56.883333,...,1.0,2.0,26.333333,7.0,2021,4,14,13,0,2021-04-14 13:00:00
1,24.501448,24.314358,24.0744,721,1.0,1.585833,2.039167,2.596167,32.633333,56.4,...,1.0,2.0,29.0,7.0,2021,4,14,13,30,2021-04-14 13:30:00
2,24.355524,24.47795,24.482935,722,1.0,1.311,1.7375,2.610833,32.4,57.066667,...,1.0,2.0,29.0,7.0,2021,4,14,14,0,2021-04-14 14:00:00
3,24.827559,24.253315,24.16908,723,1.0,1.909167,2.584167,5.390833,32.583333,56.983333,...,1.0,2.0,29.0,7.0,2021,4,14,14,30,2021-04-14 14:30:00
4,23.699274,23.802975,24.65158,724,1.0,2.277833,2.754333,3.699167,31.633333,60.533333,...,1.0,2.0,29.0,7.0,2021,4,14,15,0,2021-04-14 15:00:00


In [139]:
for i in range(1,11):

    fig_val = go.Figure()

    df_grafica = df[df['ESTACION'] == i]

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.AQI_MP2_5), x = list(df_grafica.FECHAHORA), name = 'AQI Real'))
    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_2_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 2 días atrás'))

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_7_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 7 días atrás'))

    fig_val.add_trace(
        go.Scatter( y = list(df_grafica.FORECAST_15_ATRAS), x = list(df_grafica.FECHAHORA), name = 'Predicción AQI - 15 días atrás'))

    layout = go.Layout(
        title="Title",
        xaxis=dict(
            title="X Label"
        ),
        yaxis=dict(
            title="Y label"
        ) ) 

    fig_val.update_layout( 
                            title={
                                'text': "Estación " + str(i) + ". AQI real vs Predicciones a 12 horas",
                                'y':0.9,
                                'x':0.5,
                                'xanchor': 'center',
                                'yanchor': 'top'}, 
                        yaxis_title = 'AQI MP2.5', 
                        xaxis_title = 'Fecha',
                        legend_title = 'Referencias',
                            font=dict(
                                    size=16,
                                        ),
                        )

    fig_val.write_html('graphs/XGBOOST/experimento_full/24hsestacion_'+str(i)+'_AQI_vs_prediccion_24hs_2dias_7dias_15dias_resample30min.html')

# DIFERENTES VARIABLES - Correlación mayor al 10%

In [None]:
datos = pd.read_csv('datos/230127_train_ESTACIONES.csv', parse_dates=['FECHAHORA'])
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates=['FECHAHORA'])

horizon = [0.25, 0.5, 1]
resample = [5, 30, 60]
previous = [2, 7, 15]

# Start resampling

for resample_item in resample:

    df_test = validacion.copy()
    df_train = datos.copy()

    df_test.drop(['ANHO', 'DIA', 'MES', 'HORA', 'MINUTO', 'DIA_TRAF_COD', 'TRAFICO_COD', 'TIPO_COD',
                'MEDICION_DIA', 'MP1_ANTERIOR', 'MP2_5_ANTERIOR', 'MP10_ANTERIOR',
                'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO'], axis = 1, inplace = True)

    df_train.drop(['ANHO', 'DIA', 'MES', 'HORA', 'MINUTO', 'DIA_TRAF_COD', 'TRAFICO_COD', 'TIPO_COD',
                'MEDICION_DIA', 'MP1_ANTERIOR', 'MP2_5_ANTERIOR', 'MP10_ANTERIOR',
                'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO'], axis = 1, inplace = True)

    df_train = df_train.set_index('FECHAHORA', drop = True)
    df_test = df_test.set_index('FECHAHORA', drop = True)

    lista_resample = []

    if resample_item != 5: 

        r_i = str(resample_item) + 'T'

        for station in range(1,11):
            df_aux = df_train[df_train['ESTACION'] == station]

            df_aux = df_aux.resample(r_i).mean()
            lista_resample.append(df_aux)

        df_train = pd.concat(lista_resample)

        lista_resample = []

        for station in range(1,11):
            df_aux = df_test[df_test['ESTACION'] == station]

            df_aux = df_aux.resample(r_i).mean()
            lista_resample.append(df_aux)

        df_test = pd.concat(lista_resample)

    df_test['ANHO'] = df_test.index.year
    df_test['MES'] = df_test.index.month
    df_test['DIA'] = df_test.index.day
    df_test['HORA'] = df_test.index.hour
    df_test['MINUTO'] = df_test.index.minute
    df_test['FECHAHORA'] = df_test.index
    df_test.reset_index(inplace = True, drop = True)

    df_train['ANHO'] = df_train.index.year
    df_train['MES'] = df_train.index.month
    df_train['HORA'] = df_train.index.hour
    df_train['MINUTO'] = df_train.index.minute
    df_train['DIA'] = df_train.index.day
    df_train['FECHAHORA'] = df_train.index
    df_train.reset_index(inplace = True, drop = True)

    df_test.to_csv('datos/experimento_full/test_resample_'+str(resample_item)+'_minutes.csv')
    df_train.to_csv('datos/experimento_full/train_resample_'+str(resample_item)+'_minutes.csv')

    # OPTUNA

    for previous_item in previous:
        for horizon_item in horizon:
            
            estacion = 4

            variables = ['MES', 'MP1', 'MP2_5', 'MP10', 'AQI_MP10', 'AQI_MP2_5', 'HUMEDAD', 
                        'PRESION', 'TEMPERATURA','DIA_SEM']

            dependent = ['AQI_MP2_5']

            number_of_features = len(variables)

            training_days = previous_item
            forecast_days = horizon_item

            if resample_item == 5:
                samples_per_day = 288
            elif resample_item == 30:
                samples_per_day = 48
            elif resample_item == 60:
                samples_per_day = 24

            step = forecast_days*samples_per_day

            train_months = relativedelta(months = 12)

            input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
            output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
            train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test

            X_train, y_train, X_test, y_test = utils_xgboost.get_everything(df_train, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)

            def objective(trial):
    
                params = {
                    'max_depth': trial.suggest_int('max_depth', 1, 9),
                    'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
                    'n_estimators': trial.suggest_int('n_estimators', 10, 300),
                    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                    'gamma': trial.suggest_loguniform('gamma', 1e-5, 1.0),
                    'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
                    'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
                }
                
                optuna_xgb_model = xgb.XGBRegressor(** params)
                
                trained_xgb_model = MultiOutputRegressor(optuna_xgb_model).fit(X_train , y_train)

                prediction = trained_xgb_model.predict(X_test)

                MAPE = mean_absolute_percentage_error(prediction, y_test)
                print('MAPE: ', MAPE)
                MAE = np.mean(np.abs(prediction - y_test))
                print('MAE: ', MAE)

                MSE = mean_squared_error(prediction, y_test)
                
                return MSE

            study = optuna.create_study(direction='minimize')

            study.optimize(objective, n_trials = 100 )

            trial = study.best_trial

            joblib.dump(study, "optuna_studies/XGBOOST/experimento_full/study_XGBOOST_mejoresparam_resample_"+str(resample_item)+"_horizon_"+str(horizon_item)+"_previous_"+str(previous_item)+".pkl")

            # VALIDATION

            # study = joblib.load("optuna_studies/XGBOOST/experimento_full/study_XGBOOST_resample_"+str(resample_item)+"_horizon_"+str(horizon_item)+"_previous_"+str(previous_item)+".pkl")
            params = study.best_params

            predicciones = {}
            metricas = {}
            
            for i in range(1,11):

                estacion = i

                X_train_val, y_train_val = utils_xgboost.get_validation(df_test, 
                                                                        estacion,
                                                                        variables, 
                                                                        dependent, 
                                                                        train_test_samples, 
                                                                        input_samples, 
                                                                        output_samples, 
                                                                        number_of_features,
                                                                        step)
                xgb_model = xgb.XGBRegressor(** params)

                trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

                prediction = trained_xgb_model.predict(X_train_val)
                
                # guardamos los valores predecidos vs reales en un diccionario
                
                predicciones[i] = {'real' : y_train_val, 'prediccion': prediction}


                pickle.dump(trained_xgb_model, open('models/models_xgboost/experimento_full/xgboost_mejoresparam_estacion_' + str(i) + '_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.pkl', 'wb'))

                mean_real = y_train_val.mean()
                mean_prediction = prediction.mean()

                MAPE = mean_absolute_percentage_error(prediction, y_train_val)
                MAE = mean_absolute_error(prediction, y_train_val)
                RMSE = mean_squared_error(prediction, y_train_val, squared = False)
                
                # guardamos las metricas en un diccionario
                
                metricas[i] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

            df_metricas = pd.DataFrame.from_dict(metricas)

            df_metricas.to_csv('metrics/XGBOOST/experimento_full/xgboost_experimento_full_mejoresparam_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.csv')

            list_dfs = []

            for station in range(1,11):
                d = {'TARGET': predicciones[station]['real'].flatten(), 'FORECAST': predicciones[station]['prediccion'].flatten()}
                df_aux = pd.DataFrame(data = d)
                df_aux['ESTACION'] = station
                list_dfs.append(df_aux)

            df_predicciones = pd.concat(list_dfs)

            df_predicciones.to_csv('datos/experimento_full/predicciones_10estaciones_mejoresparam_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.csv')

            print('study_'+ str(resample_item)+'_'+str(horizon_item)+'_'+ str(previous_item)+' done!')

# CORRELACION MAYOR AL 20%

In [None]:
datos = pd.read_csv('datos/230127_train_ESTACIONES.csv', parse_dates=['FECHAHORA'])
validacion = pd.read_csv('datos/230127_test_ESTACIONES.csv', parse_dates=['FECHAHORA'])

horizon = [0.25, 0.5, 1]
resample = [5, 30, 60]
previous = [2, 7, 15]

# Start resampling

for resample_item in resample:

    df_test = validacion.copy()
    df_train = datos.copy()

    df_test.drop(['ANHO', 'DIA', 'MES', 'HORA', 'MINUTO', 'DIA_TRAF_COD', 'TRAFICO_COD', 'TIPO_COD',
                'MEDICION_DIA', 'MP1_ANTERIOR', 'MP2_5_ANTERIOR', 'MP10_ANTERIOR',
                'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO'], axis = 1, inplace = True)

    df_train.drop(['ANHO', 'DIA', 'MES', 'HORA', 'MINUTO', 'DIA_TRAF_COD', 'TRAFICO_COD', 'TIPO_COD',
                'MEDICION_DIA', 'MP1_ANTERIOR', 'MP2_5_ANTERIOR', 'MP10_ANTERIOR',
                'TEMPERATURA_PRONOSTICO', 'HUMEDAD_PRONOSTICO', 'PRESION_PRONOSTICO'], axis = 1, inplace = True)

    df_train = df_train.set_index('FECHAHORA', drop = True)
    df_test = df_test.set_index('FECHAHORA', drop = True)

    lista_resample = []

    if resample_item != 5: 

        r_i = str(resample_item) + 'T'

        for station in range(1,11):
            df_aux = df_train[df_train['ESTACION'] == station]

            df_aux = df_aux.resample(r_i).mean()
            lista_resample.append(df_aux)

        df_train = pd.concat(lista_resample)

        lista_resample = []

        for station in range(1,11):
            df_aux = df_test[df_test['ESTACION'] == station]

            df_aux = df_aux.resample(r_i).mean()
            lista_resample.append(df_aux)

        df_test = pd.concat(lista_resample)

    df_test['ANHO'] = df_test.index.year
    df_test['MES'] = df_test.index.month
    df_test['DIA'] = df_test.index.day
    df_test['HORA'] = df_test.index.hour
    df_test['MINUTO'] = df_test.index.minute
    df_test['FECHAHORA'] = df_test.index
    df_test.reset_index(inplace = True, drop = True)

    df_train['ANHO'] = df_train.index.year
    df_train['MES'] = df_train.index.month
    df_train['HORA'] = df_train.index.hour
    df_train['MINUTO'] = df_train.index.minute
    df_train['DIA'] = df_train.index.day
    df_train['FECHAHORA'] = df_train.index
    df_train.reset_index(inplace = True, drop = True)

    df_test.to_csv('datos/experimento_full/test_resample_'+str(resample_item)+'_minutes.csv')
    df_train.to_csv('datos/experimento_full/train_resample_'+str(resample_item)+'_minutes.csv')

    # OPTUNA

    for previous_item in previous:
        for horizon_item in horizon:
            
            estacion = 4

            variables = ['MES', 'MP1', 'MP2_5', 'MP10', 'AQI_MP10', 'AQI_MP2_5', 'HUMEDAD', 
                        'PRESION', 'TEMPERATURA','DIA_SEM']

            dependent = ['AQI_MP2_5']

            number_of_features = len(variables)

            training_days = previous_item
            forecast_days = horizon_item

            if resample_item == 5:
                samples_per_day = 288
            elif resample_item == 30:
                samples_per_day = 48
            elif resample_item == 60:
                samples_per_day = 24

            step = forecast_days*samples_per_day

            train_months = relativedelta(months = 12)

            input_samples = int(samples_per_day * training_days) # cantidad de muestras en 7 dias
            output_samples = int(samples_per_day * forecast_days) # cantidad de muestras en 1 dia
            train_test_samples = int(input_samples + output_samples) # cantidad de datos para el train_test

            X_train, y_train, X_test, y_test = utils_xgboost.get_everything(df_train, 
                                                                estacion,
                                                                train_months, 
                                                                variables, 
                                                                dependent, 
                                                                train_test_samples, 
                                                                input_samples, 
                                                                output_samples, 
                                                                number_of_features,
                                                                step)

            def objective(trial):
    
                params = {
                    'max_depth': trial.suggest_int('max_depth', 1, 9),
                    'learning_rate': trial.suggest_loguniform('learning_rate', 0.01, 1.0),
                    'n_estimators': trial.suggest_int('n_estimators', 10, 300),
                    'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
                    'gamma': trial.suggest_loguniform('gamma', 1e-5, 1.0),
                    'subsample': trial.suggest_loguniform('subsample', 0.01, 1.0),
                    'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
                }
                
                optuna_xgb_model = xgb.XGBRegressor(** params)
                
                trained_xgb_model = MultiOutputRegressor(optuna_xgb_model).fit(X_train , y_train)

                prediction = trained_xgb_model.predict(X_test)

                MAPE = mean_absolute_percentage_error(prediction, y_test)
                print('MAPE: ', MAPE)
                MAE = np.mean(np.abs(prediction - y_test))
                print('MAE: ', MAE)

                MSE = mean_squared_error(prediction, y_test)
                
                return MSE

            study = optuna.create_study(direction='minimize')

            study.optimize(objective, n_trials = 100 )

            trial = study.best_trial

            joblib.dump(study, "optuna_studies/XGBOOST/experimento_full/study_XGBOOST_mejoresparam_resample_"+str(resample_item)+"_horizon_"+str(horizon_item)+"_previous_"+str(previous_item)+".pkl")

            # VALIDATION

            # study = joblib.load("optuna_studies/XGBOOST/experimento_full/study_XGBOOST_resample_"+str(resample_item)+"_horizon_"+str(horizon_item)+"_previous_"+str(previous_item)+".pkl")
            params = study.best_params

            predicciones = {}
            metricas = {}
            
            for i in range(1,11):

                estacion = i

                X_train_val, y_train_val = utils_xgboost.get_validation(df_test, 
                                                                        estacion,
                                                                        variables, 
                                                                        dependent, 
                                                                        train_test_samples, 
                                                                        input_samples, 
                                                                        output_samples, 
                                                                        number_of_features,
                                                                        step)
                xgb_model = xgb.XGBRegressor(** params)

                trained_xgb_model = MultiOutputRegressor(xgb_model).fit(X_train , y_train)

                prediction = trained_xgb_model.predict(X_train_val)
                
                # guardamos los valores predecidos vs reales en un diccionario
                
                predicciones[i] = {'real' : y_train_val, 'prediccion': prediction}


                pickle.dump(trained_xgb_model, open('models/models_xgboost/experimento_full/xgboost_mejoresparam_estacion_' + str(i) + '_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.pkl', 'wb'))

                mean_real = y_train_val.mean()
                mean_prediction = prediction.mean()

                MAPE = mean_absolute_percentage_error(prediction, y_train_val)
                MAE = mean_absolute_error(prediction, y_train_val)
                RMSE = mean_squared_error(prediction, y_train_val, squared = False)
                
                # guardamos las metricas en un diccionario
                
                metricas[i] = {'MAE': MAE, "MAPE": MAPE, 'RMSE': RMSE, 'Media real' : mean_real, 'Media predecida': mean_prediction}

            df_metricas = pd.DataFrame.from_dict(metricas)

            df_metricas.to_csv('metrics/XGBOOST/experimento_full/xgboost_experimento_full_mejoresparam_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.csv')

            list_dfs = []

            for station in range(1,11):
                d = {'TARGET': predicciones[station]['real'].flatten(), 'FORECAST': predicciones[station]['prediccion'].flatten()}
                df_aux = pd.DataFrame(data = d)
                df_aux['ESTACION'] = station
                list_dfs.append(df_aux)

            df_predicciones = pd.concat(list_dfs)

            df_predicciones.to_csv('datos/experimento_full/predicciones_10estaciones_mejoresparam_resample_'+str(resample_item)+'_horizon_'+str(horizon_item)+'_previous_'+str(previous_item)+'.csv')

            print('study_'+ str(resample_item)+'_'+str(horizon_item)+'_'+ str(previous_item)+' done!')