# Reframe dataset

During this process I will select a specific pollutant to forecast along with exogenous variables that can influence its values, and reframe the current information as a supervised learning dataset. This step is necessary so that the LSTM can be correctly trained with the data it's given.

In [20]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler

In [21]:
# Load the interpolated dataset; CSV column 0 is parsed as dates and becomes the index.
filename = "semadet-aire-2023"
filepath = f"datasets/feature_eng/{filename}-interpolated.csv"
df = pd.read_csv(
    filepath,
    index_col=0,
    parse_dates=[0],
)

## Select pollutant to predict and exogenous variables

The first dataset to reframe will contain PM2.5 as the dependent (target) variable and temperature, relative humidity, wind speed and wind direction as the independent (exogenous) variables.

In [22]:
# The target pollutant comes first, followed by the exogenous inputs:
# temperature, relative humidity, wind speed and wind direction.
pollutant = "pm25"
features = [pollutant, "tmp", "rh", "ws", "wd"]

In [23]:
def select_df_features(df: pd.DataFrame, features: list) -> pd.DataFrame:
    """Return a new DataFrame holding only the requested columns of ``df``.

    Args:
        df: Source frame; its index is carried over unchanged.
        features: Column labels to keep, in the desired output order.
            A label that appears more than once is kept once, at its
            first position.

    Returns:
        An independent copy with the selected columns, so later edits to
        the result never touch ``df``.

    Raises:
        KeyError: If any label in ``features`` is not a column of ``df``.
    """
    # dict.fromkeys de-duplicates while preserving order, which mirrors what
    # assigning the same column repeatedly would have produced.
    wanted = list(dict.fromkeys(features))
    return df[wanted].copy()

In [24]:
# Keep only the target pollutant and its exogenous variables.
df_select = select_df_features(df, features)

In [25]:
# Preview the first three rows of the selected columns.
df_select.head(3)

Unnamed: 0_level_0,pm25,tmp,rh,ws,wd
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,18.194167,17.595652,56.725,1.98875,198.013158
2023-01-02,18.194167,16.38125,56.725,2.904583,235.478057
2023-01-03,24.248333,16.889474,52.979167,2.114167,225.967935


## Normalize data

Once the data is separated, it's important to normalize all the values. The data will be transformed to a common scale to improve the model's performance. In this case, the data held for each feature will be scaled to have values between 0 and 1.

In [26]:
# Scale every feature column independently into the [0, 1] range.
# NOTE(review): the scaler is fit on the full dataset, before the
# train/validation/test split performed further down. The min/max of the
# validation and test rows therefore influence the training inputs (data
# leakage). Consider fitting on the training rows only and reusing the fitted
# scaler to transform the other splits.
data_scaler = MinMaxScaler(feature_range=(0,1))
data_norm = data_scaler.fit_transform(df_select.values)

In [27]:
# Peek at the first five normalized rows (columns follow the order of `features`).
data_norm[:5]

array([[0.13167801, 0.27024266, 0.98321007, 0.25369939, 0.56250986],
       [0.13167801, 0.16490893, 0.98321007, 0.43416258, 0.66898425],
       [0.20868475, 0.20899078, 0.89336398, 0.2784125 , 0.6419567 ],
       [0.44371696, 0.3435578 , 0.8205077 , 0.04655261, 0.48990875],
       [0.74056655, 0.4209641 , 0.56386168, 0.11338474, 0.41487319]])

## Timeseries to supervised learning

In [28]:
def series_to_supervised(data, column_names, n_in=1, n_out=1, dropnan=True):
    """Reframe a time series as a supervised-learning table of shifted columns.

    Each output row holds the ``n_in`` previous observations of every
    variable, followed by the current observation and the next
    ``n_out - 1`` observations.

    Args:
        data: Observations ordered in time. Anything ``pd.DataFrame`` accepts:
            a 2-D array, a list of rows, a flat list, a 1-D array or a Series.
        column_names: Base label for each variable, in column order. Labels
            are suffixed with ``(t-i)``, ``(t)`` or ``(t+i)`` in the output.
        n_in: Number of lagged time steps used as inputs.
        n_out: Number of time steps to forecast, starting at ``t``.
        dropnan: When true, drop every row that contains a NaN. This removes
            the edge rows created by shifting, and also any row whose source
            data already had a missing value.

    Returns:
        A DataFrame with ``(n_in + n_out) * n_vars`` columns.

    Raises:
        IndexError: If ``column_names`` has fewer entries than ``data`` has
            variables.
    """
    df = pd.DataFrame(data)
    # Count variables on the constructed frame rather than on the raw input,
    # so nested lists, 1-D arrays and Series are handled like 2-D arrays.
    n_vars = df.shape[1]
    base_names = [column_names[j] for j in range(n_vars)]

    cols, names = [], []

    # Input sequence (t-n, ... t-1)
    for i in range(n_in, 0, -1):
        cols.append(df.shift(i))
        names += [f"{name}(t-{i})" for name in base_names]

    # Forecast sequence (t, t+1, ... t+n)
    for i in range(n_out):
        cols.append(df.shift(-i))
        suffix = "(t)" if i == 0 else f"(t+{i})"
        names += [f"{name}{suffix}" for name in base_names]

    # Put it all together
    agg = pd.concat(cols, axis=1)
    agg.columns = names

    # Drop rows with NaN values
    if dropnan:
        agg.dropna(inplace=True)

    return agg

In [29]:
# One lagged step as input (t-1) and one step to forecast (t).
data_reframed = series_to_supervised(data_norm, features, n_in=1, n_out=1)

In [30]:
# Keep every lagged input plus the target pollutant at time t. The other
# features' time-t columns are dropped by name rather than by fixed position,
# so this stays correct if the feature list changes and is safe to re-run.
target_column = f"{pollutant}(t)"
non_target_now = [
    col for col in data_reframed.columns
    if col.endswith("(t)") and col != target_column
]
data_reframed.drop(columns=non_target_now, inplace=True)

In [31]:
# Preview the reframed table: lagged inputs followed by the target at time t.
data_reframed.head(3)

Unnamed: 0,pm25(t-1),tmp(t-1),rh(t-1),ws(t-1),wd(t-1),pm25(t)
1,0.131678,0.270243,0.98321,0.253699,0.56251,0.131678
2,0.131678,0.164909,0.98321,0.434163,0.668984,0.208685
3,0.208685,0.208991,0.893364,0.278412,0.641957,0.443717


## Divide dataset

In [32]:
def divide_series(df: pd.DataFrame, train: float = 0.65, val: float = 0.15):
    """Split ``df`` into consecutive train, validation and test frames.

    Rows are split by position and keep their original order and index, so
    for a time-ordered frame the validation rows follow the training rows and
    the test rows come last.

    Args:
        df: Frame to split.
        train: Fraction of rows assigned to the training set.
        val: Fraction of rows assigned to the validation set. The test set
            receives whatever rows remain.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of independent copies.
        Split sizes are truncated to whole rows, so the test set absorbs the
        rounding remainder.
    """
    data_len = len(df)
    train_size = int(data_len * train)
    val_size = int(data_len * val)
    val_end = train_size + val_size

    # iloc makes the slicing explicitly positional, whatever the index type.
    train_df = df.iloc[:train_size].copy()
    val_df = df.iloc[train_size:val_end].copy()
    test_df = df.iloc[val_end:].copy()

    return train_df, val_df, test_df

In [33]:
# Split with the default fractions: 65% train, 15% validation, remainder test.
train_df, val_df, test_df = divide_series(data_reframed)

In [34]:
# Report how many rows ended up in each split.
split_report = [
    f"Train set len: {len(train_df)}",
    f"Validation set len: {len(val_df)}",
    f"Test set len: {len(test_df)}",
]
print("\n".join(split_report))

Train set len: 236
Validation set len: 54
Test set len: 74


In [35]:
# Preview the first three training rows.
train_df.head(3)

Unnamed: 0,pm25(t-1),tmp(t-1),rh(t-1),ws(t-1),wd(t-1),pm25(t)
1,0.131678,0.270243,0.98321,0.253699,0.56251,0.131678
2,0.131678,0.164909,0.98321,0.434163,0.668984,0.208685
3,0.208685,0.208991,0.893364,0.278412,0.641957,0.443717


In [36]:
# Write each split to its own CSV file, named after the target pollutant.
split_outputs = {
    f"datasets/reframe/train_{pollutant}.csv": train_df,
    f"datasets/reframe/val_{pollutant}.csv": val_df,
    f"datasets/reframe/test_{pollutant}.csv": test_df,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path)