# SETUP

In [3]:
%%bash
pip install --upgrade pip tensorflow keras &> pip.log
pip install jupyterlab-vim pydot 'tensorflow[and-cuda]' &> /dev/null >> pip.log
apt-get install graphviz -y &> apt.log

## Libraries

In [5]:
# System
import os
import glob
import shutil
import copy
import re
from datetime import datetime
import logging

# Data
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Data processing
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Model
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from keras import Input, Model, Sequential
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Dropout, GRU, Conv1D, MaxPooling1D, Flatten
from keras.utils import plot_model
from keras.saving import load_model
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
from keras.optimizers import Adam
from keras.losses import MeanAbsoluteError
from keras.losses import MeanAbsoluteError, MeanSquaredError
import keras.backend as K

# Custom libraries written by myself
from src.drawing import plot_1_data, plot_2_data
from src.prediction_model.lstm import predictLSTM, evaluateLSTM
from src.reduction_model.lstm_s2s import LSTMSeq2SeqReduction
from src.reduction_model.gru_s2s import GRUSeq2SeqReduction
from src.reduction_model.cnnlstm_s2s import CNNLSTMSeq2SeqReduction
from src.prediction_model.lstm import predictLSTM, evaluateLSTM

# Configuration reader
from src.config_reader import ConfigurationReader

# Checking Tensorflow
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))

2.19.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


## Configuration

In [6]:
global_conf = ConfigurationReader("/le_thanh_van_118/workspace/hiep_workspace/model_params.json")
print(global_conf)

# Clear all temp folders
def cleanDir(input_dir):
    if os.path.exists(input_dir):
        shutil.rmtree(input_dir)
    os.makedirs(input_dir)

cleanDir(global_conf.general["model_info_dir"])
cleanDir(global_conf.general["model_checkpoints_dir"])

{
    "general": {
        "model_info_dir": "/le_thanh_van_118/workspace/hiep_workspace/model_info_dir",
        "model_checkpoints_dir": "/le_thanh_van_118/workspace/hiep_workspace/model_checkpoints"
    },
    "prediction": {
        "n_past": 7,
        "n_future": 1,
        "epochs": 20,
        "batch_size": 128
    },
    "reduction": {
        "n_past": 7,
        "n_future": 1,
        "epochs": 20,
        "batch_size": 128,
        "min_number_of_features": 18
    }
}


# MPAIR DATA

## Load dataset

* Every row has a tuple (i, j) which marks a "cell" in the Ho Chi Minh City grid map

| i/j | 0 | 1 | 2 | ... |
|-----|---|---|---|-----|
| 0   |   |   |   |     |
| 1   |   |   |   |     |
| 2   |   |   |   |     |
| ... |   |   |   |     |

* Every cell has its own geographical characteristics
* Every cell represent values of a specific date and specific location

### Raw data

In [7]:
df_mpair_raw = pd.read_csv("MPair.csv")
df_mpair_raw

Unnamed: 0,i,j,lat,lon,time,PM25_3km,TMP,RH,HPBL,WSPD,...,NEAR_DIST,BARELAND,BUILTUP,CROPLAND,GRASSLAND,TREECOVER,WATER,NDVI,AOD,AIMODEL
0,0,0,11.149747,106.300443,2018-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.000000,-9999.000000
1,0,1,11.149747,106.323330,2018-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.000000,-9999.000000
2,0,2,11.149747,106.346217,2018-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,0.156327,-9999.000000
3,0,3,11.149747,106.369103,2018-01-01,24.740000,25.460614,65.144661,2.448070,2.448070,...,7.265194e+01,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,0.167487,-9999.000000
4,0,4,11.149747,106.391990,2018-01-01,23.760931,25.462688,65.080498,2.438784,2.438784,...,1.107789e+02,4569.189136,21119.949784,76052.614180,231046.865089,662859.174184,4308.164827,,0.141881,-9999.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236845,34,30,10.385233,106.987043,2022-12-31,,25.815809,74.737156,207.873199,3.124345,...,3.072120e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.000000,14.998473
2236846,34,31,10.385233,107.009930,2022-12-31,,25.815809,75.000397,207.873199,3.215876,...,4.355371e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.000000,15.201084
2236847,34,32,10.385233,107.032816,2022-12-31,-9999.000000,25.601173,-9999.000000,142.227615,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.000000,0.000000
2236848,34,33,10.385233,107.055703,2022-12-31,-9999.000000,25.560753,-9999.000000,129.865280,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.000000,0.000000


In [8]:
df_mpair_raw.columns

Index(['i', 'j', 'lat', 'lon', 'time', 'PM25_3km', 'TMP', 'RH', 'HPBL', 'WSPD',
       'PRES2M', 'POP', 'ROAD_DEN_1km', 'ROAD_LEN_1km', 'PRIM_ROAD_LEN_1km',
       'NEAR_DIST', 'BARELAND', 'BUILTUP', 'CROPLAND', 'GRASSLAND',
       'TREECOVER', 'WATER', 'NDVI', 'AOD', 'AIMODEL'],
      dtype='object')

### Statistics

In [9]:
df_mpair_stat = df_mpair_raw.describe()

  return umr_sum(a, axis, dtype, out, keepdims, initial, where)
  sqr = _ensure_numeric((avg - values) ** 2)


In [10]:
df_mpair_stat.iloc[:, list(range(0,12))]

Unnamed: 0,i,j,lat,lon,PM25_3km,TMP,RH,HPBL,WSPD,PRES2M,POP,ROAD_DEN_1km
count,2236850.0,2236850.0,2236850.0,2236850.0,2222980.0,2236850.0,2236850.0,2236850.0,2236850.0,2236850.0,2236850.0,2236850.0
mean,17.0,17.0,10.76749,106.6895,-1753.033,-1362.026,-1661.22,-940.5143,-1725.172,85462.06,-15436.09,-5.388961e+37
std,10.09951,10.09951,0.2270946,0.231144,3818.433,3463.384,3810.681,3635.621,3781.445,38280.34,39468.0,1.24232e+38
min,0.0,0.0,10.38523,106.3004,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-99999.0,-3.4028230000000003e+38
25%,8.0,8.0,10.56512,106.4835,7.578345,25.44704,68.50341,342.2755,1.769627,100490.2,123.6246,0.6451674
50%,17.0,17.0,10.76749,106.6895,11.95651,26.57075,78.68076,475.2339,2.600184,100776.0,545.4543,4.697491
75%,26.0,26.0,10.96986,106.8955,16.88797,27.44273,86.41529,605.6798,3.479031,100980.5,1298.022,8.067769
max,34.0,34.0,11.14975,107.0786,102.8131,33.23537,99.56794,1568.239,12.81917,101595.6,56840.37,34.9134


In [11]:
df_mpair_stat.iloc[:, list(range(12,24))]

Unnamed: 0,ROAD_LEN_1km,PRIM_ROAD_LEN_1km,NEAR_DIST,BARELAND,BUILTUP,CROPLAND,GRASSLAND,TREECOVER,WATER,NDVI,AOD,AIMODEL
count,1822348.0,394416.0,2236850.0,2236850.0,2236850.0,2236850.0,2236850.0,2236850.0,2236850.0,779702.0,2235890.0,2236850.0
mean,10670.29,131.058152,-inf,12615.59,163871.3,141637.7,106014.1,249892.3,79168.29,-2501.601072,-inf,-5994.904
std,9746.935,832.895664,inf,24721.16,221723.3,192205.2,97314.47,229657.3,163304.0,7461.592816,,4906.239
min,0.0,0.0,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-inf,-9999.0
25%,4510.964,0.0,27.99286,555.2163,2151.326,3157.222,10932.64,15182.08,54.24711,-9999.0,-9999.0,-9999.0
50%,8658.898,0.0,70.90635,4823.034,79651.72,47372.17,100526.0,211707.4,6301.0,1718.917701,-9999.0,-9999.0
75%,14435.56,0.0,162.5445,17301.09,222150.8,218575.4,166599.8,399678.8,82678.68,4856.35363,0.4555829,16.00679
max,54608.36,8309.18,6176.464,233034.1,981710.0,858431.6,515140.8,972563.8,1249447.0,7861.422519,5.469734,45.73877


In [195]:
df_mpair_raw.isnull().sum()

i                          0
j                          0
lat                        0
lon                        0
time                       0
PM25_3km              406378
TMP                   309855
RH                    386505
HPBL                  309855
WSPD                  386505
PRES2M                309855
POP                   392590
ROAD_DEN_1km          354244
ROAD_LEN_1km          414502
PRIM_ROAD_LEN_1km    1842434
NEAR_DIST             383460
BARELAND              390764
BUILTUP               390764
CROPLAND              390764
GRASSLAND             390764
TREECOVER             390764
WATER                 390764
NDVI                 1840608
AOD                  1351263
AIMODEL              1342600
dtype: int64

### Remove invalid data

In [201]:
mpair_start_i, mpair_end_i = df_mpair_raw.iloc[0]["i"], df_mpair_raw.iloc[-1]["i"]
mpair_start_j, mpair_end_j = df_mpair_raw.iloc[0]["j"], df_mpair_raw.iloc[-1]["j"]
print(f"start_i = {mpair_start_i}, end_i = {mpair_end_i}\nstart_j = {mpair_start_j}, end_j = {mpair_end_j}")

start_i = 0, end_i = 34
start_j = 0, end_j = 34


In [213]:
numerical_columns = df_mpair_raw.columns[5:]
dftest = df_mpair_raw[(df_mpair_raw["i"] == 0) & (df_mpair_raw["j"] == 1)]
dftest

Unnamed: 0,i,j,lat,lon,time,PM25_3km,TMP,RH,HPBL,WSPD,...,NEAR_DIST,BARELAND,BUILTUP,CROPLAND,GRASSLAND,TREECOVER,WATER,NDVI,AOD,AIMODEL
1,0,1,11.149747,106.32333,2018-01-01,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
1226,0,1,11.149747,106.32333,2018-01-02,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
2451,0,1,11.149747,106.32333,2018-01-03,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
3676,0,1,11.149747,106.32333,2018-01-04,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
4901,0,1,11.149747,106.32333,2018-01-05,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2230726,0,1,11.149747,106.32333,2022-12-27,-9999.0,22.852858,-9999.0,405.038086,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,0.0
2231951,0,1,11.149747,106.32333,2022-12-28,-9999.0,24.134665,-9999.0,501.514008,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.235372,0.0
2233176,0,1,11.149747,106.32333,2022-12-29,-9999.0,24.032589,-9999.0,565.651306,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.109087,0.0
2234401,0,1,11.149747,106.32333,2022-12-30,-9999.0,24.167263,-9999.0,701.299072,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,0.0


In [217]:
mask = dftest[numerical_columns] <= -9999
print(dftest.size)
print(mask.values.sum())

45650
30880


In [203]:
data_by_locations_dir = "data_by_locations"
cleanDir(data_by_locations_dir)

all_locations_df = []
numerical_columns = df_mpair_raw.columns[5:]

for i in range(mpair_start_i, mpair_end_i + 1):
    for j in range(mpair_start_i, mpair_end_j + 1):
        # Get dataframe of current location i, j
        df_mpair_current_ij = df_mpair_raw[(df_mpair_raw["i"] == i) & (df_mpair_raw["j"] == j)]

        # Condition for invalid value
        mask = df_mpair_current_ij[numerical_columns] <= -9999
        total_matches = mask.values.sum()
        print(total_matches)

        # Check if the values of location i, j are invalid
        if df_mpair_current_ij[column_to_check].isna().all():
            # Save to file for debugging purpose
            df_mpair_current_ij.to_csv(f"{data_by_locations_dir}/df_mpair_{i}_{j}.csv", index=False)
            print(f"Location {i},{j} have invalid values")
        else:
            # Save to file for debugging purpose
            df_mpair_current_ij.to_csv(f"{data_by_locations_dir}/df_mpair_{i}_{j}_valid.csv", index=False)
            all_locations_df.append(df_mpair_current_ij)
            
print(f"Number of valid location i, j: {len(all_locations_df)}")
df_mpair_raw = pd.concat(all_locations_df)
df_mpair_raw

Number of valid location i, j: 1225


Unnamed: 0,i,j,lat,lon,time,PM25_3km,TMP,RH,HPBL,WSPD,...,NEAR_DIST,BARELAND,BUILTUP,CROPLAND,GRASSLAND,TREECOVER,WATER,NDVI,AOD,AIMODEL
0,0,0,11.149747,106.300443,2018-01-01,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
1225,0,0,11.149747,106.300443,2018-01-02,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
2450,0,0,11.149747,106.300443,2018-01-03,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
3675,0,0,11.149747,106.300443,2018-01-04,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
4900,0,0,11.149747,106.300443,2018-01-05,-9999.0,-9999.000000,-9999.0,-9999.000000,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2231949,34,34,10.385233,107.078590,2022-12-27,-9999.0,25.045456,-9999.0,486.050629,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,0.0
2233174,34,34,10.385233,107.078590,2022-12-28,-9999.0,25.773758,-9999.0,412.733246,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,0.0
2234399,34,34,10.385233,107.078590,2022-12-29,-9999.0,26.275782,-9999.0,475.094452,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,0.210801,0.0
2235624,34,34,10.385233,107.078590,2022-12-30,-9999.0,25.651445,-9999.0,312.502716,-9999.0,...,-1.797693e+308,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.0,-9999.000000,0.0


### Convert invalid values to NaN

Value less than or equal to -9999 are invalid
* -340282300000000032739046872047385837568.0
* -1797693000000000049799130911535464311773856769...
* -9999
* ...

It means those values are not collected

In [190]:
df_mpair_raw.iloc[1]

i                                                                    0
j                                                                    1
lat                                                          11.149747
lon                                                          106.32333
time                                                        2018-01-01
PM25_3km                                                       -9999.0
TMP                                                            -9999.0
RH                                                             -9999.0
HPBL                                                           -9999.0
WSPD                                                           -9999.0
PRES2M                                                         -9999.0
POP                                                           -99999.0
ROAD_DEN_1km                -340282300000000032739046872047385837568.0
ROAD_LEN_1km                                                       0.0
PRIM_R

In [191]:
numerical_columns = df_mpair_raw.columns[5:]
df_mpair_raw[(df_mpair_raw[numerical_columns] <= -9999)] = np.nan
df_mpair_raw

Unnamed: 0,i,j,lat,lon,time,PM25_3km,TMP,RH,HPBL,WSPD,...,NEAR_DIST,BARELAND,BUILTUP,CROPLAND,GRASSLAND,TREECOVER,WATER,NDVI,AOD,AIMODEL
0,0,0,11.149747,106.300443,2018-01-01,,,,,,...,,,,,,,,,,
1,0,1,11.149747,106.323330,2018-01-01,,,,,,...,,,,,,,,,,
2,0,2,11.149747,106.346217,2018-01-01,,,,,,...,,,,,,,,,0.156327,
3,0,3,11.149747,106.369103,2018-01-01,24.740000,25.460614,65.144661,2.448070,2.448070,...,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,0.167487,
4,0,4,11.149747,106.391990,2018-01-01,23.760931,25.462688,65.080498,2.438784,2.438784,...,110.778870,4569.189136,21119.949784,76052.614180,231046.865089,662859.174184,4308.164827,,0.141881,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236845,34,30,10.385233,106.987043,2022-12-31,,25.815809,74.737156,207.873199,3.124345,...,3072.120248,,,,,,,,,14.998473
2236846,34,31,10.385233,107.009930,2022-12-31,,25.815809,75.000397,207.873199,3.215876,...,4355.371213,,,,,,,,,15.201084
2236847,34,32,10.385233,107.032816,2022-12-31,,25.601173,,142.227615,,...,,,,,,,,,,0.000000
2236848,34,33,10.385233,107.055703,2022-12-31,,25.560753,,129.865280,,...,,,,,,,,,,0.000000


### Drop columns

In [187]:
# Based on the above null information, remove the columns which have number of null values larger than threshold
null_info = df_mpair_raw.isnull().sum().to_dict()
threshold_null = 1000000
columns_to_drop = [k for k, v in null_info.items() if v >= threshold_null]
columns_to_drop += ["AIMODEL"]
print(f"Drop columns: {columns_to_drop}")

# Start dropping
df_mpair_raw.drop(columns_to_drop, axis=1, inplace=True)
df_mpair_raw

Drop columns: ['PRIM_ROAD_LEN_1km', 'NDVI', 'AOD', 'AIMODEL', 'AIMODEL']


Unnamed: 0,i,j,lat,lon,time,PM25_3km,TMP,RH,HPBL,WSPD,...,POP,ROAD_DEN_1km,ROAD_LEN_1km,NEAR_DIST,BARELAND,BUILTUP,CROPLAND,GRASSLAND,TREECOVER,WATER
0,0,0,11.149747,106.300443,2018-01-01,,,,,,...,,,0.0000,,,,,,,
1,0,1,11.149747,106.323330,2018-01-01,,,,,,...,,,0.0000,,,,,,,
2,0,2,11.149747,106.346217,2018-01-01,,,,,,...,,,0.0000,,,,,,,
3,0,3,11.149747,106.369103,2018-01-01,24.740000,25.460614,65.144661,2.448070,2.448070,...,180.23763,4.945109,8360.3150,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765
4,0,4,11.149747,106.391990,2018-01-01,23.760931,25.462688,65.080498,2.438784,2.438784,...,188.42590,3.702707,6613.6245,110.778870,4569.189136,21119.949784,76052.614180,231046.865089,662859.174184,4308.164827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236845,34,30,10.385233,106.987043,2022-12-31,,25.815809,74.737156,207.873199,3.124345,...,,0.000000,,3072.120248,,,,,,
2236846,34,31,10.385233,107.009930,2022-12-31,,25.815809,75.000397,207.873199,3.215876,...,,0.000000,,4355.371213,,,,,,
2236847,34,32,10.385233,107.032816,2022-12-31,,25.601173,,142.227615,,...,,,0.0000,,,,,,,
2236848,34,33,10.385233,107.055703,2022-12-31,,25.560753,,129.865280,,...,,,0.0000,,,,,,,


### Filter out the cells with invalid values

For every cell (tuple i, j), if its "PM25_3km" is all NaN, remove it

In [163]:
data_by_locations_dir = "data_by_locations"
cleanDir(data_by_locations_dir)

all_locations_df = []
column_to_check = "PM25_3km"
for i in range(mpair_start_i, mpair_end_i + 1):
    for j in range(mpair_start_i, mpair_end_j + 1):
        # Get dataframe of current location i, j
        df_mpair_current_ij = df_mpair_raw[(df_mpair_raw["i"] == i) & (df_mpair_raw["j"] == j)]

        # Save to file for debugging purpose
        df_mpair_current_ij.to_csv(f"{data_by_locations_dir}/df_mpair_{i}_{j}.csv", index=False)

        # Check if the values of location i, j are invalid
        if df_mpair_current_ij[column_to_check].isna().all():
            print(f"Location {i},{j} have invalid values")
        else:
            all_locations_df.append(df_mpair_current_ij)
            
print(f"Number of valid location i, j: {len(all_locations_df)}")
df_mpair_raw = pd.concat(all_locations_df)
df_mpair_raw

Location 0,0 have invalid values
Location 0,1 have invalid values
Location 0,2 have invalid values
Location 0,32 have invalid values
Location 0,33 have invalid values
Location 0,34 have invalid values
Location 1,0 have invalid values
Location 1,1 have invalid values
Location 1,2 have invalid values
Location 1,32 have invalid values
Location 1,33 have invalid values
Location 1,34 have invalid values
Location 2,0 have invalid values
Location 2,1 have invalid values
Location 2,2 have invalid values
Location 2,32 have invalid values
Location 2,33 have invalid values
Location 2,34 have invalid values
Location 3,0 have invalid values
Location 3,1 have invalid values
Location 3,2 have invalid values
Location 3,32 have invalid values
Location 3,33 have invalid values
Location 3,34 have invalid values
Location 4,0 have invalid values
Location 4,1 have invalid values
Location 4,2 have invalid values
Location 4,32 have invalid values
Location 4,33 have invalid values
Location 4,34 have invalid va

Unnamed: 0,i,j,lat,lon,time,PM25_3km,TMP,RH,HPBL,WSPD,...,POP,ROAD_DEN_1km,ROAD_LEN_1km,NEAR_DIST,BARELAND,BUILTUP,CROPLAND,GRASSLAND,TREECOVER,WATER
3,0,3,11.149747,106.369103,2018-01-01,24.740000,25.460614,65.144661,2.448070,2.448070,...,180.23763,4.945109,8360.315,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765
1228,0,3,11.149747,106.369103,2018-01-02,34.290001,24.739857,66.633795,1.877883,1.877883,...,180.23763,4.945109,8360.315,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765
2453,0,3,11.149747,106.369103,2018-01-03,28.250000,26.696542,74.392069,1.809577,1.809577,...,180.23763,4.945109,8360.315,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765
3678,0,3,11.149747,106.369103,2018-01-04,30.870001,27.088334,76.627008,458.706106,1.855307,...,180.23763,4.945109,8360.315,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765
4903,0,3,11.149747,106.369103,2018-01-05,15.430000,25.933456,89.905732,378.742727,2.542371,...,180.23763,4.945109,8360.315,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2231944,34,29,10.385233,106.964156,2022-12-27,,25.725489,72.518082,377.298889,4.144145,...,583.99274,0.051030,,1944.460182,63.465354,,,42.310236,10.577559,421822.428565
2233169,34,29,10.385233,106.964156,2022-12-28,,25.644854,77.947990,236.179031,3.832138,...,583.99274,0.051030,,1944.460182,63.465354,,,42.310236,10.577559,421822.428565
2234394,34,29,10.385233,106.964156,2022-12-29,,25.985573,79.508736,438.569580,3.340813,...,583.99274,0.051030,,1944.460182,63.465354,,,42.310236,10.577559,421822.428565
2235619,34,29,10.385233,106.964156,2022-12-30,,25.691380,73.568398,311.416473,3.874893,...,583.99274,0.051030,,1944.460182,63.465354,,,42.310236,10.577559,421822.428565


In [164]:
df_mpair_raw.isnull().sum()

i                    0
j                    0
lat                  0
lon                  0
time                 0
PM25_3km         14403
TMP               3024
RH                3024
HPBL              3024
WSPD              3024
PRES2M            3024
POP               5478
ROAD_DEN_1km     85822
ROAD_LEN_1km    401720
NEAR_DIST            0
BARELAND         43824
BUILTUP          65736
CROPLAND         49302
GRASSLAND        41998
TREECOVER         5478
WATER           116864
dtype: int64

### Data preparation

* Convert "time" columns to Pandas datetime
* Set "time" column as index
* Sort data by "time"
* Lowercase all column names

In [21]:
# Convert "time" columns to Pandas datetime
df_mpair_raw = df_mpair_raw.assign(time=pd.to_datetime(df_mpair_raw["time"]))

# Set "time" column as index
df_mpair_raw.set_index("time", inplace=True)

# Lower case all column names
df_mpair_raw = df_mpair_raw.rename(columns={name: name.lower() for name in df_mpair_raw.columns})

# Print
df_mpair_raw

Unnamed: 0_level_0,i,j,lat,lon,pm25_3km,tmp,rh,hpbl,wspd,pres2m,...,prim_road_len_1km,near_dist,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-01,18,15,10.745004,106.643743,30.845961,25.045032,71.926500,2.339113,2.339113,100930.458102,...,3663.8025,6.909909,3286.840750,882259.549565,2312.864958,10563.271194,43083.900165,58415.405794,1742.916058,1.999975
2018-01-01,18,17,10.745004,106.689516,30.939211,25.082829,71.464797,2.237909,2.237909,100915.732074,...,4633.5020,18.555218,6924.069379,742455.993765,8958.648184,37665.567156,123921.617453,79841.648016,2087.080168,1.248172
2018-01-01,19,15,10.722519,106.643743,27.156076,25.022745,72.305654,2.302683,2.302683,100933.214552,...,2284.2550,70.356268,7978.455406,427846.224207,63148.502720,114955.708607,269190.714441,115011.679587,3690.303474,1.971691
2018-01-02,16,17,10.789976,106.689516,52.154430,24.712185,73.392725,1.881799,1.881799,100874.453618,...,4548.5900,19.481558,1056.256477,878404.088663,9.037443,1221.246398,86858.898791,32195.290892,1830.941629,0.914681
2018-01-02,17,17,10.767490,106.689516,49.599480,24.698446,73.641532,1.879498,1.879498,100880.492220,...,8309.1800,17.300117,1895.437302,879546.385595,229.889184,2906.313289,68310.042059,46929.703983,1876.516122,1.189908
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-12-29,17,17,10.767490,106.689516,27.491894,25.897594,67.093735,827.818787,2.823764,101331.265625,...,8309.1800,17.300117,1895.437302,879546.385595,229.889184,2906.313289,68310.042059,46929.703983,1876.516122,0.239528
2022-12-29,17,18,10.767490,106.712403,23.209021,27.186907,59.783350,880.392334,2.464040,101357.992188,...,4869.2314,28.249591,25943.414161,502457.783018,27350.370312,69966.235794,203949.897411,170512.111735,2389.140541,0.168873
2022-12-29,18,15,10.745004,106.643743,29.272182,26.616703,59.667720,621.919495,2.330549,101354.976562,...,3663.8025,6.909909,3286.840750,882259.549565,2312.864958,10563.271194,43083.900165,58415.405794,1742.916058,0.186290
2022-12-29,18,17,10.745004,106.689516,28.683048,25.632614,68.206148,565.613037,2.169897,101353.648438,...,4633.5020,18.555218,6924.069379,742455.993765,8958.648184,37665.567156,123921.617453,79841.648016,2087.080168,0.199954


* Convert "time" columns to Pandas datetime
* Set "time" column as index
* Sort data by "time"
* Lowercase all column names

In [None]:
# Convert "time" columns to Pandas datetime
df_mpair_raw = df_mpair_raw.assign(time=pd.to_datetime(df_mpair_raw["time"]))

# Set "time" column as index
df_mpair_raw.set_index("time", inplace=True)

# Lower case all column names
df_mpair_raw = df_mpair_raw.rename(columns={name: name.lower() for name in df_mpair_raw.columns})

# Print
df_mpair_raw

## Preprocessing

In [None]:
df_mpair = copy.deepcopy(df_mpair_raw)

### Extract columns: time, i, j

In [None]:
mpair_i_column = df_mpair.loc[:,"i"]
mpair_j_column = df_mpair.loc[:, "j"]
print(mpair_i_column)
print(mpair_j_column)

In [None]:
# Get the time indices
mpair_time_indices = df_mpair.index
mpair_time_indices

### Split into features and labels
* Label: pm25_3km
* Features: other columns
    * Dynamic features
    * Static features

In [None]:
mpair_label = "pm25_3km"
mpair_features = [item for item in df_mpair.columns if item not in [mpair_label,  "i", "j"]]
mpair_features

### Define static features and dynamic features

In [None]:
mpair_dyn_feats = ["pm25", "pm25_3km", "tmp", "rh", "hpbl", "wspd", "aod"]
mpair_stat_feats = list(set(mpair_features) - set(mpair_dyn_feats))
print(mpair_dyn_feats)
print(mpair_stat_feats)

In [None]:
X_mpair = df_mpair.loc[:, mpair_features]
y_mpair = df_mpair.loc[:, mpair_label].to_frame()

In [None]:
X_mpair

In [None]:
y_mpair

In [None]:
plot_1_data(data=y_mpair, datalabel="data", xlabel="Time step", ylabel="pm25_3km")

### Data normalization

In [None]:
mpair_features_scaler = MinMaxScaler()
X_mpair_scaled = mpair_features_scaler.fit_transform(X_mpair)
X_mpair_scaled = pd.DataFrame(X_mpair_scaled, columns=X_mpair.columns)
X_mpair_scaled

In [None]:
mpair_label_scaler = MinMaxScaler(feature_range=(0, 1))
y_mpair_scaled = mpair_label_scaler.fit_transform(y_mpair)
y_mpair_scaled = pd.DataFrame(y_mpair_scaled, columns=y_mpair.columns)
y_mpair_scaled

## Dimensionality Reduction Comparation

To compare the effect of each method, we will do the following things:
* Use the pm25_3km as the label
* Use other columns as the input features
* Reduce the dimension of input data using above methods
* Pass the processed input data to a model (e.g. LSTM, ...)

### Evaluation board

In [None]:
evaluation_board = {'dim_reduction_method': [], 'dim_after_reduction': [], 'prediction':[], 'mae': []}

### Range of dimension

In [None]:
RANGE_OF_DIMENSION = range(MIN_NUMBER_OF_FEATURES, X_mpair_scaled.shape[1])
for n in RANGE_OF_DIMENSION:
  print(n, end=' ')

### Prediction + LSTM-Seq2Seq

#### LSTM-Seq2Seq

In [None]:
class LSTMSeq2SeqReduction(object):
  def __init__(self, X_scaled, test_percentage=0.2, latent_dim=8, epochs=10, batch_size=10, verbose=0, model_name=None):
    # Hyper parameters
    self._verbose = verbose
    self._test_percentage = test_percentage
    self._latent_dim = latent_dim
    self._epochs = epochs
    self._batch_size = batch_size
    # Data
    self._X_scaled = X_scaled
    self._X_scaled_reframed = None
    self._y_scaled_reframed = None
    self._n_features = self._X_scaled.shape[1]
    self._prepare_data()
    # Models
    self._model = self._define_model()
    if model_name:
      self._model.name = model_name
    self._encoder_model = None

  # Get model information
  def get_model_info(self):
    print(self._model.summary())
    plot_model(self._model, to_file=f"{model_info_dir}/{self._model.name}.png", show_shapes=True, dpi=100)

  # Get encoder model information
  def get_encoder_model_info(self):
    print(self._encoder_model.summary())
    plot_model(self._encoder_model, to_file=f"{model_info_dir}/{self._encoder_model.name}.png", show_shapes=True, dpi=100)

  # Main execution method
  def execute(self):
    # Set logging to ERROR only
    tf.get_logger().setLevel(logging.ERROR)
    print("LSTMSeq2SeqReduction.execute(): is called") if self._verbose else None
    mae = self._train_model()
    print(f"LSTMSeq2SeqReduction.execute(): mae = {mae}") if self._verbose else None
    self._encoder_model = self._define_encoder_model()
    # Set logging to INFO only
    tf.get_logger().setLevel(logging.INFO)
    return self._encode_data()

  def _prepare_data(self):
    print("LSTMSeq2SeqReduction._prepare_data(): is called") if self._verbose else None
    # Padding data
    padded_before = pd.DataFrame([self._X_scaled.iloc[0]] * SEQ2SEQ_N_PAST)
    padded_after = pd.DataFrame([self._X_scaled.iloc[-1]] * (SEQ2SEQ_N_FUTURE - 1))
    X_scaled_padded = pd.concat([padded_before, self._X_scaled, padded_after], axis=0, ignore_index=True)
    # Reframe data
    self._X_scaled_reframed, self._y_scaled_reframed = reframePastFuture(X_scaled_padded, SEQ2SEQ_N_PAST, SEQ2SEQ_N_FUTURE)

  # Define the model
  def _define_model(self):
    print("LSTMSeq2SeqReduction._define_model(): is called") if self._verbose else None
    # Encoder layers
    encoder_inputs = Input(shape=(SEQ2SEQ_N_PAST, self._n_features))
    encoder_lstm_1 = LSTM(100, return_sequences=True, activation="relu")(encoder_inputs)
    encoder_outputs, state_h, state_c = LSTM(50, return_state=True, activation="relu")(encoder_lstm_1)
    encoder_dense = Dense(self._latent_dim)(encoder_outputs)
    # Repeat layer
    decoder_repeat_vector = RepeatVector(SEQ2SEQ_N_FUTURE)(encoder_dense)
    # Decoder layers
    decoder_lstm_1 = LSTM(50, return_sequences=True, activation="relu")(decoder_repeat_vector, initial_state=[state_h, state_c])
    decoder_lstm_2 = LSTM(100, return_sequences=True, activation="relu")(decoder_lstm_1)
    decoder_outputs = TimeDistributed(Dense(self._n_features))(decoder_lstm_2)
    # Compile the model
    lstm_seq2seq = Model(encoder_inputs, decoder_outputs)
    lstm_seq2seq.compile(optimizer=Adam(learning_rate=0.001), loss=MeanAbsoluteError())
    return lstm_seq2seq

  # Train and evaluate the model
  def _train_model(self):
    print("LSTMSeq2SeqReduction._train_model(): is called") if self._verbose else None
    X_train, X_test, y_train, y_test = splitTrainTestTimeSeries(self._X_scaled_reframed, self._y_scaled_reframed, test_percentage=self._test_percentage)
    self._model.fit(X_train, y_train,
                    epochs=self._epochs,
                    batch_size=self._batch_size,
                    validation_data=(X_test, y_test),
                    shuffle=False,
                    verbose=self._verbose)
    y_predicted= self._model.predict(X_test, verbose=self._verbose)
    mae = self._model.evaluate(y_predicted, y_test, verbose=self._verbose)
    return mae

  # Define the Encoder model
  def _define_encoder_model(self):
    print("LSTMSeq2SeqReduction._define_encoder_model(): is called") if self._verbose else None
    # Encoder only
    encoder_inputs = Input(shape=(SEQ2SEQ_N_PAST, self._n_features))
    encoder_lstm_1 = self._model.layers[1](encoder_inputs)
    encoder_outputs, _, _ = self._model.layers[2](encoder_lstm_1)
    encoder_dense = self._model.layers[3](encoder_outputs)
    # Compile the model
    encoder_lstm_s2s = Model(encoder_inputs, encoder_dense)
    encoder_lstm_s2s.compile(optimizer=Adam(learning_rate=0.001), loss=MeanAbsoluteError())
    encoder_lstm_s2s.name = self._model.name + "_encoder"
    return encoder_lstm_s2s

  # Reduce dimension with trained Encoder
  def _encode_data(self):
    print("LSTMSeq2SeqReduction._encode_data(): is called") if self._verbose else None
    return pd.DataFrame(self._encoder_model.predict(self._X_scaled_reframed, verbose=self._verbose))

#### Doing the loop

In [None]:
loopresults = {i:{"mae": None, "encoded_data": None, "evaluation_data": None} for i in RANGE_OF_DIMENSION}

# Loop between min and (number of features - 1) to choose what number is the best
for n in RANGE_OF_DIMENSION:
  # Apply Seq2seq
  lstms2s = LSTMSeq2SeqReduction(X_mpair_scaled,
                                 test_percentage=0.2,
                                 latent_dim=n,
                                 epochs=SEQ2SEQ_EPOCHS, batch_size=SEQ2SEQ_BATCH_SIZE,
                                 verbose=0,
                                 model_name=f"mpair_lstms2s_dim_reduction_{n}_features")
  X_mpair_scaled_lstm_s2s_encoded = lstms2s.execute()

  lstms2s.get_model_info()
  lstms2s.get_encoder_model_info()

  # Prediction
  y_pred, y_test = predictLSTM(X_mpair_scaled_lstm_s2s_encoded, y_mpair_scaled,
                             MPAIR_LSTM_N_PAST, MPAIR_LSTM_N_FUTURE, MPAIR_LSTM_EPOCHS, MPAIR_LSTM_BATCH_SIZE,
                             model_name=f"mpair_lstm_prediction_with_lstms2s_dim_reduction_{n}_features",
                             verbose=0)

  # Evaluation
  all_days_inv_y_pred, all_days_inv_y_test, all_days_mae, avg_mae = evaluateLSTM(y_pred, y_test, mpair_label_scaler, verbose=0)

  # Logging
  loopresults[n]["mae"] = avg_mae
  loopresults[n]["encoded_data"] = X_mpair_scaled_lstm_s2s_encoded
  loopresults[n]["evaluation_data"] = (all_days_inv_y_pred, all_days_inv_y_test, all_days_mae, avg_mae)

In [None]:
for n in loopresults.keys():
  print(f"N = {n} - MAE = {loopresults[n]['mae']}")

values = [loopresults[n]["mae"] for n in RANGE_OF_DIMENSION]
plt.plot(RANGE_OF_DIMENSION, values)
plt.xticks(RANGE_OF_DIMENSION)
plt.xlabel("Number of components")
plt.yticks(np.arange(min(values), max(values) + 0.1, 0.2))
plt.ylabel("Mean Absolute Error (MAE)")
plt.show()

best_mae = 1000
best_num_of_components = 0
best_encoded_data = None
best_evaluation_data = None
for n in loopresults.keys():
  if loopresults[n]["mae"] < best_mae:
    best_num_of_components = n
    best_mae = loopresults[n]["mae"]
    best_encoded_data = loopresults[n]["encoded_data"]
    best_evaluation_data = loopresults[n]["evaluation_data"]

evaluation_board['dim_reduction_method'] += ["LSTM-Seq2Seq"]
evaluation_board['dim_after_reduction'] += [best_num_of_components]
evaluation_board['prediction'] += ["LSTM"]
evaluation_board['mae'] += [best_mae]

In [None]:
all_days_inv_y_pred, all_days_inv_y_test, _, _ = best_evaluation_data
for day in range(MPAIR_LSTM_N_FUTURE):
  inv_y_pred = all_days_inv_y_pred[day,:,:]
  inv_y_test = all_days_inv_y_test[day,:,:]
  print(f"inv_y_pred.shape = {inv_y_pred.shape}\ninv_y_test.shape = {inv_y_test.shape}")
  plot_2_data(data1=inv_y_pred,
              data2=inv_y_test,
              datalabel1="Prediction",
              datalabel2="Actual",
              xlabel="Time step",
              ylabel="PM2.5")

In [None]:
df_evaluation_board = pd.DataFrame(evaluation_board)
df_evaluation_board

In [None]:
stophere

#### Prepare reduced data

In [None]:
best_encoded_data

### Prediction + GRU-Seq2Seq

#### GRU-Seq2Seq

In [None]:
class GRUSeq2SeqReduction(object):
  def __init__(self, X_scaled, test_percentage=0.2, latent_dim=8, epochs=10, batch_size=10, verbose=0):
    # Hyper parameters
    self._verbose = verbose
    self._test_percentage = test_percentage
    self._latent_dim = latent_dim
    self._epochs = epochs
    self._batch_size = batch_size

    # Data
    self._X_scaled = X_scaled
    self._X_scaled_reframed = None
    self._y_scaled_reframed = None
    self._n_features = self._X_scaled.shape[1]
    self._prepare_data()

    # Models
    self._model = self._define_model()
    self._encoder_model = None

  # Get model information
  def get_model_info(self):
    print(self._model.summary())
    plot_model(self._model, to_file=f"{model_info_dir}/{self._model.name}.png", show_shapes=True, dpi=100)

  # Get encoder model information
  def get_encoder_model_info(self):
    print(self._encoder_model.summary())
    plot_model(self._encoder_model, to_file=f"{model_info_dir}/{self._encoder_model.name}.png", show_shapes=True, dpi=100)

  # Main execution method
  def execute(self):
    # Set logging to ERROR only
    tf.get_logger().setLevel(logging.ERROR)
    print("GRUSeq2SeqReduction.execute(): is called") if self._verbose else None
    mae = self._train_model()
    print(f"GRUSeq2SeqReduction.execute(): mae = {mae}") if self._verbose else None
    self._encoder_model = self._define_encoder_model()
    # Set logging to INFO only
    tf.get_logger().setLevel(logging.INFO)
    return self._encode_data()

  def _prepare_data(self):
    print("GRUSeq2SeqReduction._prepare_data(): is called") if self._verbose else None
    # Padding data
    padded_before = pd.DataFrame([self._X_scaled.iloc[0]] * SEQ2SEQ_N_PAST)
    padded_after = pd.DataFrame([self._X_scaled.iloc[-1]] * (SEQ2SEQ_N_FUTURE - 1))
    X_scaled_padded = pd.concat([padded_before, self._X_scaled, padded_after], axis=0, ignore_index=True)
    # Reframe data
    self._X_scaled_reframed, self._y_scaled_reframed = reframePastFuture(X_scaled_padded, SEQ2SEQ_N_PAST, SEQ2SEQ_N_FUTURE)

  # Define the model
  def _define_model(self):
    print("GRUSeq2SeqReduction._define_model(): is called") if self._verbose else None
    # Encoder layers
    encoder_inputs = Input(shape=(SEQ2SEQ_N_PAST, self._n_features))
    encoder_gru_1 = GRU(100, return_sequences=True, activation="relu")(encoder_inputs)
    encoder_outputs, state_h = GRU(50, return_state=True, activation="relu")(encoder_gru_1)
    encoder_dense = Dense(self._latent_dim, activation="softmax")(encoder_outputs)
    # Repeat layer
    decoder_repeat_vector = RepeatVector(SEQ2SEQ_N_FUTURE)(encoder_dense)
    # Decoder layers
    decoder_gru_1 = GRU(50, return_sequences=True, activation="relu")(decoder_repeat_vector, initial_state=state_h)
    decoder_gru_2 = GRU(100, return_sequences=True, activation="relu")(decoder_gru_1)
    decoder_outputs = TimeDistributed(Dense(self._n_features))(decoder_gru_2)
    # Compile the model
    gru_seq2seq = Model(encoder_inputs, decoder_outputs)
    gru_seq2seq.compile(optimizer=Adam(learning_rate=0.001), loss=MeanAbsoluteError())
    return gru_seq2seq

  # Train and evaluate the model
  def _train_model(self):
    print("GRUSeq2SeqReduction._train_model(): is called") if self._verbose else None
    X_train, X_test, y_train, y_test = splitTrainTestTimeSeries(self._X_scaled_reframed, self._y_scaled_reframed, test_percentage=self._test_percentage)
    self._model.fit(X_train, y_train,
                    epochs=self._epochs,
                    batch_size=self._batch_size,
                    validation_data=(X_test, y_test),
                    shuffle=False,
                    verbose=self._verbose)
    y_predicted= self._model.predict(X_test, verbose=self._verbose)
    mae = self._model.evaluate(y_predicted, y_test, verbose=self._verbose)
    return mae

  # Define the Encoder model
  def _define_encoder_model(self):
    print("GRUSeq2SeqReduction._define_encoder_model(): is called") if self._verbose else None
    # Encoder only
    encoder_inputs = Input(shape=(SEQ2SEQ_N_PAST, self._n_features))
    encoder_gru_1 = self._model.layers[1](encoder_inputs)
    encoder_outputs, _ = self._model.layers[2](encoder_gru_1)
    encoder_dense = self._model.layers[3](encoder_outputs)
    # Compile the model
    encoder_gru_s2s = Model(encoder_inputs, encoder_dense)
    encoder_gru_s2s.compile(optimizer=Adam(learning_rate=0.001), loss=MeanAbsoluteError())
    encoder_gru_s2s.name = self._model.name + "_encoder"
    return encoder_gru_s2s

  # Reduce dimension with trained Encoder
  def _encode_data(self):
    print("GRUSeq2SeqReduction._encode_data(): is called") if self._verbose else None
    return pd.DataFrame(self._encoder_model.predict(self._X_scaled_reframed, verbose=self._verbose))

#### Doing the loop

In [None]:
loopresults = {i:{"mae": None, "encoded_data": None, "evaluation_data": None} for i in RANGE_OF_DIMENSION}

# Loop between min and (number of features - 1) to choose what number is the best
for n in RANGE_OF_DIMENSION:
  # Apply Seq2seq
  grus2s = GRUSeq2SeqReduction(X_mpair_scaled, test_percentage=0.2, latent_dim=n, epochs=SEQ2SEQ_EPOCHS, batch_size=SEQ2SEQ_BATCH_SIZE, verbose=0)
  X_mpair_scaled_gru_s2s_encoded = grus2s.execute()

  grus2s.get_model_info()
  grus2s.get_encoder_model_info()

  # Prediction
  y_pred, y_test = predictLSTM(X_mpair_scaled_gru_s2s_encoded, y_mpair_scaled,
                             MPAIR_LSTM_N_PAST, MPAIR_LSTM_N_FUTURE, MPAIR_LSTM_EPOCHS, MPAIR_LSTM_BATCH_SIZE,
                             model_name=f"mpair_lstm_grus2s_dim_reduction_{n}_features",
                             verbose=0)

  # Evaluation
  all_days_inv_y_pred, all_days_inv_y_test, all_days_mae, avg_mae = evaluateLSTM(y_pred, y_test, mpair_label_scaler, verbose=0)

  # Logging
  print(f"N = {n} - MAE = {avg_mae}")
  loopresults[n]["mae"] = avg_mae
  loopresults[n]["encoded_data"] = X_mpair_scaled_gru_s2s_encoded
  loopresults[n]["evaluation_data"] = (all_days_inv_y_pred, all_days_inv_y_test, all_days_mae, avg_mae)

In [None]:
for n in loopresults.keys():
  print(f"N = {n} - MAE = {loopresults[n]['mae']}")

values = [loopresults[n]["mae"] for n in RANGE_OF_DIMENSION]
plt.plot(RANGE_OF_DIMENSION, values)
plt.xticks(RANGE_OF_DIMENSION)
plt.xlabel("Number of components")
plt.yticks(np.arange(min(values), max(values) + 0.1, 0.1))
plt.ylabel("Mean Absolute Error (MAE)")
plt.show()

best_mae = 1000
best_num_of_components = 0
best_encoded_data = None
best_evaluation_data = None
for n in loopresults.keys():
  if loopresults[n]["mae"] < best_mae:
    best_num_of_components = n
    best_mae = loopresults[n]["mae"]
    best_encoded_data = loopresults[n]["encoded_data"]
    best_evaluation_data = loopresults[n]["evaluation_data"]

evaluation_board['dim_reduction_method'] += ["GRU-Seq2Seq"]
evaluation_board['dim_after_reduction'] += [best_num_of_components]
evaluation_board['prediction'] += ["LSTM"]
evaluation_board['mae'] += [best_mae]

In [None]:
all_days_inv_y_pred, all_days_inv_y_test, _, _ = best_evaluation_data
for day in range(MPAIR_LSTM_N_FUTURE):
  inv_y_pred = all_days_inv_y_pred[day,:,:]
  inv_y_test = all_days_inv_y_test[day,:,:]
  print(f"inv_y_pred.shape = {inv_y_pred.shape}\ninv_y_test.shape = {inv_y_test.shape}")
  plot_2_data(data1=inv_y_pred,
              data2=inv_y_test,
              datalabel1="Prediction",
              datalabel2="Actual",
              xlabel="Time step",
              ylabel="PM2.5")

In [None]:
df_evaluation_board = pd.DataFrame(evaluation_board)
df_evaluation_board

#### Prepare reduced data

### Prediction + CNN-LSTM Seq2seq

#### CNN-LSTM Seq2seq model

In [None]:
class CNNLSTMSeq2SeqReduction(object):
  def __init__(self, X_scaled, test_percentage=0.2, latent_dim=8, epochs=10, batch_size=10, verbose=0):
    # Hyper parameters
    self._verbose = verbose
    self._test_percentage = test_percentage
    self._latent_dim = latent_dim
    self._epochs = epochs
    self._batch_size = batch_size
    # Data
    self._X_scaled = X_scaled
    self._X_scaled_reframed = None
    self._y_scaled_reframed = None
    self._n_features = self._X_scaled.shape[1]
    self._prepare_data()
    # Models
    self._model = self._define_model()
    self._encoder_model = None

  # Get model information
  def get_model_info(self):
    print(self._model.summary())
    plot_model(self._model, to_file=f"{model_info_dir}/{self._model.name}.png", show_shapes=True, dpi=100)

  # Get encoder model information
  def get_encoder_model_info(self):
    print(self._encoder_model.summary())
    plot_model(self._encoder_model, to_file=f"{model_info_dir}/{self._encoder_model.name}.png", show_shapes=True, dpi=100)

  # Main execution method
  def execute(self):
    # Set logging to ERROR
    tf.get_logger().setLevel(logging.ERROR)
    print("CNNLSTMSeq2SeqReduction.execute(): is called") if self._verbose else None
    mae = self._train_model()
    print(f"CNNLSTMSeq2SeqReduction.execute(): mae = {mae}") if self._verbose else None
    self._encoder_model = self._define_encoder_model()
    # Set logging to INFO
    tf.get_logger().setLevel(logging.INFO)
    return self._encode_data()

  def _prepare_data(self):
    print("CNNLSTMSeq2SeqReduction._prepare_data(): is called") if self._verbose else None
    # Padding data
    padded_before = pd.DataFrame([self._X_scaled.iloc[0]] * SEQ2SEQ_N_PAST)
    padded_after = pd.DataFrame([self._X_scaled.iloc[-1]] * (SEQ2SEQ_N_FUTURE - 1))
    X_scaled_padded = pd.concat([padded_before, self._X_scaled, padded_after], axis=0, ignore_index=True)
    # Reframe data
    self._X_scaled_reframed, self._y_scaled_reframed = reframePastFuture(X_scaled_padded, SEQ2SEQ_N_PAST, SEQ2SEQ_N_FUTURE)

  # Define the model
  def _define_model(self):
    print("CNNLSTMSeq2SeqReduction._define_model(): is called") if self._verbose else None
    # Encoder layers
    encoder_inputs = Input(shape=(SEQ2SEQ_N_PAST, self._n_features))
    encoder_cnn_1 = Conv1D(filters=24, kernel_size=3, activation="relu")(encoder_inputs)
    encoder_cnn_2 = Conv1D(filters=12, kernel_size=3, activation="relu")(encoder_cnn_1)
    encoder_max_pooling = MaxPooling1D(pool_size=2)(encoder_cnn_2)
    encoder_flatten = Flatten()(encoder_max_pooling)
    encoder_repeat_vector = RepeatVector(SEQ2SEQ_N_FUTURE)(encoder_flatten)
    encoder_outputs, state_h, state_c = LSTM(50, return_state=True, activation="relu")(encoder_repeat_vector)
    encoder_dense = Dense(self._latent_dim)(encoder_outputs)
    # Repeat layer
    decoder_repeat_vector = RepeatVector(SEQ2SEQ_N_FUTURE)(encoder_dense)
    # Decoder layers
    decoder_lstm_1 = LSTM(50, return_sequences=True, activation="relu")(decoder_repeat_vector, initial_state=[state_h, state_c])
    decoder_dense_1 = TimeDistributed(Dense(24, activation="relu"))(decoder_lstm_1)
    decoder_outputs = TimeDistributed(Dense(self._n_features))(decoder_dense_1)
    # Compile the model
    cnn_lstm_seq2seq = Model(encoder_inputs, decoder_outputs)
    cnn_lstm_seq2seq.compile(optimizer=Adam(learning_rate=0.001), loss=MeanAbsoluteError())
    cnn_lstm_seq2seq
    return cnn_lstm_seq2seq

  # Train and evaluate the model
  def _train_model(self):
    print("CNNLSTMSeq2SeqReduction._train_model(): is called") if self._verbose else None
    X_train, X_test, y_train, y_test = splitTrainTestTimeSeries(self._X_scaled_reframed, self._y_scaled_reframed, test_percentage=self._test_percentage)
    self._model.fit(X_train, y_train,
                    epochs=self._epochs,
                    batch_size=self._batch_size,
                    validation_data=(X_test, y_test),
                    shuffle=False,
                    verbose=self._verbose)
    y_predicted= self._model.predict(X_test, verbose=self._verbose)
    mae = self._model.evaluate(y_predicted, y_test, verbose=self._verbose)
    return mae

  # Define the Encoder model
  def _define_encoder_model(self):
    print("CNNLSTMSeq2SeqReduction._define_encoder_model(): is called") if self._verbose else None
    # Encoder only
    encoder_inputs = Input(shape=(SEQ2SEQ_N_PAST, self._n_features))
    encoder_cnn_1 = self._model.layers[1](encoder_inputs)
    encoder_cnn_2 = self._model.layers[2](encoder_cnn_1)
    encoder_max_pooling = self._model.layers[3](encoder_cnn_2)
    encoder_flatten = self._model.layers[4](encoder_max_pooling)
    encoder_repeat_vector = self._model.layers[5](encoder_flatten)
    encoder_outputs, _, _ = self._model.layers[6](encoder_repeat_vector)
    encoder_dense = self._model.layers[7](encoder_outputs)

    # Compile the model
    encoder_cnn_lstm_s2s = Model(encoder_inputs, encoder_dense)
    encoder_cnn_lstm_s2s.compile(optimizer=Adam(), loss=MeanAbsoluteError())
    encoder_cnn_lstm_s2s.name = self._model.name + "_encoder"
    return encoder_cnn_lstm_s2s

  # Reduce dimension with trained Encoder
  def _encode_data(self):
    print("CNNLSTMSeq2SeqReduction._encode_data(): is called") if self._verbose else None
    return pd.DataFrame(self._encoder_model.predict(self._X_scaled_reframed, verbose=self._verbose))

#### Doing the loop

In [None]:
loopresults = {i:{"mae": None, "encoded_data": None, "evaluation_data": None} for i in RANGE_OF_DIMENSION}

# Loop between min and (number of features - 1) to choose what number is the best
for n in RANGE_OF_DIMENSION:
  # Apply Seq2seq
  cnnlstms2s = CNNLSTMSeq2SeqReduction(X_mpair_scaled, test_percentage=0.2, latent_dim=n, epochs=SEQ2SEQ_EPOCHS, batch_size=SEQ2SEQ_BATCH_SIZE, verbose=0)
  X_mpair_scaled_cnnlstm_s2s_encoded = cnnlstms2s.execute()

  cnnlstms2s.get_model_info()
  cnnlstms2s.get_encoder_model_info()

  # Prediction
  y_pred, y_test = predictLSTM(X_mpair_scaled_cnnlstm_s2s_encoded, y_mpair_scaled,
                             MPAIR_LSTM_N_PAST, MPAIR_LSTM_N_FUTURE, MPAIR_LSTM_EPOCHS, MPAIR_LSTM_BATCH_SIZE,
                             model_name=f"mpair_lstm_cnnlstm_s2s_dim_reduction_{n}_features",
                             verbose=0)

  # Evaluation
  all_days_inv_y_pred, all_days_inv_y_test, all_days_mae, avg_mae = evaluateLSTM(y_pred, y_test, mpair_label_scaler, verbose=0)

  # Logging
  loopresults[n]["mae"] = avg_mae
  loopresults[n]["encoded_data"] = X_mpair_scaled_cnnlstm_s2s_encoded
  loopresults[n]["evaluation_data"] = (all_days_inv_y_pred, all_days_inv_y_test, all_days_mae, avg_mae)

In [None]:
for n in loopresults.keys():
  print(f"N = {n} - MAE = {loopresults[n]['mae']}")

values = [loopresults[n]["mae"] for n in RANGE_OF_DIMENSION]
plt.plot(RANGE_OF_DIMENSION, values)
plt.xticks(RANGE_OF_DIMENSION)
plt.xlabel("Number of components")
plt.yticks(np.arange(min(values), max(values) + 0.1, 0.2))
plt.ylabel("Mean Absolute Error (MAE)")
plt.show()

best_mae = 1000
best_num_of_components = 0
best_encoded_data = None
best_evaluation_data = None
for n in loopresults.keys():
  if loopresults[n]["mae"] < best_mae:
    best_num_of_components = n
    best_mae = loopresults[n]["mae"]
    best_encoded_data = loopresults[n]["encoded_data"]
    best_evaluation_data = loopresults[n]["evaluation_data"]

evaluation_board['dim_reduction_method'] += ["CNN-LSTM-Seq2Seq"]
evaluation_board['dim_after_reduction'] += [best_num_of_components]
evaluation_board['prediction'] += ["LSTM"]
evaluation_board['mae'] += [best_mae]

In [None]:
all_days_inv_y_pred, all_days_inv_y_test, _, _ = best_evaluation_data
for day in range(MPAIR_LSTM_N_FUTURE):
  inv_y_pred = all_days_inv_y_pred[day,:,:]
  inv_y_test = all_days_inv_y_test[day,:,:]
  print(f"inv_y_pred.shape = {inv_y_pred.shape}\ninv_y_test.shape = {inv_y_test.shape}")
  plot_2_data(data1=inv_y_pred,
              data2=inv_y_test,
              datalabel1="Prediction",
              datalabel2="Actual",
              xlabel="Time step",
              ylabel="PM2.5")

In [None]:
df_evaluation_board = pd.DataFrame(evaluation_board)
df_evaluation_board

#### Prepare reduced data