# SETUP

In [1]:
USE_GPU = False

In [2]:
!pip install --upgrade pip tensorflow keras &> pip.log
!pip install jupyterlab-vim pydot >> pip.log
!pip install 'tensorflow[and-cuda]' >> pip.log
!pip install 'tensorflow-cpu' >> pip.log

!apt-get install graphviz -y &> apt.log

[0m

## Libraries

In [3]:
# System
import os
import glob
import shutil
import copy
import re
from datetime import datetime
import logging

# Data
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Data processing
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Model
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from keras import Input, Model, Sequential
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Dropout, GRU, Conv1D, MaxPooling1D, Flatten
from keras.utils import plot_model
from keras.saving import load_model
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
from keras.optimizers import Adam
from keras.losses import MeanAbsoluteError
from keras.losses import MeanAbsoluteError, MeanSquaredError
from keras.models import load_model
import keras.backend as K

# Custom libraries written by myself
from src.plot import plot_1_data, plot_2_data
from src.prediction_model.lstm import predictLSTM, evaluateLSTM
from src.reduction_model.lstm_s2s import LSTMSeq2SeqReduction
from src.reduction_model.gru_s2s import GRUSeq2SeqReduction
from src.reduction_model.cnnlstm_s2s import CNNLSTMSeq2SeqReduction
from src.prediction_model.lstm import predictLSTM, evaluateLSTM
from src.time_series_utils import splitTrainTestTimeSeries, reframePastFuture

# Configuration reader
from src.config_reader import ConfigurationReader

# Configure device
if not USE_GPU:
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'

# Checking Tensorflow
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
print(tf.config.list_physical_devices('CPU'))

E0000 00:00:1753183155.808324  214894 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753183155.814476  214894 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1753183155.829259  214894 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753183155.829277  214894 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753183155.829279  214894 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1753183155.829280  214894 computation_placer.cc:177] computation placer already registered. Please check linka

2.19.0
[]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


## Configuration

In [4]:
confReader = ConfigurationReader("/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/model_params.json")
print(confReader)

conf = confReader.data
print(conf)

{
    "dataset": {
        "aod2022": {
            "file_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/MatchingData2022.xlsx",
            "target_start_date": "2022-01-01",
            "target_end_date": "2022-12-31"
        },
        "aod2021": {
            "file_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/aod_data_daily.csv",
            "target_start_date": "2021-01-01",
            "target_end_date": "2021-12-31"
        },
        "mpair": {
            "file_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/MPair.csv",
            "target_start_date": "2021-01-01",
            "target_end_date": "2022-12-31",
            "station_2022_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/station2022.csv",
            "station_2018_2021_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/station2018-2021.c

# MPAIR DATA ALL LOCATIONS

## Load dataset

* Every row has a tuple (i, j) which marks a "cell" in the Ho Chi Minh City grid map

| i/j | 0 | 1 | 2 | ... |
|-----|---|---|---|-----|
| 0   |   |   |   |     |
| 1   |   |   |   |     |
| 2   |   |   |   |     |
| ... |   |   |   |     |

* Every cell has its own geographical characteristics
* Every cell represent values of a specific date and specific location

### Raw data

In [5]:
df_mpair_raw = pd.read_csv(conf["dataset"]["mpair"]["merged_data_dir_all_locations"], index_col=None)
df_mpair_raw

Unnamed: 0,i,j,lat,lon,time,pm25_3km,tmp,rh,hpbl,wspd,...,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,station,pm25
0,0,3,11.149747,106.369103,2021-01-01,26.040001,25.114687,63.633778,567.478943,4.660580,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
1,0,3,11.149747,106.369103,2021-01-01,39.470001,24.043423,69.082085,485.584290,3.288929,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
2,0,3,11.149747,106.369103,2021-01-01,40.619999,25.058735,67.647362,403.936310,2.731468,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
3,0,3,11.149747,106.369103,2021-01-01,40.330002,25.554197,71.023094,704.476807,3.522596,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,0.555355,,
4,0,3,11.149747,106.369103,2021-01-01,50.009998,25.037054,74.029236,534.426575,3.223711,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713205,34,27,10.385233,106.918383,2022-08-06,19.874584,24.644707,76.055573,441.420624,2.561953,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,
713206,34,27,10.385233,106.918383,2022-08-06,17.509365,25.071791,78.165489,403.261658,2.126233,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,
713207,34,27,10.385233,106.918383,2022-08-06,18.428423,25.458431,79.421234,425.696411,1.909332,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,0.174487,,
713208,34,27,10.385233,106.918383,2022-08-06,12.645213,24.636055,76.198395,241.369736,2.530340,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,


In [6]:
df_mpair_raw.columns

Index(['i', 'j', 'lat', 'lon', 'time', 'pm25_3km', 'tmp', 'rh', 'hpbl', 'wspd',
       'pop', 'road_den_1km', 'prim_road_len_1km', 'near_dist', 'bareland',
       'builtup', 'cropland', 'grassland', 'treecover', 'water', 'ndvi', 'aod',
       'station', 'pm25'],
      dtype='object')

In [7]:
df_mpair_raw.isnull().sum()

i                         0
j                         0
lat                       0
lon                       0
time                      0
pm25_3km                  0
tmp                       0
rh                        0
hpbl                      0
wspd                      0
pop                       0
road_den_1km              0
prim_road_len_1km    708830
near_dist                 0
bareland                  0
builtup                   0
cropland                  0
grassland                 0
treecover                 0
water                     0
ndvi                 556260
aod                  429477
station              709195
pm25                 709195
dtype: int64

### Handle metadata

In [8]:
# Convert "time" columns to Pandas datetime
df_mpair_raw = df_mpair_raw.assign(time=pd.to_datetime(df_mpair_raw["time"]))

# Set "time" column as index
df_mpair_raw.set_index("time", inplace=True)

# Convert the -9999 to nan
df_mpair_raw[df_mpair_raw <= -9999] = np.nan

# Print
df_mpair_raw

Unnamed: 0_level_0,i,j,lat,lon,pm25_3km,tmp,rh,hpbl,wspd,pop,...,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,station,pm25
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01,0,3,11.149747,106.369103,26.040001,25.114687,63.633778,567.478943,4.660580,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
2021-01-01,0,3,11.149747,106.369103,39.470001,24.043423,69.082085,485.584290,3.288929,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
2021-01-01,0,3,11.149747,106.369103,40.619999,25.058735,67.647362,403.936310,2.731468,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
2021-01-01,0,3,11.149747,106.369103,40.330002,25.554197,71.023094,704.476807,3.522596,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,0.555355,,
2021-01-01,0,3,11.149747,106.369103,50.009998,25.037054,74.029236,534.426575,3.223711,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-06,34,27,10.385233,106.918383,19.874584,24.644707,76.055573,441.420624,2.561953,1191.22140,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,
2022-08-06,34,27,10.385233,106.918383,17.509365,25.071791,78.165489,403.261658,2.126233,1191.22140,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,
2022-08-06,34,27,10.385233,106.918383,18.428423,25.458431,79.421234,425.696411,1.909332,1191.22140,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,0.174487,,
2022-08-06,34,27,10.385233,106.918383,12.645213,24.636055,76.198395,241.369736,2.530340,1191.22140,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,


## Preprocessing

In [9]:
df_mpair = copy.deepcopy(df_mpair_raw)

### Sort data by locations with stations, then locations without stations

In [10]:
locations_with_stations = sorted(list(df_mpair.loc[:, ["i", "j", "station"]].drop_duplicates().dropna().drop(columns=["station"]).itertuples(name=None, index=None)))
print(f"locations_with_stations = {locations_with_stations}")

all_locations = sorted(list(df_mpair.loc[:, ["i", "j"]].drop_duplicates().itertuples(name=None, index=None)))
print(f"all_locations = {all_locations}")

locations_without_stations = sorted(list(set(all_locations) - set(locations_with_stations)))
print(f"locations_without_stations = {locations_without_stations}")

sorted_locations = locations_with_stations + locations_without_stations
print(f"sorted_locations = {sorted_locations}")

locations_with_stations = [(12, 22), (15, 14), (15, 18), (16, 16), (16, 17), (17, 17), (18, 14)]
all_locations = [(0, 3), (0, 4), (0, 5), (0, 6), (0, 7), (0, 8), (0, 9), (0, 10), (0, 11), (0, 12), (0, 13), (0, 14), (0, 15), (0, 16), (0, 17), (0, 18), (0, 19), (0, 20), (0, 21), (0, 22), (0, 23), (0, 24), (0, 25), (0, 26), (0, 27), (0, 28), (0, 29), (0, 30), (0, 31), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13), (1, 14), (1, 15), (1, 16), (1, 17), (1, 18), (1, 19), (1, 20), (1, 21), (1, 22), (1, 23), (1, 24), (1, 25), (1, 26), (1, 27), (1, 28), (1, 29), (1, 30), (1, 31), (2, 3), (2, 4), (2, 5), (2, 6), (2, 7), (2, 8), (2, 9), (2, 10), (2, 11), (2, 12), (2, 13), (2, 14), (2, 15), (2, 16), (2, 17), (2, 18), (2, 19), (2, 20), (2, 21), (2, 22), (2, 23), (2, 24), (2, 25), (2, 26), (2, 27), (2, 28), (2, 29), (2, 30), (2, 31), (3, 3), (3, 4), (3, 5), (3, 6), (3, 7), (3, 8), (3, 9), (3, 10), (3, 11), (3, 12), (3, 13), (3, 14), (3, 15), (3, 16), (3, 1

In [11]:
all_sorted_dfs = []
for (i, j) in sorted_locations:
    df_mpair_current_ij = df_mpair.loc[(df_mpair["i"] == i) & (df_mpair["j"] == j)]
    # Fill station data
    df_mpair_current_ij.loc[:, "station"] = df_mpair_current_ij["station"].iloc[0]
    # Fill pm25 data
    df_mpair_current_ij.loc[:, "pm25"] = df_mpair_current_ij["pm25"].fillna(df_mpair_current_ij["pm25"].mean())
    all_sorted_dfs.append(df_mpair_current_ij)

df_mpair = pd.concat(all_sorted_dfs)
df_mpair

Unnamed: 0_level_0,i,j,lat,lon,pm25_3km,tmp,rh,hpbl,wspd,pop,...,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,station,pm25
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-08-07,12,22,10.879919,106.803950,30.366625,25.749989,61.403439,533.919250,3.311250,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,,,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,46.732498,25.407879,59.282650,568.372009,2.977586,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,,0.247559,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,45.280472,26.828142,61.058357,500.588928,1.905266,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,,,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,49.504631,27.283474,62.222889,603.052551,2.034556,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,,,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,60.881470,27.231569,63.081909,524.814148,2.716187,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,,,211.0,22.054035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-06,34,27,10.385233,106.918383,19.874584,24.644707,76.055573,441.420624,2.561953,1191.2214,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,
2022-08-06,34,27,10.385233,106.918383,17.509365,25.071791,78.165489,403.261658,2.126233,1191.2214,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,
2022-08-06,34,27,10.385233,106.918383,18.428423,25.458431,79.421234,425.696411,1.909332,1191.2214,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,0.174487,,
2022-08-06,34,27,10.385233,106.918383,12.645213,24.636055,76.198395,241.369736,2.530340,1191.2214,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,


### Define label, static features and dynamic features

In [12]:
print(df_mpair.columns)
mpair_label = ["pm25"]
mpair_stat_feats = ['lat', 'lon', 'pop', 'road_den_1km', 'prim_road_len_1km', 'near_dist', \
                  'bareland', 'builtup', 'cropland', 'grassland', 'treecover', 'water', 'ndvi']
mpair_dyn_feats = list(set(df_mpair.columns) - set(mpair_stat_feats) - set(mpair_label) - set(["i", "j", "station", "pm25_3km"]))

print(f"mpair_label = {mpair_label}\n\
mpair_stat_feats = {mpair_stat_feats}\n\
mpair_dyn_feats = {mpair_dyn_feats}\n")

Index(['i', 'j', 'lat', 'lon', 'pm25_3km', 'tmp', 'rh', 'hpbl', 'wspd', 'pop',
       'road_den_1km', 'prim_road_len_1km', 'near_dist', 'bareland', 'builtup',
       'cropland', 'grassland', 'treecover', 'water', 'ndvi', 'aod', 'station',
       'pm25'],
      dtype='object')
mpair_label = ['pm25']
mpair_stat_feats = ['lat', 'lon', 'pop', 'road_den_1km', 'prim_road_len_1km', 'near_dist', 'bareland', 'builtup', 'cropland', 'grassland', 'treecover', 'water', 'ndvi']
mpair_dyn_feats = ['hpbl', 'aod', 'rh', 'tmp', 'wspd']



### Fill missing values

In [13]:
null_stat = df_mpair.isnull().sum()
columns_to_fill = list(null_stat[null_stat > 0].keys())
stat_feats_to_fill = set(columns_to_fill) & set(mpair_stat_feats)
dyn_feats_to_fill = set(columns_to_fill) & set(mpair_dyn_feats)
print(f"columns_to_fill = {columns_to_fill}\n\
stat_feats_to_fill = {stat_feats_to_fill}\n\
dyn_feats_to_fill = {dyn_feats_to_fill}\n")

columns_to_fill = ['prim_road_len_1km', 'ndvi', 'aod', 'station', 'pm25']
stat_feats_to_fill = {'ndvi', 'prim_road_len_1km'}
dyn_feats_to_fill = {'aod'}



In [14]:
all_filled_dfs = []
for (i, j) in sorted_locations:
    df_mpair_current_ij = df_mpair.loc[(df_mpair["i"] == i) & (df_mpair["j"] == j)]

    for feat in stat_feats_to_fill:
        df_mpair_current_ij.loc[:, feat] = df_mpair_current_ij[feat].fillna(value=0)
    for feat in dyn_feats_to_fill:
        df_mpair_current_ij.loc[:, feat] = df_mpair_current_ij[feat].replace([np.inf, -np.inf], np.nan)
        df_mpair_current_ij.loc[:, feat] = df_mpair_current_ij[feat].fillna(df_mpair_current_ij[feat].mean())

    all_filled_dfs.append(df_mpair_current_ij)

df_mpair = pd.concat(all_filled_dfs)
df_mpair

Unnamed: 0_level_0,i,j,lat,lon,pm25_3km,tmp,rh,hpbl,wspd,pop,...,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,station,pm25
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-08-07,12,22,10.879919,106.803950,30.366625,25.749989,61.403439,533.919250,3.311250,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.0,0.639324,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,46.732498,25.407879,59.282650,568.372009,2.977586,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.0,0.247559,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,45.280472,26.828142,61.058357,500.588928,1.905266,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.0,0.639324,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,49.504631,27.283474,62.222889,603.052551,2.034556,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.0,0.639324,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,60.881470,27.231569,63.081909,524.814148,2.716187,7734.3780,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.0,0.639324,211.0,22.054035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-06,34,27,10.385233,106.918383,19.874584,24.644707,76.055573,441.420624,2.561953,1191.2214,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,0.0,0.442056,,
2022-08-06,34,27,10.385233,106.918383,17.509365,25.071791,78.165489,403.261658,2.126233,1191.2214,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,0.0,0.442056,,
2022-08-06,34,27,10.385233,106.918383,18.428423,25.458431,79.421234,425.696411,1.909332,1191.2214,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,0.0,0.174487,,
2022-08-06,34,27,10.385233,106.918383,12.645213,24.636055,76.198395,241.369736,2.530340,1191.2214,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,0.0,0.442056,,


### Split train test

In [15]:
train_test_indicator = len(df_mpair.dropna(subset=["station"]))
df_mpair_train, df_mpair_test = df_mpair[:train_test_indicator], df_mpair[train_test_indicator:]
df_mpair_train

Unnamed: 0_level_0,i,j,lat,lon,pm25_3km,tmp,rh,hpbl,wspd,pop,...,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,station,pm25
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-08-07,12,22,10.879919,106.803950,30.366625,25.749989,61.403439,533.919250,3.311250,7734.378,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.000000,0.639324,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,46.732498,25.407879,59.282650,568.372009,2.977586,7734.378,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.000000,0.247559,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,45.280472,26.828142,61.058357,500.588928,1.905266,7734.378,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.000000,0.639324,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,49.504631,27.283474,62.222889,603.052551,2.034556,7734.378,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.000000,0.639324,211.0,22.054035
2021-08-07,12,22,10.879919,106.803950,60.881470,27.231569,63.081909,524.814148,2.716187,7734.378,...,16058.346676,489934.909940,20562.897823,151361.367404,282947.325347,39120.320073,0.000000,0.639324,211.0,22.054035
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-11-15,18,14,10.745004,106.620857,31.512344,25.295406,66.642159,484.068359,1.859084,45417.710,...,5573.831706,911973.882028,2465.050483,14041.052167,57668.133163,8249.960386,2437.714788,0.835369,212.0,18.843761
2021-11-15,18,14,10.745004,106.620857,22.765150,26.981958,59.168281,624.954895,2.308777,45417.710,...,5573.831706,911973.882028,2465.050483,14041.052167,57668.133163,8249.960386,2437.714788,0.723000,212.0,18.843761
2021-11-15,18,14,10.745004,106.620857,28.393860,26.910824,59.916964,638.658203,2.330085,45417.710,...,5573.831706,911973.882028,2465.050483,14041.052167,57668.133163,8249.960386,2437.714788,0.194074,212.0,18.843761
2021-11-15,18,14,10.745004,106.620857,19.753252,26.303425,56.920258,464.635223,4.181359,45417.710,...,5573.831706,911973.882028,2465.050483,14041.052167,57668.133163,8249.960386,2437.714788,0.835369,212.0,18.843761


In [16]:
df_mpair_test

Unnamed: 0_level_0,i,j,lat,lon,pm25_3km,tmp,rh,hpbl,wspd,pop,...,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,station,pm25
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01,0,3,11.149747,106.369103,26.040001,25.114687,63.633778,567.478943,4.660580,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,0.0,0.570883,,
2021-01-01,0,3,11.149747,106.369103,39.470001,24.043423,69.082085,485.584290,3.288929,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,0.0,0.570883,,
2021-01-01,0,3,11.149747,106.369103,40.619999,25.058735,67.647362,403.936310,2.731468,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,0.0,0.570883,,
2021-01-01,0,3,11.149747,106.369103,40.330002,25.554197,71.023094,704.476807,3.522596,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,0.0,0.555355,,
2021-01-01,0,3,11.149747,106.369103,50.009998,25.037054,74.029236,534.426575,3.223711,180.23763,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,0.0,0.570883,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-08-06,34,27,10.385233,106.918383,19.874584,24.644707,76.055573,441.420624,2.561953,1191.22140,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,0.0,0.442056,,
2022-08-06,34,27,10.385233,106.918383,17.509365,25.071791,78.165489,403.261658,2.126233,1191.22140,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,0.0,0.442056,,
2022-08-06,34,27,10.385233,106.918383,18.428423,25.458431,79.421234,425.696411,1.909332,1191.22140,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,0.0,0.174487,,
2022-08-06,34,27,10.385233,106.918383,12.645213,24.636055,76.198395,241.369736,2.530340,1191.22140,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,0.0,0.442056,,


In [17]:
X_train = df_mpair_train.drop(columns=["pm25_3km", "station", "pm25", "i", "j"])
y_train = pd.DataFrame(df_mpair_train["pm25"])
X_test = df_mpair_test.drop(columns=["pm25_3km", "station", "pm25", "i", "j"])

print(X_train.shape, y_train.shape, X_test.shape)

(5110, 18) (5110, 1) (708100, 18)


### Extract metadata columns

In [18]:
X_test_ij_columns = df_mpair_test.loc[:, ["i", "j"]]
X_test_ij_columns

Unnamed: 0_level_0,i,j
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2021-01-01,0,3
2021-01-01,0,3
2021-01-01,0,3
2021-01-01,0,3
2021-01-01,0,3
...,...,...
2022-08-06,34,27
2022-08-06,34,27
2022-08-06,34,27
2022-08-06,34,27


### Scale data

In [19]:
mpair_Xtrain_scaler = MinMaxScaler()
X_train_scaled = mpair_Xtrain_scaler.fit_transform(X_train)
pd.DataFrame(X_train_scaled, columns=X_train.columns)

Unnamed: 0,lat,lon,tmp,rh,hpbl,wspd,pop,road_den_1km,prim_road_len_1km,near_dist,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod
0,1.0,1.0,0.290460,0.283754,0.334582,0.455834,0.000000,0.000000,0.0,1.000000,0.749231,0.000000,0.992527,1.000000,1.000000,0.304158,0.000000,0.187818
1,1.0,1.0,0.258031,0.235841,0.359716,0.392213,0.000000,0.000000,0.0,1.000000,0.749231,0.000000,0.992527,1.000000,1.000000,0.304158,0.000000,0.060146
2,1.0,1.0,0.392658,0.275958,0.310267,0.187749,0.000000,0.000000,0.0,1.000000,0.749231,0.000000,0.992527,1.000000,1.000000,0.304158,0.000000,0.187818
3,1.0,1.0,0.435819,0.302267,0.385016,0.212401,0.000000,0.000000,0.0,1.000000,0.749231,0.000000,0.992527,1.000000,1.000000,0.304158,0.000000,0.187818
4,1.0,1.0,0.430899,0.321674,0.327940,0.342370,0.000000,0.000000,0.0,1.000000,0.749231,0.000000,0.992527,1.000000,1.000000,0.304158,0.000000,0.187818
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5105,0.0,0.0,0.247370,0.402106,0.298215,0.178943,0.869545,0.509563,0.0,0.133739,0.225616,0.959076,0.118599,0.085386,0.039542,0.061085,0.944606,0.251707
5106,0.0,0.0,0.407238,0.233257,0.400994,0.264688,0.869545,0.509563,0.0,0.133739,0.225616,0.959076,0.118599,0.085386,0.039542,0.061085,0.944606,0.215087
5107,0.0,0.0,0.400495,0.250171,0.410991,0.268751,0.869545,0.509563,0.0,0.133739,0.225616,0.959076,0.118599,0.085386,0.039542,0.061085,0.944606,0.042716
5108,0.0,0.0,0.342920,0.182470,0.284039,0.621741,0.869545,0.509563,0.0,0.133739,0.225616,0.959076,0.118599,0.085386,0.039542,0.061085,0.944606,0.251707


In [20]:
mpair_ytrain_scaler = MinMaxScaler()
y_train_scaled = mpair_ytrain_scaler.fit_transform(y_train)
pd.DataFrame(y_train_scaled, columns=y_train.columns)

Unnamed: 0,pm25
0,0.205432
1,0.205432
2,0.205432
3,0.205432
4,0.205432
...,...
5105,0.166407
5106,0.166407
5107,0.166407
5108,0.166407


In [21]:
mpair_Xtest_scaler = MinMaxScaler()
X_test_scaled = mpair_Xtest_scaler.fit_transform(X_test)
pd.DataFrame(X_test_scaled, columns=X_test.columns)

Unnamed: 0,lat,lon,tmp,rh,hpbl,wspd,pop,road_den_1km,prim_road_len_1km,near_dist,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod
0,1.0,0.000000,0.377580,0.308914,0.354281,0.477011,0.002224,0.141639,0.0,0.010802,0.062022,0.030913,0.063024,0.395138,0.703151,0.014570,0.0,0.112335
1,1.0,0.000000,0.295472,0.420709,0.301440,0.318837,0.002224,0.141639,0.0,0.010802,0.062022,0.030913,0.063024,0.395138,0.703151,0.014570,0.0,0.112335
2,1.0,0.000000,0.373292,0.391269,0.248758,0.254553,0.002224,0.141639,0.0,0.010802,0.062022,0.030913,0.063024,0.395138,0.703151,0.014570,0.0,0.112335
3,1.0,0.000000,0.411267,0.460537,0.442676,0.345783,0.002224,0.141639,0.0,0.010802,0.062022,0.030913,0.063024,0.395138,0.703151,0.014570,0.0,0.109521
4,1.0,0.000000,0.371630,0.522220,0.332955,0.311317,0.002224,0.141639,0.0,0.010802,0.062022,0.030913,0.063024,0.395138,0.703151,0.014570,0.0,0.112335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
708095,0.0,0.857143,0.341558,0.563799,0.272944,0.235005,0.020028,0.071699,0.0,0.058329,0.105059,0.049846,0.010148,0.300736,0.093912,0.457187,0.0,0.088991
708096,0.0,0.857143,0.374293,0.607092,0.248323,0.184759,0.020028,0.071699,0.0,0.058329,0.105059,0.049846,0.010148,0.300736,0.093912,0.457187,0.0,0.088991
708097,0.0,0.857143,0.403927,0.632859,0.262799,0.159747,0.020028,0.071699,0.0,0.058329,0.105059,0.049846,0.010148,0.300736,0.093912,0.457187,0.0,0.040508
708098,0.0,0.857143,0.340895,0.566729,0.143866,0.231359,0.020028,0.071699,0.0,0.058329,0.105059,0.049846,0.010148,0.300736,0.093912,0.457187,0.0,0.088991


## LSTM prediction other locations

### Common functions

In [22]:
# split a multivariate sequence into samples
# "sequences" have both features and label
def split_sequences(sequences, n_past):
	X, y = list(), list()
	for i in range(len(sequences)):
		# find the end of this pattern
		end_ix = i + n_past
		# check if we are beyond the dataset
		if end_ix > len(sequences):
			break
		# gather input and output parts of the pattern
		seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
		X.append(seq_x)
		y.append(seq_y)
	return np.array(X), np.array(y)

# split a multivariate sequence into samples
# "sequences" have features only
def split_sequences_no_label(sequences, n_past):
	X = list()
	for i in range(len(sequences)):
		# find the end of this pattern
		end_ix = i + n_past
		# check if we are beyond the dataset
		if end_ix > len(sequences):
			break
		# gather input and output parts of the pattern
		seq_x = sequences[i:end_ix, :]
		X.append(seq_x)
	return np.array(X)

# split a multivariate sequence into samples
# "sequences" have features only
def split_sequences_no_label_with_future(sequences, n_past, n_future):
	X = list()
	for i in range(len(sequences)):
		# find the end of this pattern
		end_ix = i + n_past
		out_end_ix = end_ix + n_future
		# check if we are beyond the dataset
		if out_end_ix > len(sequences):
			break
		# gather input and output parts of the pattern
		seq_x = sequences[end_ix:out_end_ix, :]
		X.append(seq_x)
	return np.array(X)

# Define lstm model
def getLSTMmodel(n_past,n_future, n_features):
    model = Sequential()
    encoder_input = Input(shape=(n_past, n_features))
    encoder_lstm, state_h, state_c = LSTM(200, activation="relu", return_state=True)(encoder_input)
    decoder_input = RepeatVector(n_future)(encoder_lstm)
    decoder_lstm = LSTM(200, activation="relu", return_sequences=True)(decoder_input, initial_state = [state_h, state_c])
    decoder_dense_1 = TimeDistributed(Dense(100, activation="relu"))(decoder_lstm)
    decoder_dense_2 = TimeDistributed(Dense(1))(decoder_dense_1)
    model = Model(encoder_input, decoder_dense_2)
    model.compile(loss=MeanAbsoluteError(), optimizer=Adam(learning_rate=0.001))
    return model

# LSTM Prediction
def predictLSTM(X_train, y_train, X_test):
    # Define metadata
    n_past = conf["prediction"]["n_past"]
    n_future = conf["prediction"]["n_future"]
    n_features = X_train.shape[-1]
    
    # Dataset
    traindata = np.hstack((X_train, y_train))
    print(f"traindata.shape = {traindata.shape}")
    
    # Padded dataset
    padded_before = np.tile(traindata[0], (n_past-2, 1))
    padded_after = np.tile(traindata[0], (n_future, 1))
    padded_traindata = np.vstack((padded_before, traindata, padded_after))
    print(f"padded_traindata.shape = {padded_traindata.shape}")
    
    # Reframe data
    X_train_reframed, y_train_reframed = split_sequences(padded_traindata, n_past)
    print(f"X_train_reframed.shape = {X_train_reframed.shape}\n\
    y_train_reframed.shape = {y_train_reframed.shape}")
    
    # Fit model
    model = getLSTMmodel(n_past, n_future, n_features)
    model.fit(X_train_reframed, y_train_reframed,
            epochs=conf["prediction"]["epochs"],
            batch_size=conf["prediction"]["batch_size"],
            verbose=1,
            shuffle=False)
    
    # Prepare test data to predict
    print(f"X_test.shape = {X_test.shape}")
    padded_before = np.tile(X_test[0], (n_past-1, 1))
    X_test = np.vstack((padded_before, X_test))
    X_test_reframed = split_sequences_no_label(X_test, n_past)
    print(f"X_test_reframed.shape = {X_test_reframed.shape}")
    
    # Predict
    yhat = model.predict(X_test_reframed, verbose=1)
    inverted_yhat = mpair_ytrain_scaler.inverse_transform(yhat[:, 0, :]) # Only applicable when n_future = 1
    print(f"yhat.shape = {yhat.shape}")
    df_predicted = pd.DataFrame(inverted_yhat, index=X_test_ij_columns.index, columns=["pm25"])
    df_predicted = pd.concat([df_predicted, X_test_ij_columns], axis=1)
    print(f"df_predicted.shape = {df_predicted.shape}")
    return df_predicted

### Prediction no encoder

#### LSTM prediction

In [23]:
df_mpair_predicted = predictLSTM(X_train=X_train_scaled,
            y_train=y_train_scaled,
            X_test=X_test_scaled)

traindata.shape = (5110, 19)
padded_traindata.shape = (5116, 19)
X_train_reframed.shape = (5110, 7, 18)
    y_train_reframed.shape = (5110,)
Epoch 1/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 0.0734
Epoch 2/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0698
Epoch 3/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0575
Epoch 4/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.0739
Epoch 5/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.1205
Epoch 6/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0570
Epoch 7/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.1154
Epoch 8/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.0460
Epoch 9/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

#### Compare with other locations

In [24]:
list(df_mpair_train[["i", "j"]].drop_duplicates().itertuples(name=None, index=None))

[(12, 22), (15, 14), (15, 18), (16, 16), (16, 17), (17, 17), (18, 14)]

In [25]:
location_with_station = (15, 14)
nearest_location = (15, 16)
random_location = (34, 27)

df_location_with_station = df_mpair_train.loc[(df_mpair_train["i"] == location_with_station[0]) & (df_mpair_train["j"] == location_with_station[1]),
                                ["i", "j", "station", "pm25"]]
df_location_with_station

Unnamed: 0_level_0,i,j,station,pm25
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-09-23,15,14,213.0,23.598236
2021-09-23,15,14,213.0,23.598236
2021-09-23,15,14,213.0,23.598236
2021-09-23,15,14,213.0,23.598236
2021-09-23,15,14,213.0,23.598236
...,...,...,...,...
2021-09-24,15,14,213.0,23.598236
2021-09-24,15,14,213.0,23.598236
2021-09-24,15,14,213.0,23.598236
2021-09-24,15,14,213.0,23.598236


In [26]:
df_nearest_location = df_mpair_predicted.loc[(df_mpair_predicted["i"] == nearest_location[0]) & (df_mpair_predicted["j"] == nearest_location[1])]
df_nearest_location

Unnamed: 0_level_0,pm25,i,j
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-09-24,17.948225,15,16
2021-09-24,17.993073,15,16
2021-09-24,18.019251,15,16
2021-09-24,18.047628,15,16
2021-09-24,18.046568,15,16
...,...,...,...
2021-09-25,18.098255,15,16
2021-09-25,18.095131,15,16
2021-09-25,18.078810,15,16
2021-09-25,18.050713,15,16


In [27]:
df_random_location = df_mpair_predicted.loc[(df_mpair_predicted["i"] == random_location[0]) & (df_mpair_predicted["j"] == random_location[1])]
df_random_location

Unnamed: 0_level_0,pm25,i,j
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-08-05,17.057821,34,27
2022-08-05,17.109018,34,27
2022-08-05,17.079197,34,27
2022-08-05,17.098488,34,27
2022-08-05,17.163628,34,27
...,...,...,...
2022-08-06,17.148371,34,27
2022-08-06,17.153545,34,27
2022-08-06,17.147495,34,27
2022-08-06,17.151619,34,27


In [28]:
print(f"location_with_station = {location_with_station}")
print(f"nearest_location = {nearest_location}")
print(f"random_location = {random_location}")
print(f"mae between location_with_station and random_location = {mean_absolute_error(df_location_with_station['pm25'], df_random_location['pm25'])}")
print(f"mae between location_with_station and nearest_location = {mean_absolute_error(df_location_with_station['pm25'], df_nearest_location['pm25'])}")

location_with_station = (15, 14)
nearest_location = (15, 16)
random_location = (34, 27)
mae between location_with_station and random_location = 7.418849144811594
mae between location_with_station and nearest_location = 6.544979996444426


### Prediction with lstms2s encoder

#### Encode data

In [29]:
lstms2s_encoder = list(filter(lambda model: ("aod_lstms2s" in model) & ("no_pm25_3km" in model) , glob.glob(f'{conf["workspace"]["best_reduction_models_dir"]}/*')))[0]
print(lstms2s_encoder)

encoder_model = load_model(lstms2s_encoder)
print(encoder_model.summary())

print(f"X_train_scaled.shape = {X_train_scaled.shape}")
padded_before = np.tile(X_train_scaled[0], (conf["reduction"]["n_past"], 1))
padded_after = np.tile(X_train_scaled[-1], (conf["reduction"]["n_future"]-1, 1))
X_train_scaled_padded = np.vstack((padded_before, X_train_scaled, padded_after))
print(f"X_train_scaled_padded.shape = {X_train_scaled_padded.shape}")
X_train_scaled_reframed = split_sequences_no_label_with_future(X_train_scaled_padded, n_past=conf["reduction"]["n_past"], n_future=conf["reduction"]["n_future"])
X_train_scaled_lstms2s_encoded = encoder_model.predict(X_train_scaled_reframed)
print(f"X_train_scaled_reframed.shape = {X_train_scaled_reframed.shape}")
print(f"X_train_scaled_lstms2s_encoded.shape = {X_train_scaled_lstms2s_encoded.shape}")

print(f"X_test_scaled.shape = {X_test_scaled.shape}")
padded_before = np.tile(X_test_scaled[0], (conf["reduction"]["n_past"], 1))
padded_after = np.tile(X_test_scaled[-1], (conf["reduction"]["n_future"]-1, 1))
X_test_scaled_padded = np.vstack((padded_before, X_test_scaled, padded_after))
print(f"X_test_scaled_padded.shape = {X_test_scaled_padded.shape}")
X_test_scaled_reframed = split_sequences_no_label_with_future(X_test_scaled_padded, n_past=conf["reduction"]["n_past"], n_future=conf["reduction"]["n_future"])
X_test_scaled_lstms2s_encoded = encoder_model.predict(X_test_scaled_reframed)
print(f"X_test_scaled_reframed.shape = {X_test_scaled_reframed.shape}")
print(f"X_test_scaled_lstms2s_encoded.shape = {X_test_scaled_lstms2s_encoded.shape}")

/le_thanh_van_118/workspace/hiep_workspace/best_reduction_models/aod_lstms2s_dim_reduction_15_features_no_pm25_3km_encoder.keras


  saveable.load_own_variables(weights_store.get(inner_path))


None
X_train_scaled.shape = (5110, 18)
X_train_scaled_padded.shape = (5123, 18)


I0000 00:00:1753183285.323793  215105 service.cc:152] XLA service 0x2b748d90 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753183285.323841  215105 service.cc:160]   StreamExecutor device (0): Host, Default Version


[1m 51/160[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m0s[0m 3ms/step

I0000 00:00:1753183285.568470  215105 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m160/160[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step
X_train_scaled_reframed.shape = (5110, 7, 18)
X_train_scaled_lstms2s_encoded.shape = (5110, 15)
X_test_scaled.shape = (708100, 18)
X_test_scaled_padded.shape = (708113, 18)
[1m22129/22129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 3ms/step
X_test_scaled_reframed.shape = (708100, 7, 18)
X_test_scaled_lstms2s_encoded.shape = (708100, 15)


#### LSTM prediction

In [30]:
df_mpair_predicted = predictLSTM(X_train=X_train_scaled_lstms2s_encoded,
            y_train=y_train_scaled,
            X_test=X_test_scaled_lstms2s_encoded)

traindata.shape = (5110, 16)
padded_traindata.shape = (5116, 16)
X_train_reframed.shape = (5110, 7, 15)
    y_train_reframed.shape = (5110,)
Epoch 1/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 15ms/step - loss: 0.0885
Epoch 2/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.0919
Epoch 3/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.0600
Epoch 4/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.0494
Epoch 5/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.0637
Epoch 6/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - loss: 0.0491
Epoch 7/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0859
Epoch 8/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 0.0532
Epoch 9/20
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

#### Compare with other locations

In [31]:
list(df_mpair_train[["i", "j"]].drop_duplicates().itertuples(name=None, index=None))

[(12, 22), (15, 14), (15, 18), (16, 16), (16, 17), (17, 17), (18, 14)]

In [32]:
location_with_station = (15, 14)
nearest_location = (15, 16)
random_location = (34, 27)

df_location_with_station = df_mpair_train.loc[(df_mpair_train["i"] == location_with_station[0]) & (df_mpair_train["j"] == location_with_station[1]),
                                ["i", "j", "station", "pm25"]]
df_location_with_station

Unnamed: 0_level_0,i,j,station,pm25
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-09-23,15,14,213.0,23.598236
2021-09-23,15,14,213.0,23.598236
2021-09-23,15,14,213.0,23.598236
2021-09-23,15,14,213.0,23.598236
2021-09-23,15,14,213.0,23.598236
...,...,...,...,...
2021-09-24,15,14,213.0,23.598236
2021-09-24,15,14,213.0,23.598236
2021-09-24,15,14,213.0,23.598236
2021-09-24,15,14,213.0,23.598236


In [33]:
df_nearest_location = df_mpair_predicted.loc[(df_mpair_predicted["i"] == nearest_location[0]) & (df_mpair_predicted["j"] == nearest_location[1])]
df_nearest_location

Unnamed: 0_level_0,pm25,i,j
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-09-24,19.950758,15,16
2021-09-24,19.964638,15,16
2021-09-24,19.968435,15,16
2021-09-24,19.973379,15,16
2021-09-24,19.983889,15,16
...,...,...,...
2021-09-25,20.079905,15,16
2021-09-25,20.052586,15,16
2021-09-25,20.014530,15,16
2021-09-25,19.979254,15,16


In [34]:
df_random_location = df_mpair_predicted.loc[(df_mpair_predicted["i"] == random_location[0]) & (df_mpair_predicted["j"] == random_location[1])]
df_random_location

Unnamed: 0_level_0,pm25,i,j
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-08-05,17.470886,34,27
2022-08-05,17.459743,34,27
2022-08-05,17.457630,34,27
2022-08-05,17.470694,34,27
2022-08-05,17.499365,34,27
...,...,...,...
2022-08-06,17.721069,34,27
2022-08-06,17.707243,34,27
2022-08-06,17.701658,34,27
2022-08-06,17.705217,34,27


In [35]:
print(f"location_with_station = {location_with_station}")
print(f"nearest_location = {nearest_location}")
print(f"random_location = {random_location}")
print(f"mae between location_with_station and random_location = {mean_absolute_error(df_location_with_station['pm25'], df_random_location['pm25'])}")
print(f"mae between location_with_station and nearest_location = {mean_absolute_error(df_location_with_station['pm25'], df_nearest_location['pm25'])}")

location_with_station = (15, 14)
nearest_location = (15, 16)
random_location = (34, 27)
mae between location_with_station and random_location = 6.888467041094175
mae between location_with_station and nearest_location = 5.188277431076142
