# SETUP

## Check environment

In [1]:
USE_GPU = True

In [2]:
try:
    import google.colab
    COLAB = True
except:
    COLAB = False
print(f"Using Google Colab: {COLAB}")

Using Google Colab: False


In [3]:
!pip install --upgrade pip tensorflow keras jupyterlab-vim jupyterlab-indent-guides maap-user-workspace-management-jupyter-extension pydot
!apt-get install graphviz -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
graphviz is already the newest version (2.42.2-6ubuntu0.1).
The following packages were automatically installed and are no longer required:
  gyp javascript-common libauthen-sasl-perl libc-ares2 libclone-perl
  libdata-dump-perl libencode-locale-perl libfile-basedir-perl
  libfile-desktopentry-perl libfile-listing-perl libfile-mimeinfo-perl
  libfont-afm-perl libfontenc1 libgtkd-3-0 libhtml-form-perl
  libhtml-format-perl libhtml-parser-perl libhtml-tagset-perl
  libhtml-tree-perl libhttp-cookies-perl libhttp-daemon-perl libhttp-date-perl
  libhttp-message-perl libhttp-negotiate-perl libio-html-perl
  libio-socket-ssl-perl libio-stringy-perl libipc-system-simple-perl
  libjs-events libjs-highlight.js libjs-inherits libjs-is-typedarray libjs-psl
  libjs-source-map libjs-sprintf-js libjs-typedarray-to-buffer libllvm11
  liblwp-mediatypes-perl liblwp-protocol-https-perl libmailtools-perl
  lib

## Libraries

In [4]:
# System
import os
import glob
import shutil
import copy
import re
from datetime import datetime
import logging
import sys

# Data
import numpy as np
import pandas as pd

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm

# Data processing
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Model
import tensorflow as tf
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from keras import Input, Model, Sequential
from keras.layers import Dense, LSTM, RepeatVector, TimeDistributed, Dropout, GRU, Conv1D, MaxPooling1D, Flatten
from keras.utils import plot_model
from keras.saving import load_model
from keras.callbacks import LearningRateScheduler, ModelCheckpoint
from keras.optimizers import Adam
from keras.losses import MeanAbsoluteError
from keras.losses import MeanAbsoluteError, MeanSquaredError
import keras.backend as K

# Append the custom libraries to system path
if COLAB:
    sys.path.append("/content/air_quality_index_project")

# Configure device
if not USE_GPU:
    print("Using CPU to train")
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
else:
    print("Using GPU to train")

# Custom libraries written by myself
from src.plot import plot_1_data, plot_2_data, plot_prediction
from src.reduction_model.lstm_s2s import LSTMSeq2SeqReduction
from src.reduction_model.gru_s2s import GRUSeq2SeqReduction
from src.reduction_model.cnnlstm_s2s import CNNLSTMSeq2SeqReduction
from src.prediction_model.lstm import LSTMPrediction
from src.loop_model import generate_loopresults, choose_the_best
from src.reduce_data_utils import prepareReducedData, augmentReducedData
from src.data_utils import mice

# Configuration reader
from src.config_reader import ConfigurationReader

# Checking Tensorflow
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
print(tf.config.list_physical_devices('CPU'))

Using GPU to train
2.20.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


## Configuration

In [5]:
if COLAB:
    confReader = ConfigurationReader("/content/air_quality_index_project/model_params_colab.json")
else:
    confReader = ConfigurationReader("/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/model_params.json")
print(confReader)

conf = confReader.data

{
    "dataset": {
        "aod2022": {
            "file_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/MatchingData2022.xlsx",
            "target_start_date": "2022-01-01",
            "target_end_date": "2022-12-31"
        },
        "aod2021": {
            "file_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/aod_data_daily.csv",
            "target_start_date": "2021-01-01",
            "target_end_date": "2021-12-31"
        },
        "mpair": {
            "file_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/MPair.csv",
            "target_start_date": "2021-01-01",
            "target_end_date": "2022-12-31",
            "station_2022_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/station2022.csv",
            "station_2018_2021_dir": "/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/station2018-2021.c

# PM DATA

## Load 2022 dataset

In [46]:
df_pm2022_raw = []
for csv_file in sorted(glob.glob(f'{conf["dataset"]["pm2022"]["file_dir"]}/*')):
    df_current_station = pd.read_csv(csv_file)
    station = csv_file.split("/")[-1].split(".")[0]
    print(f"{csv_file} - {len(df_current_station)} records")
    df_current_station["station"] = station
    df_pm2022_raw.append(df_current_station)
df_pm2022_raw = pd.concat(df_pm2022_raw, axis=0, ignore_index=True)
df_pm2022_raw

/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2022/214.csv - 8760 records
/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2022/215.csv - 4513 records
/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2022/216.csv - 8761 records


Unnamed: 0.1,Unnamed: 0,TSP,PM2_5,O3,CO,NO2,SO2,date,station,Temperature,Humid
0,0,,,,,,,2022-01-01 00:00:00,214,,
1,1,51.768333,21.285000,44.916667,0.672500,0.036300,0.042500,2022-01-01 01:00:00,214,,
2,2,43.588889,18.414815,46.148148,0.577037,0.034759,0.035926,2022-01-01 02:00:00,214,,
3,3,43.988095,19.242857,46.357143,0.542143,0.035643,0.030238,2022-01-01 03:00:00,214,,
4,4,40.928333,18.140000,46.700000,0.507000,0.036550,0.029667,2022-01-01 04:00:00,214,,
...,...,...,...,...,...,...,...,...,...,...,...
22029,8756,126.168333,50.530000,66.116667,2.006500,0.083217,0.104167,2022-12-31 20:00:00,216,27.401667,68.026667
22030,8757,139.688333,55.055000,55.133333,2.418833,0.084300,0.130167,2022-12-31 21:00:00,216,27.603333,66.965000
22031,8758,63.565000,26.078333,46.750000,1.481500,0.066650,0.075833,2022-12-31 22:00:00,216,27.036667,63.361667
22032,8759,51.705000,21.161667,50.016667,1.301167,0.067783,0.060833,2022-12-31 23:00:00,216,26.358333,63.343333


In [47]:
# Convert "time" columns to Pandas datetime
df_pm2022_raw["date"] = pd.to_datetime(df_pm2022_raw["date"])

# Set "date" column as index
df_pm2022_raw.set_index("date", inplace=True)

# Lower case all column names
df_pm2022_raw = df_pm2022_raw.rename(columns={name: name.lower() for name in df_pm2022_raw.columns})

# Work with columns
df_pm2022_raw = df_pm2022_raw.drop(columns=["unnamed: 0"])
df_pm2022_raw = df_pm2022_raw.rename(columns={"pm2_5": "pm25"})

# Print
df_pm2022_raw

Unnamed: 0_level_0,tsp,pm25,o3,co,no2,so2,station,temperature,humid
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-01-01 00:00:00,,,,,,,214,,
2022-01-01 01:00:00,51.768333,21.285000,44.916667,0.672500,0.036300,0.042500,214,,
2022-01-01 02:00:00,43.588889,18.414815,46.148148,0.577037,0.034759,0.035926,214,,
2022-01-01 03:00:00,43.988095,19.242857,46.357143,0.542143,0.035643,0.030238,214,,
2022-01-01 04:00:00,40.928333,18.140000,46.700000,0.507000,0.036550,0.029667,214,,
...,...,...,...,...,...,...,...,...,...
2022-12-31 20:00:00,126.168333,50.530000,66.116667,2.006500,0.083217,0.104167,216,27.401667,68.026667
2022-12-31 21:00:00,139.688333,55.055000,55.133333,2.418833,0.084300,0.130167,216,27.603333,66.965000
2022-12-31 22:00:00,63.565000,26.078333,46.750000,1.481500,0.066650,0.075833,216,27.036667,63.361667
2022-12-31 23:00:00,51.705000,21.161667,50.016667,1.301167,0.067783,0.060833,216,26.358333,63.343333


In [48]:
sorted(df_pm2022_raw["station"].unique())

['214', '215', '216']

## Load 2021 dataset

In [49]:
df_pm2021_raw = []
for csv_file in sorted(glob.glob(f'{conf["dataset"]["pm2021"]["file_dir"]}/*')):
    df_current_station = pd.read_csv(csv_file)
    station = csv_file.split("/")[-1].split(".")[0]
    print(f"{csv_file} - {len(df_current_station)} records")
    df_current_station["station"] = station
    df_pm2021_raw.append(df_current_station.dropna(axis=1, how="all"))
df_pm2021_raw = pd.concat(df_pm2021_raw, axis=0, ignore_index=True)
df_pm2021_raw

/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2021/211.csv - 8166 records
/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2021/212.csv - 8084 records
/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2021/213.csv - 8084 records
/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2021/214.csv - 8084 records
/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2021/215.csv - 8084 records
/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/quan-trac/2021/216.csv - 8065 records


Unnamed: 0.1,Unnamed: 0,date,Station number,TSPug/m3,PM2.5ug/m3,O3ug/m3,CO (ug/m3),NO2 (ug/m3,Temperature,Humid,...,Temperature.1,Humid.1,dewPoint.1,humidity.1,pressure.1,uvIndex.1,ozone.1,ws.1,wd.1,station
0,0,2021-02-20 11:00:00,1,15.400000,9.200000,25.519000,1512.720000,28.230000,26.100000,60.500000,...,37.778333,99.135,78.81,0.97,1016.0,13.0,280.4,10.0,360.0,211
1,1,2021-02-20 12:00:00,1,,,,,,,,...,,,,,,,,,,211
2,2,2021-02-20 13:00:00,1,,,,,,,,...,,,,,,,,,,211
3,3,2021-02-20 14:00:00,1,,,,,,,,...,,,,,,,,,,211
4,4,2021-02-20 15:00:00,1,,,,,,,,...,,,,,,,,,,211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48562,8060,2022-01-26 12:00:00,6,127.290000,54.080000,144.771250,1344.831000,85.568267,32.703333,60.336667,...,,,,,,,,,,216
48563,8061,2022-01-26 13:00:00,6,120.145000,49.773333,134.629083,2105.011000,82.651167,34.370000,55.100000,...,,,,,,,,,,216
48564,8062,2022-01-26 14:00:00,6,112.141667,45.770000,146.832400,2391.511000,107.211267,33.045000,56.741667,...,,,,,,,,,,216
48565,8063,2022-01-26 15:00:00,6,135.530000,56.446667,240.958250,2418.442000,184.059600,30.943333,66.025000,...,,,,,,,,,,216


In [50]:
# Convert "time" columns to Pandas datetime
df_pm2021_raw['date'] = pd.to_datetime(df_pm2021_raw['date'])

# Rename columns
df_pm2021_raw = df_pm2021_raw.rename(columns={
    "TSPug/m3": "tsp", 
    "PM2.5ug/m3": "pm25", 
    "O3ug/m3": "o3", 
    "NO2 (ug/m3": "no2", 
    "CO (ug/m3)": "co",
})

# Set "date" column as index
df_pm2021_raw.set_index("date", inplace=True)

# Lower case all column names
df_pm2021_raw.rename(columns={name: name.lower() for name in df_pm2021_raw.columns}, inplace=True)

# Remove unnecessary columns
df_pm2021_raw.drop(columns=set(df_pm2021_raw.columns) - set(df_pm2022_raw.columns), inplace=True)

# Print
df_pm2021_raw

Unnamed: 0_level_0,tsp,pm25,o3,co,no2,temperature,humid,station
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-02-20 11:00:00,15.400000,9.200000,25.519000,1512.720000,28.230000,26.100000,60.500000,211
2021-02-20 12:00:00,,,,,,,,211
2021-02-20 13:00:00,,,,,,,,211
2021-02-20 14:00:00,,,,,,,,211
2021-02-20 15:00:00,,,,,,,,211
...,...,...,...,...,...,...,...,...
2022-01-26 12:00:00,127.290000,54.080000,144.771250,1344.831000,85.568267,32.703333,60.336667,216
2022-01-26 13:00:00,120.145000,49.773333,134.629083,2105.011000,82.651167,34.370000,55.100000,216
2022-01-26 14:00:00,112.141667,45.770000,146.832400,2391.511000,107.211267,33.045000,56.741667,216
2022-01-26 15:00:00,135.530000,56.446667,240.958250,2418.442000,184.059600,30.943333,66.025000,216


## Merge dataset

In [51]:
# Start merging
df_pm_raw = pd.concat([df_pm2021_raw, df_pm2022_raw], axis=0)

# Sort data by "station" then "time"
df_pm_raw.sort_values(by=["station", "date"], ascending=[True, True], inplace=True)

# Set index
df_pm_raw.set_index("date", inplace=True)

# Print
df_pm_raw

KeyError: "None of ['date'] are in the columns"

In [52]:
df_pm_raw.to_csv('df_pm_raw.csv', index=True)

## Statistics

In [53]:
df_aod_stat = df_aod_raw.describe()
df_aod_stat.to_csv(f'{conf["workspace"]["data_statistic_dir"]}/df_aod_stat.csv')

NameError: name 'df_aod_raw' is not defined

In [None]:
df_aod_raw.isnull().sum()