# Setup

In [1]:
USE_GPU = True

In [2]:
import os
import shutil
import sys
import copy

import pandas as pd
import numpy as np

# Append the custom libraries to system path
sys.path.append("/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project")

# Configure device
if not USE_GPU:
    print("Using CPU to train")
    os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
else:
    print("Using GPU to train")

from src.config_reader import ConfigurationReader
from src.data_utils import mice
from src.plot import plot_1_data

Using GPU to train


In [3]:
conf = ConfigurationReader("/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/model_params.json").data
print(conf)

# Clear all temp folders
def cleanDir(input_dir):
    if os.path.exists(input_dir):
        shutil.rmtree(input_dir)
    os.makedirs(input_dir)

cleanDir(conf["workspace"]["data_by_location_dir"])

{'dataset': {'aod2022': {'file_dir': '/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/MatchingData2022.xlsx', 'target_start_date': '2022-01-01', 'target_end_date': '2022-12-31'}, 'aod2021': {'file_dir': '/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/aod_data_daily.csv', 'target_start_date': '2021-01-01', 'target_end_date': '2021-12-31'}, 'mpair': {'file_dir': '/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/MPair.csv', 'target_start_date': '2021-01-01', 'target_end_date': '2022-12-31', 'station_2022_dir': '/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/station2022.csv', 'station_2018_2021_dir': '/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/station2018-2021.csv', 'merged_data_dir': '/le_thanh_van_118/workspace/hiep_workspace/air_quality_index_project/dataset/mpair_merged.csv', 'merged_data_dir_all_locations': '/le_thanh_van_118/workspace/

# Load dataset

## MPair.csv

### Read CSV

In [4]:
df_mpair_raw = pd.read_csv(conf["dataset"]["mpair"]["file_dir"])
df_mpair_raw

Unnamed: 0,i,j,lat,lon,time,PM25_3km,TMP,RH,HPBL,WSPD,...,NEAR_DIST,BARELAND,BUILTUP,CROPLAND,GRASSLAND,TREECOVER,WATER,NDVI,AOD,AIMODEL
0,0,0,11.149747,106.300443,2018-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.000000,-9999.000000
1,0,1,11.149747,106.323330,2018-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.000000,-9999.000000
2,0,2,11.149747,106.346217,2018-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,0.156327,-9999.000000
3,0,3,11.149747,106.369103,2018-01-01,24.740000,25.460614,65.144661,2.448070,2.448070,...,7.265194e+01,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,0.167487,-9999.000000
4,0,4,11.149747,106.391990,2018-01-01,23.760931,25.462688,65.080498,2.438784,2.438784,...,1.107789e+02,4569.189136,21119.949784,76052.614180,231046.865089,662859.174184,4308.164827,,0.141881,-9999.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2236845,34,30,10.385233,106.987043,2022-12-31,,25.815809,74.737156,207.873199,3.124345,...,3.072120e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.000000,14.998473
2236846,34,31,10.385233,107.009930,2022-12-31,,25.815809,75.000397,207.873199,3.215876,...,4.355371e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.000000,15.201084
2236847,34,32,10.385233,107.032816,2022-12-31,-9999.000000,25.601173,-9999.000000,142.227615,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.000000,0.000000
2236848,34,33,10.385233,107.055703,2022-12-31,-9999.000000,25.560753,-9999.000000,129.865280,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.000000,0.000000


In [5]:
df_mpair_raw.columns

Index(['i', 'j', 'lat', 'lon', 'time', 'PM25_3km', 'TMP', 'RH', 'HPBL', 'WSPD',
       'PRES2M', 'POP', 'ROAD_DEN_1km', 'ROAD_LEN_1km', 'PRIM_ROAD_LEN_1km',
       'NEAR_DIST', 'BARELAND', 'BUILTUP', 'CROPLAND', 'GRASSLAND',
       'TREECOVER', 'WATER', 'NDVI', 'AOD', 'AIMODEL'],
      dtype='object')

### Handle metadata

In [6]:
# Convert "time" columns to Pandas datetime
df_mpair_raw = df_mpair_raw.assign(time=pd.to_datetime(df_mpair_raw["time"]))

# Lower case all column names
df_mpair_raw = df_mpair_raw.rename(columns={name: name.lower() for name in df_mpair_raw.columns})

# Get data in 2021 and 2022 only
time_mask = ((df_mpair_raw["time"] >= conf["dataset"]["mpair"]["target_start_date"]) &
             (df_mpair_raw["time"] <= conf["dataset"]["mpair"]["target_end_date"]))
df_mpair_raw = df_mpair_raw.loc[time_mask]
df_mpair_raw.reset_index(inplace=True, drop=True)

# Set "time" column as index
#df_mpair_raw.set_index("time", inplace=True)

# Print
df_mpair_raw

Unnamed: 0,i,j,lat,lon,time,pm25_3km,tmp,rh,hpbl,wspd,...,near_dist,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,aimodel
0,0,0,11.149747,106.300443,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0,0.000000
1,0,1,11.149747,106.323330,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0,0.000000
2,0,2,11.149747,106.346217,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0,0.000000
3,0,3,11.149747,106.369103,2021-01-01,26.040001,25.114687,63.633778,567.478943,4.660580,...,7.265194e+01,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,-9999.0,18.526991
4,0,4,11.149747,106.391990,2021-01-01,25.666174,25.121210,62.159500,548.132568,5.052152,...,1.107789e+02,4569.189136,21119.949784,76052.614180,231046.865089,662859.174184,4308.164827,,-9999.0,19.523800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894245,34,30,10.385233,106.987043,2022-12-31,,25.815809,74.737156,207.873199,3.124345,...,3.072120e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.0,14.998473
894246,34,31,10.385233,107.009930,2022-12-31,,25.815809,75.000397,207.873199,3.215876,...,4.355371e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.0,15.201084
894247,34,32,10.385233,107.032816,2022-12-31,-9999.000000,25.601173,-9999.000000,142.227615,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0,0.000000
894248,34,33,10.385233,107.055703,2022-12-31,-9999.000000,25.560753,-9999.000000,129.865280,-9999.000000,...,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0,0.000000


### Drop unnecessary columns

In [7]:
# Drop the columns because they are not in the matching data 2021 2022
# Use "python set" to check if the columns to drop are actually in the dataset
columns_to_drop = set(["aimodel", "pres2m", "road_len_1km"]) & set(df_mpair_raw.columns)
print(columns_to_drop)
df_mpair_raw = df_mpair_raw.drop(columns=columns_to_drop)
df_mpair_raw

{'pres2m', 'aimodel', 'road_len_1km'}


Unnamed: 0,i,j,lat,lon,time,pm25_3km,tmp,rh,hpbl,wspd,...,prim_road_len_1km,near_dist,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod
0,0,0,11.149747,106.300443,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0
1,0,1,11.149747,106.323330,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0
2,0,2,11.149747,106.346217,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0
3,0,3,11.149747,106.369103,2021-01-01,26.040001,25.114687,63.633778,567.478943,4.660580,...,,7.265194e+01,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,-9999.0
4,0,4,11.149747,106.391990,2021-01-01,25.666174,25.121210,62.159500,548.132568,5.052152,...,,1.107789e+02,4569.189136,21119.949784,76052.614180,231046.865089,662859.174184,4308.164827,,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894245,34,30,10.385233,106.987043,2022-12-31,,25.815809,74.737156,207.873199,3.124345,...,,3.072120e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.0
894246,34,31,10.385233,107.009930,2022-12-31,,25.815809,75.000397,207.873199,3.215876,...,,4.355371e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.0
894247,34,32,10.385233,107.032816,2022-12-31,-9999.000000,25.601173,-9999.000000,142.227615,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0
894248,34,33,10.385233,107.055703,2022-12-31,-9999.000000,25.560753,-9999.000000,129.865280,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0


## station2018-2021.csv

### Read CSV

In [8]:
df_station_2021 = pd.read_csv(conf["dataset"]["mpair"]["station_2018_2021_dir"])
df_station_2021

Unnamed: 0,time,ID,Lon,Lat,pm25,i,j
0,1/1/2018 0:00,201,106.700606,10.783057,31.391304,16,17
1,1/1/2018 0:00,202,106.682027,10.762638,29.615625,17,17
2,1/1/2018 0:00,211,106.796200,10.870190,,12,22
3,1/1/2018 0:00,212,106.617000,10.740760,,18,14
4,1/1/2018 0:00,213,106.620500,10.816350,,15,14
...,...,...,...,...,...,...,...
11683,12/31/2021 0:00,212,106.617000,10.740760,20.100752,18,14
11684,12/31/2021 0:00,213,106.620500,10.816350,23.240539,15,14
11685,12/31/2021 0:00,214,106.717500,10.815570,20.549839,15,18
11686,12/31/2021 0:00,215,106.687900,10.776280,,17,17


### Handle metadata

In [9]:
# Convert "time" columns to Pandas datetime
df_station_2021 = df_station_2021.assign(time=pd.to_datetime(df_station_2021["time"]))

# Lower case all column names
df_station_2021 = df_station_2021.rename(columns={name: name.lower() for name in df_station_2021.columns})

# Rename "id" column to "station"
df_station_2021 = df_station_2021.rename(columns={"id": "station"})

# Get data in 2021 only
time_mask = ((df_station_2021["time"] >= conf["dataset"]["aod2021"]["target_start_date"]) & 
             (df_station_2021["time"] <= conf["dataset"]["aod2021"]["target_end_date"]))
df_station_2021 = df_station_2021.loc[time_mask]

# Sort data by station then time
df_station_2021.sort_values(by=["station", "time"], inplace=True, ignore_index=True)

# Print
df_station_2021

Unnamed: 0,time,station,lon,lat,pm25,i,j
0,2021-01-01,201,106.700606,10.783057,17.250000,16,17
1,2021-01-02,201,106.700606,10.783057,23.000000,16,17
2,2021-01-03,201,106.700606,10.783057,28.291667,16,17
3,2021-01-04,201,106.700606,10.783057,28.791667,16,17
4,2021-01-05,201,106.700606,10.783057,24.833333,16,17
...,...,...,...,...,...,...,...
2915,2021-12-27,216,106.659300,10.780630,35.099753,16,16
2916,2021-12-28,216,106.659300,10.780630,24.782436,16,16
2917,2021-12-29,216,106.659300,10.780630,25.165238,16,16
2918,2021-12-30,216,106.659300,10.780630,16.656319,16,16


### Check null

In [10]:
df_station_2021.isnull().sum()

time          0
station       0
lon           0
lat           0
pm25       1345
i             0
j             0
dtype: int64

In [11]:
for station in df_station_2021["station"].unique():
    df_current_station = df_station_2021[df_station_2021["station"] == station]
    display(df_current_station.isnull().sum())

time        0
station     0
lon         0
lat         0
pm25       16
i           0
j           0
dtype: int64

time         0
station      0
lon          0
lat          0
pm25       365
i            0
j            0
dtype: int64

time         0
station      0
lon          0
lat          0
pm25       192
i            0
j            0
dtype: int64

time         0
station      0
lon          0
lat          0
pm25       116
i            0
j            0
dtype: int64

time         0
station      0
lon          0
lat          0
pm25       139
i            0
j            0
dtype: int64

time         0
station      0
lon          0
lat          0
pm25       105
i            0
j            0
dtype: int64

time         0
station      0
lon          0
lat          0
pm25       260
i            0
j            0
dtype: int64

time         0
station      0
lon          0
lat          0
pm25       152
i            0
j            0
dtype: int64

In [12]:
'''
all_valid_stations_2021_dfs = []
for station in df_station_2021["station"].unique():
    df_current_station = df_station_2021.loc[df_station_2021["station"] == station]

    # Fill missing pm25
    #mean_pm25_current_station = df_current_station["pm25"].mean()
    #if mean_pm25_current_station >= 0:
    #    df_current_station.loc[:, "pm25"] = df_current_station["pm25"].fillna(value=df_current_station["pm25"].mean())
    #    all_valid_stations_2021_dfs.append(df_current_station)
    #print(f"station = {station}, mean_pm25 = {mean_pm25_current_station}")

# Combine all data
df_station_2021 = pd.concat(all_valid_stations_2021_dfs).reset_index()
df_station_2021
'''

'\nall_valid_stations_2021_dfs = []\nfor station in df_station_2021["station"].unique():\n    df_current_station = df_station_2021.loc[df_station_2021["station"] == station]\n\n    # Fill missing pm25\n    #mean_pm25_current_station = df_current_station["pm25"].mean()\n    #if mean_pm25_current_station >= 0:\n    #    df_current_station.loc[:, "pm25"] = df_current_station["pm25"].fillna(value=df_current_station["pm25"].mean())\n    #    all_valid_stations_2021_dfs.append(df_current_station)\n    #print(f"station = {station}, mean_pm25 = {mean_pm25_current_station}")\n\n# Combine all data\ndf_station_2021 = pd.concat(all_valid_stations_2021_dfs).reset_index()\ndf_station_2021\n'

### Filter out the invalid stations

In [13]:
# Filter out the invalid station
all_valid_dfs = []
for station in df_station_2021["station"].unique():
    df_current_station = df_station_2021.loc[df_station_2021["station"] == station]
    number_of_missing_values = df_current_station["pm25"].isnull().sum()
    if number_of_missing_values >= 365:
        print(f"station {station} have {df_current_station['pm25'].isnull().sum()} missing values, skipping")
    else:
        print(f"station {station} have {df_current_station['pm25'].isnull().sum()} missing values")
        all_valid_dfs.append(df_current_station)
df_station_2021 = pd.concat(all_valid_dfs)
df_station_2021

station 201 have 16 missing values
station 202 have 365 missing values, skipping
station 211 have 192 missing values
station 212 have 116 missing values
station 213 have 139 missing values
station 214 have 105 missing values
station 215 have 260 missing values
station 216 have 152 missing values


Unnamed: 0,time,station,lon,lat,pm25,i,j
0,2021-01-01,201,106.700606,10.783057,17.250000,16,17
1,2021-01-02,201,106.700606,10.783057,23.000000,16,17
2,2021-01-03,201,106.700606,10.783057,28.291667,16,17
3,2021-01-04,201,106.700606,10.783057,28.791667,16,17
4,2021-01-05,201,106.700606,10.783057,24.833333,16,17
...,...,...,...,...,...,...,...
2915,2021-12-27,216,106.659300,10.780630,35.099753,16,16
2916,2021-12-28,216,106.659300,10.780630,24.782436,16,16
2917,2021-12-29,216,106.659300,10.780630,25.165238,16,16
2918,2021-12-30,216,106.659300,10.780630,16.656319,16,16


### Get the location-station map

In [14]:
# Get the location i, j of each station
all_stations = [int(ele) for ele in df_station_2021["station"].unique()]
station_ij_map_2021 = {}
for station in all_stations:
    i = int(df_station_2021.loc[df_station_2021["station"] == station]["i"].unique()[0])
    j = int(df_station_2021.loc[df_station_2021["station"] == station]["j"].unique()[0])
    station_ij_map_2021[station] = (i, j)
station_ij_map_2021

{201: (16, 17),
 211: (12, 22),
 212: (18, 14),
 213: (15, 14),
 214: (15, 18),
 215: (17, 17),
 216: (16, 16)}

## station2022.csv

### Read CSV

In [15]:
df_station_2022 = pd.read_csv(conf["dataset"]["mpair"]["station_2022_dir"])
df_station_2022

Unnamed: 0,time,ID,lat,lon,pm25,i,j
0,2022-01-01,S4,10.815839,106.717396,23.768238,15,18
1,2022-01-01,S5,10.776415,106.687955,14.221014,17,17
2,2022-01-01,S6,10.780482,106.659511,21.405015,16,16
3,2022-01-01,S7,10.783456,106.700622,21.695652,16,17
4,2022-01-02,S4,10.815839,106.717396,22.154723,15,18
...,...,...,...,...,...,...,...
993,2022-12-30,S6,10.780482,106.659511,33.091428,16,16
994,2022-12-30,S7,10.783456,106.700622,40.166667,16,17
995,2022-12-31,S4,10.815839,106.717396,27.879261,15,18
996,2022-12-31,S6,10.780482,106.659511,28.854069,16,16


### Handle metadata

In [16]:
# Convert "time" columns to Pandas datetime
df_station_2022 = df_station_2022.assign(time=pd.to_datetime(df_station_2022["time"]))

# Lower case all column names
df_station_2022 = df_station_2022.rename(columns={name: name.lower() for name in df_station_2022.columns})

# Rename "id" column to "station"
df_station_2022 = df_station_2022.rename(columns={"id": "station"})

# Print
df_station_2022

Unnamed: 0,time,station,lat,lon,pm25,i,j
0,2022-01-01,S4,10.815839,106.717396,23.768238,15,18
1,2022-01-01,S5,10.776415,106.687955,14.221014,17,17
2,2022-01-01,S6,10.780482,106.659511,21.405015,16,16
3,2022-01-01,S7,10.783456,106.700622,21.695652,16,17
4,2022-01-02,S4,10.815839,106.717396,22.154723,15,18
...,...,...,...,...,...,...,...
993,2022-12-30,S6,10.780482,106.659511,33.091428,16,16
994,2022-12-30,S7,10.783456,106.700622,40.166667,16,17
995,2022-12-31,S4,10.815839,106.717396,27.879261,15,18
996,2022-12-31,S6,10.780482,106.659511,28.854069,16,16


### Remap the stations to align with 2021 data

In [17]:
# Get the location i, j of each station
all_stations = df_station_2022["station"].unique()

station_ij_map_2022 = {}
for station in all_stations:
    i = int(df_station_2022.loc[df_station_2022["station"] == station]["i"].unique()[0])
    j = int(df_station_2022.loc[df_station_2022["station"] == station]["j"].unique()[0])
    station_ij_map_2022[station] = (i, j)
print(station_ij_map_2022)

# Generate the replace map
to_replace_station_map = {}
for station_2022, location_2022 in station_ij_map_2022.items():
    for station_2021, location_2021 in station_ij_map_2021.items():
        if location_2021 == location_2022:
            to_replace_station_map[station_2022] = station_2021
print(to_replace_station_map)

# Replace
df_station_2022.loc[:, ["station"]] = df_station_2022["station"].replace(to_replace=to_replace_station_map)

# Sort data by station then time
df_station_2022.sort_values(by=["station", "time"], inplace=True)

df_station_2022

{'S4': (15, 18), 'S5': (17, 17), 'S6': (16, 16), 'S7': (16, 17)}
{'S4': 214, 'S5': 215, 'S6': 216, 'S7': 201}


  df_station_2022.loc[:, ["station"]] = df_station_2022["station"].replace(to_replace=to_replace_station_map)


Unnamed: 0,time,station,lat,lon,pm25,i,j
3,2022-01-01,201,10.783456,106.700622,21.695652,16,17
7,2022-01-02,201,10.783456,106.700622,24.583333,16,17
11,2022-01-03,201,10.783456,106.700622,31.083333,16,17
15,2022-01-04,201,10.783456,106.700622,50.916667,16,17
19,2022-01-05,201,10.783456,106.700622,33.041667,16,17
...,...,...,...,...,...,...,...
984,2022-12-27,216,10.780482,106.659511,39.659816,16,16
987,2022-12-28,216,10.780482,106.659511,44.679352,16,16
990,2022-12-29,216,10.780482,106.659511,30.724708,16,16
993,2022-12-30,216,10.780482,106.659511,33.091428,16,16


### Check null

In [18]:
df_station_2022.isnull().sum()

time       0
station    0
lat        0
lon        0
pm25       0
i          0
j          0
dtype: int64

In [19]:
correct_time_stamp = pd.date_range(conf["dataset"]["aod2022"]["target_start_date"], conf["dataset"]["aod2022"]["target_end_date"], freq="D")

all_reindex_dfs = []
# Padding missing data for each station
for station in df_station_2022["station"].unique():
    # Get current station
    df_current_station = df_station_2022.loc[df_station_2022["station"] == station]

    # Set "time" column as index first
    # Then reindex with the correct date range
    # This will fill in the missing timestamp
    df_current_station.set_index("time", inplace=True)
    df_current_station = df_current_station.reindex(correct_time_stamp)

    # Fill the static data
    for feature in ["station", "lat", "lon", "i", "j"]:
        df_current_station[feature] = df_current_station[feature].iloc[0]

    # Fill the pm25
    #df_current_station["pm25"] = df_current_station["pm25"].fillna(df_current_station["pm25"].mean())
    #df_current_station["pm25"] = mice(df_current_station["pm25"])

    # Append data
    print(f"station = {station}, count = {len(df_current_station)}, start_date = {df_current_station.index[0]}, end_date = {df_current_station.index[-1]}")
    all_reindex_dfs.append(df_current_station.reset_index(names="time"))

# Combine all data
df_station_2022 = pd.concat(all_reindex_dfs).reset_index(drop=True)
df_station_2022

station = 201, count = 365, start_date = 2022-01-01 00:00:00, end_date = 2022-12-31 00:00:00
station = 214, count = 365, start_date = 2022-01-01 00:00:00, end_date = 2022-12-31 00:00:00
station = 215, count = 365, start_date = 2022-01-01 00:00:00, end_date = 2022-12-31 00:00:00
station = 216, count = 365, start_date = 2022-01-01 00:00:00, end_date = 2022-12-31 00:00:00


Unnamed: 0,time,station,lat,lon,pm25,i,j
0,2022-01-01,201,10.783456,106.700622,21.695652,16.0,17.0
1,2022-01-02,201,10.783456,106.700622,24.583333,16.0,17.0
2,2022-01-03,201,10.783456,106.700622,31.083333,16.0,17.0
3,2022-01-04,201,10.783456,106.700622,50.916667,16.0,17.0
4,2022-01-05,201,10.783456,106.700622,33.041667,16.0,17.0
...,...,...,...,...,...,...,...
1455,2022-12-27,216,10.780482,106.659511,39.659816,16.0,16.0
1456,2022-12-28,216,10.780482,106.659511,44.679352,16.0,16.0
1457,2022-12-29,216,10.780482,106.659511,30.724708,16.0,16.0
1458,2022-12-30,216,10.780482,106.659511,33.091428,16.0,16.0


# Combine 2 station data

## Check stations for each year

In [20]:
display(df_station_2021["station"].unique())
display(df_station_2022["station"].unique())

array([201, 211, 212, 213, 214, 215, 216])

array([201, 214, 215, 216])

## Start combining

In [21]:
display(df_station_2021)
display(df_station_2022)

Unnamed: 0,time,station,lon,lat,pm25,i,j
0,2021-01-01,201,106.700606,10.783057,17.250000,16,17
1,2021-01-02,201,106.700606,10.783057,23.000000,16,17
2,2021-01-03,201,106.700606,10.783057,28.291667,16,17
3,2021-01-04,201,106.700606,10.783057,28.791667,16,17
4,2021-01-05,201,106.700606,10.783057,24.833333,16,17
...,...,...,...,...,...,...,...
2915,2021-12-27,216,106.659300,10.780630,35.099753,16,16
2916,2021-12-28,216,106.659300,10.780630,24.782436,16,16
2917,2021-12-29,216,106.659300,10.780630,25.165238,16,16
2918,2021-12-30,216,106.659300,10.780630,16.656319,16,16


Unnamed: 0,time,station,lat,lon,pm25,i,j
0,2022-01-01,201,10.783456,106.700622,21.695652,16.0,17.0
1,2022-01-02,201,10.783456,106.700622,24.583333,16.0,17.0
2,2022-01-03,201,10.783456,106.700622,31.083333,16.0,17.0
3,2022-01-04,201,10.783456,106.700622,50.916667,16.0,17.0
4,2022-01-05,201,10.783456,106.700622,33.041667,16.0,17.0
...,...,...,...,...,...,...,...
1455,2022-12-27,216,10.780482,106.659511,39.659816,16.0,16.0
1456,2022-12-28,216,10.780482,106.659511,44.679352,16.0,16.0
1457,2022-12-29,216,10.780482,106.659511,30.724708,16.0,16.0
1458,2022-12-30,216,10.780482,106.659511,33.091428,16.0,16.0


In [22]:
# Merge 2 station table
df_station = pd.concat([df_station_2021, df_station_2022])

# Sort by station then time
df_station.sort_values(by=["station", "time"], inplace=True, ignore_index=True)

# Store file for debugging
df_station.to_csv("df_station.csv", index=False)

df_station

Unnamed: 0,time,station,lon,lat,pm25,i,j
0,2021-01-01,201,106.700606,10.783057,17.250000,16.0,17.0
1,2021-01-02,201,106.700606,10.783057,23.000000,16.0,17.0
2,2021-01-03,201,106.700606,10.783057,28.291667,16.0,17.0
3,2021-01-04,201,106.700606,10.783057,28.791667,16.0,17.0
4,2021-01-05,201,106.700606,10.783057,24.833333,16.0,17.0
...,...,...,...,...,...,...,...
4010,2022-12-27,216,106.659511,10.780482,39.659816,16.0,16.0
4011,2022-12-28,216,106.659511,10.780482,44.679352,16.0,16.0
4012,2022-12-29,216,106.659511,10.780482,30.724708,16.0,16.0
4013,2022-12-30,216,106.659511,10.780482,33.091428,16.0,16.0


## Check null

In [23]:
df_station.isnull().sum()

time          0
station       0
lon           0
lat           0
pm25       1442
i             0
j             0
dtype: int64

## Get the final station-location map

In [24]:
{station: (i, j) for station, i, j in list(df_station.loc[:, ["station", "i", "j"]].drop_duplicates().itertuples(index=False, name=None))}

{201: (16.0, 17.0),
 211: (12.0, 22.0),
 212: (18.0, 14.0),
 213: (15.0, 14.0),
 214: (15.0, 18.0),
 215: (17.0, 17.0),
 216: (16.0, 16.0)}

# Merge MPair data with station data (inner join)

## Start merging

In [25]:
df_mpair = df_mpair_raw.merge(df_station, how="inner", on=["time", "i", "j"])
df_mpair

Unnamed: 0,i,j,lat_x,lon_x,time,pm25_3km,tmp,rh,hpbl,wspd,...,cropland,grassland,treecover,water,ndvi,aod,station,lon_y,lat_y,pm25
0,12,22,10.879919,106.803950,2021-01-01,30.366625,25.749989,61.403439,533.919250,3.311250,...,20562.897823,151361.367404,282947.325347,39120.320073,,-9999.0,211,106.796200,10.870190,
1,15,14,10.812461,106.620857,2021-01-01,36.621094,25.925655,58.359192,536.831421,4.195177,...,20717.659283,58947.442429,108246.100356,492.146554,2580.669322,-9999.0,213,106.620500,10.816350,
2,15,18,10.812461,106.712403,2021-01-01,36.649185,26.025026,59.134865,579.871704,3.491554,...,6083.319648,17783.771810,95404.824438,127492.687793,2042.834978,-9999.0,214,106.717500,10.815570,
3,16,16,10.789976,106.666630,2021-01-01,39.352276,25.633114,61.730026,479.575165,4.054595,...,28.761909,7179.267694,48393.446780,12262.365566,1686.406661,-9999.0,216,106.659300,10.780630,
4,16,17,10.789976,106.689516,2021-01-01,38.528156,25.759054,61.073494,520.164246,3.809916,...,9.037443,1221.246398,86858.898791,32195.290892,1830.941629,-9999.0,201,106.700606,10.783057,17.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4010,17,17,10.767490,106.689516,2022-12-30,19.083696,25.032667,64.868401,669.416260,3.893373,...,229.889184,2906.313289,68310.042059,46929.703983,1876.516122,-9999.0,215,106.687955,10.776415,
4011,15,18,10.812461,106.712403,2022-12-31,28.247709,26.630608,55.478946,607.372803,4.216505,...,6083.319648,17783.771810,95404.824438,127492.687793,2042.834978,-9999.0,214,106.717396,10.815839,27.879261
4012,16,16,10.789976,106.666630,2022-12-31,27.739468,25.420719,59.543566,783.973083,4.161581,...,28.761909,7179.267694,48393.446780,12262.365566,1686.406661,-9999.0,216,106.659511,10.780482,28.854069
4013,16,17,10.789976,106.689516,2022-12-31,26.935106,25.481535,59.598768,778.749390,4.166253,...,9.037443,1221.246398,86858.898791,32195.290892,1830.941629,-9999.0,201,106.700622,10.783456,33.958333


In [26]:
df_mpair.isnull().sum()

i                       0
j                       0
lat_x                   0
lon_x                   0
time                    0
pm25_3km                0
tmp                     0
rh                      0
hpbl                    0
wspd                    0
pop                     0
road_den_1km            0
prim_road_len_1km    2555
near_dist               0
bareland                0
builtup                 0
cropland                0
grassland               0
treecover               0
water                   0
ndvi                  365
aod                     7
station                 0
lon_y                   0
lat_y                   0
pm25                 1442
dtype: int64

## Handle metadata

In [27]:
# Drop redundant columns
df_mpair.drop(columns=["lat_y", "lon_y"], axis=1, inplace=True)

# Rename "lat_x" to "lat", "lon_x" to "lon"
df_mpair.rename(columns={"lat_x": "lat", "lon_x": "lon"}, inplace=True)

# Sort data by "station" then "time"
df_mpair.sort_values(by=["station", "time"], inplace=True)

df_mpair

Unnamed: 0,i,j,lat,lon,time,pm25_3km,tmp,rh,hpbl,wspd,...,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,station,pm25
4,16,17,10.789976,106.689516,2021-01-01,38.528156,25.759054,61.073494,520.164246,3.809916,...,1056.256477,878404.088663,9.037443,1221.246398,86858.898791,32195.290892,1830.941629,-9999.000000,201,17.250000
11,16,17,10.789976,106.689516,2021-01-02,57.013439,25.378061,61.142441,524.404724,2.527972,...,1056.256477,878404.088663,9.037443,1221.246398,86858.898791,32195.290892,1830.941629,0.317342,201,23.000000
18,16,17,10.789976,106.689516,2021-01-03,57.202629,26.691256,62.528309,467.049805,1.928542,...,1056.256477,878404.088663,9.037443,1221.246398,86858.898791,32195.290892,1830.941629,-9999.000000,201,28.291667
25,16,17,10.789976,106.689516,2021-01-04,61.890816,27.549019,62.431583,600.658997,1.653818,...,1056.256477,878404.088663,9.037443,1221.246398,86858.898791,32195.290892,1830.941629,-9999.000000,201,28.791667
32,16,17,10.789976,106.689516,2021-01-05,70.329758,27.360172,63.041634,654.842773,2.941231,...,1056.256477,878404.088663,9.037443,1221.246398,86858.898791,32195.290892,1830.941629,-9999.000000,201,24.833333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3996,16,16,10.789976,106.666630,2022-12-27,34.151028,24.213715,69.134701,463.417114,2.020892,...,2272.289336,929982.248561,28.761909,7179.267694,48393.446780,12262.365566,1686.406661,-9999.000000,216,39.659816
4000,16,16,10.789976,106.666630,2022-12-28,29.592785,25.896009,62.308920,647.989014,2.376687,...,2272.289336,929982.248561,28.761909,7179.267694,48393.446780,12262.365566,1686.406661,-9999.000000,216,44.679352
4004,16,16,10.789976,106.666630,2022-12-29,27.141323,25.894251,63.947048,813.575928,2.991689,...,2272.289336,929982.248561,28.761909,7179.267694,48393.446780,12262.365566,1686.406661,0.193722,216,30.724708
4008,16,16,10.789976,106.666630,2022-12-30,19.889748,25.035917,61.037365,668.597046,4.194347,...,2272.289336,929982.248561,28.761909,7179.267694,48393.446780,12262.365566,1686.406661,-9999.000000,216,33.091428


In [28]:
df_mpair.isnull().sum()

i                       0
j                       0
lat                     0
lon                     0
time                    0
pm25_3km                0
tmp                     0
rh                      0
hpbl                    0
wspd                    0
pop                     0
road_den_1km            0
prim_road_len_1km    2555
near_dist               0
bareland                0
builtup                 0
cropland                0
grassland               0
treecover               0
water                   0
ndvi                  365
aod                     7
station                 0
pm25                 1442
dtype: int64

## Export data

In [29]:
df_mpair.to_csv(conf["dataset"]["mpair"]["merged_data_dir"], index=False)

# Merge MPair data with station data, keep all locations

## Filter out the invalid location

In [30]:
df_mpair_raw

Unnamed: 0,i,j,lat,lon,time,pm25_3km,tmp,rh,hpbl,wspd,...,prim_road_len_1km,near_dist,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod
0,0,0,11.149747,106.300443,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0
1,0,1,11.149747,106.323330,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0
2,0,2,11.149747,106.346217,2021-01-01,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0
3,0,3,11.149747,106.369103,2021-01-01,26.040001,25.114687,63.633778,567.478943,4.660580,...,,7.265194e+01,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,-9999.0
4,0,4,11.149747,106.391990,2021-01-01,25.666174,25.121210,62.159500,548.132568,5.052152,...,,1.107789e+02,4569.189136,21119.949784,76052.614180,231046.865089,662859.174184,4308.164827,,-9999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894245,34,30,10.385233,106.987043,2022-12-31,,25.815809,74.737156,207.873199,3.124345,...,,3.072120e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.0
894246,34,31,10.385233,107.009930,2022-12-31,,25.815809,75.000397,207.873199,3.215876,...,,4.355371e+03,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,,-9999.0
894247,34,32,10.385233,107.032816,2022-12-31,-9999.000000,25.601173,-9999.000000,142.227615,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0
894248,34,33,10.385233,107.055703,2022-12-31,-9999.000000,25.560753,-9999.000000,129.865280,-9999.000000,...,0.0,-1.797693e+308,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.000000,-9999.0,-9999.0


In [31]:
# Get the start and end of i, j
mpair_start_i, mpair_end_i = sorted(df_mpair_raw["i"].unique())[0], sorted(df_mpair_raw["i"].unique())[-1]
mpair_start_j, mpair_end_j = sorted(df_mpair_raw["j"].unique())[0], sorted(df_mpair_raw["j"].unique())[-1]
print(mpair_start_i, mpair_end_i, mpair_start_j, mpair_end_j)

# Loop over every location
all_valid_location_dfs = []
for i in range(mpair_start_i, mpair_end_i + 1):
    for j in range(mpair_start_j, mpair_end_j + 1):
        # Get the data for current location
        df_mpair_current_ij = df_mpair_raw.loc[((df_mpair_raw["i"] == i) & (df_mpair_raw["j"] == j))]

        # Replace the invalid value with NaN
        df_mpair_current_ij = df_mpair_current_ij.replace(-9999, np.nan)

        # Filter out values based on pm25_3km
        total_missing = df_mpair_current_ij["pm25_3km"].isnull().sum()
        #missing_threshold = 1000
        missing_threshold = 365 # strictly
        if total_missing >= missing_threshold:
            df_mpair_current_ij.to_csv(os.path.join(conf["workspace"]["data_by_location_dir"], f"df_mpair_current_{i}_{j}_invalid.csv"), index=False)
            print(f"Location: {i}, {j} - missing = {total_missing} => invalid, skipping...")
        else:
            df_mpair_current_ij.to_csv(os.path.join(conf["workspace"]["data_by_location_dir"], f"df_mpair_current_{i}_{j}.csv"), index=False)
            all_valid_location_dfs += [df_mpair_current_ij]

# Combine all valid location data
df_mpair_raw_valid_locations = pd.concat(all_valid_location_dfs, ignore_index=True)

0 34 0 34
Location: 0, 0 - missing = 730 => invalid, skipping...
Location: 0, 1 - missing = 730 => invalid, skipping...
Location: 0, 2 - missing = 730 => invalid, skipping...
Location: 0, 32 - missing = 730 => invalid, skipping...
Location: 0, 33 - missing = 730 => invalid, skipping...
Location: 0, 34 - missing = 730 => invalid, skipping...
Location: 1, 0 - missing = 730 => invalid, skipping...
Location: 1, 1 - missing = 730 => invalid, skipping...
Location: 1, 2 - missing = 730 => invalid, skipping...
Location: 1, 32 - missing = 730 => invalid, skipping...
Location: 1, 33 - missing = 730 => invalid, skipping...
Location: 1, 34 - missing = 730 => invalid, skipping...
Location: 2, 0 - missing = 730 => invalid, skipping...
Location: 2, 1 - missing = 730 => invalid, skipping...
Location: 2, 2 - missing = 730 => invalid, skipping...
Location: 2, 32 - missing = 730 => invalid, skipping...
Location: 2, 33 - missing = 730 => invalid, skipping...
Location: 2, 34 - missing = 730 => invalid, ski

In [32]:
df_mpair_raw_valid_locations

Unnamed: 0,i,j,lat,lon,time,pm25_3km,tmp,rh,hpbl,wspd,...,prim_road_len_1km,near_dist,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod
0,0,3,11.149747,106.369103,2021-01-01,26.040001,25.114687,63.633778,567.478943,4.660580,...,,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,
1,0,3,11.149747,106.369103,2021-01-02,39.470001,24.043423,69.082085,485.584290,3.288929,...,,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,
2,0,3,11.149747,106.369103,2021-01-03,40.619999,25.058735,67.647362,403.936310,2.731468,...,,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,
3,0,3,11.149747,106.369103,2021-01-04,40.330002,25.554197,71.023094,704.476807,3.522596,...,,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,0.555355
4,0,3,11.149747,106.369103,2021-01-05,50.009998,25.037054,74.029236,534.426575,3.223711,...,,72.651942,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713205,34,27,10.385233,106.918383,2022-12-27,19.874584,24.644707,76.055573,441.420624,2.561953,...,,365.912235,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,
713206,34,27,10.385233,106.918383,2022-12-28,17.509365,25.071791,78.165489,403.261658,2.126233,...,,365.912235,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,
713207,34,27,10.385233,106.918383,2022-12-29,18.428423,25.458431,79.421234,425.696411,1.909332,...,,365.912235,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,0.174487
713208,34,27,10.385233,106.918383,2022-12-30,12.645213,24.636055,76.198395,241.369736,2.530340,...,,365.912235,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,


In [33]:
df_mpair_raw_valid_locations.isnull().sum()

i                         0
j                         0
lat                       0
lon                       0
time                      0
pm25_3km                  0
tmp                       0
rh                        0
hpbl                      0
wspd                      0
pop                       0
road_den_1km              0
prim_road_len_1km    708830
near_dist                 0
bareland                  0
builtup                   0
cropland                  0
grassland                 0
treecover                 0
water                     0
ndvi                 556260
aod                  429477
dtype: int64

## Start merging

In [34]:
df_mpair_valid_locations = pd.merge(df_mpair_raw_valid_locations, df_station, how="left", on=["i", "j", "time"])
df_mpair_valid_locations

Unnamed: 0,i,j,lat_x,lon_x,time,pm25_3km,tmp,rh,hpbl,wspd,...,cropland,grassland,treecover,water,ndvi,aod,station,lon_y,lat_y,pm25
0,0,3,11.149747,106.369103,2021-01-01,26.040001,25.114687,63.633778,567.478943,4.660580,...,54101.838731,203551.710735,683877.574149,13604.234765,,,,,,
1,0,3,11.149747,106.369103,2021-01-02,39.470001,24.043423,69.082085,485.584290,3.288929,...,54101.838731,203551.710735,683877.574149,13604.234765,,,,,,
2,0,3,11.149747,106.369103,2021-01-03,40.619999,25.058735,67.647362,403.936310,2.731468,...,54101.838731,203551.710735,683877.574149,13604.234765,,,,,,
3,0,3,11.149747,106.369103,2021-01-04,40.330002,25.554197,71.023094,704.476807,3.522596,...,54101.838731,203551.710735,683877.574149,13604.234765,,0.555355,,,,
4,0,3,11.149747,106.369103,2021-01-05,50.009998,25.037054,74.029236,534.426575,3.223711,...,54101.838731,203551.710735,683877.574149,13604.234765,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713205,34,27,10.385233,106.918383,2022-12-27,19.874584,24.644707,76.055573,441.420624,2.561953,...,8711.125789,154921.372745,91390.837123,426893.893429,,,,,,
713206,34,27,10.385233,106.918383,2022-12-28,17.509365,25.071791,78.165489,403.261658,2.126233,...,8711.125789,154921.372745,91390.837123,426893.893429,,,,,,
713207,34,27,10.385233,106.918383,2022-12-29,18.428423,25.458431,79.421234,425.696411,1.909332,...,8711.125789,154921.372745,91390.837123,426893.893429,,0.174487,,,,
713208,34,27,10.385233,106.918383,2022-12-30,12.645213,24.636055,76.198395,241.369736,2.530340,...,8711.125789,154921.372745,91390.837123,426893.893429,,,,,,


## Handle metadata

In [35]:
# Drop redundant columns
df_mpair_valid_locations.drop(columns=["lat_y", "lon_y"], axis=1, inplace=True)

# Rename "lat_x" to "lat", "lon_x" to "lon"
df_mpair_valid_locations.rename(columns={"lat_x": "lat", "lon_x": "lon"}, inplace=True)

# Sort data by location then "time"
df_mpair_valid_locations.sort_values(by=["i", "j", "time"], inplace=True)

df_mpair_valid_locations

Unnamed: 0,i,j,lat,lon,time,pm25_3km,tmp,rh,hpbl,wspd,...,bareland,builtup,cropland,grassland,treecover,water,ndvi,aod,station,pm25
0,0,3,11.149747,106.369103,2021-01-01,26.040001,25.114687,63.633778,567.478943,4.660580,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
1,0,3,11.149747,106.369103,2021-01-02,39.470001,24.043423,69.082085,485.584290,3.288929,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
2,0,3,11.149747,106.369103,2021-01-03,40.619999,25.058735,67.647362,403.936310,2.731468,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
3,0,3,11.149747,106.369103,2021-01-04,40.330002,25.554197,71.023094,704.476807,3.522596,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,0.555355,,
4,0,3,11.149747,106.369103,2021-01-05,50.009998,25.037054,74.029236,534.426575,3.223711,...,14453.144036,30347.516088,54101.838731,203551.710735,683877.574149,13604.234765,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
713205,34,27,10.385233,106.918383,2022-12-27,19.874584,24.644707,76.055573,441.420624,2.561953,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,
713206,34,27,10.385233,106.918383,2022-12-28,17.509365,25.071791,78.165489,403.261658,2.126233,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,
713207,34,27,10.385233,106.918383,2022-12-29,18.428423,25.458431,79.421234,425.696411,1.909332,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,0.174487,,
713208,34,27,10.385233,106.918383,2022-12-30,12.645213,24.636055,76.198395,241.369736,2.530340,...,24482.287914,48934.328055,8711.125789,154921.372745,91390.837123,426893.893429,,,,


## Export data

In [36]:
df_mpair_valid_locations.to_csv(conf["dataset"]["mpair"]["merged_data_dir_all_locations"], index=False)