## Dataset Preprocessing Notebook

#### Load required modules

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import os
from urllib.parse import urlparse
from os.path import basename
from rich.progress import track

#### Load weather and air quality data

In [3]:
# Download the compressed weather and air quality CSV files from cloud storage.
# Each file is streamed to '<base_dir><url path>' in chunks of `chunk_size` bytes.

base_dir = '../data'
chunk_size = 10_240
urls = ['https://pub-6924d15baafa48c6a31c2d212b094174.r2.dev/environment/air_quality-13t20_97t106_20240910_20241211.csv.bz2',
        'https://pub-6924d15baafa48c6a31c2d212b094174.r2.dev/environment/weather-13t20_97t106_20240910_20241211.csv.bz2']

for url in urls:
    file = basename(url)
    parse_url = urlparse(url)
    # The URL path starts with '/', so this yields '../data/environment/<file>'.
    target_path = f'{base_dir}{parse_url.path}'
    # open() does not create missing folders; make sure the destination exists.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    print(f'Getting file {file}')
    with requests.get(url, stream=True) as res:
        if res.ok:
            # Content-Length is optional (e.g. chunked responses). Without it the
            # progress bar gets total=None instead of crashing on int(None).
            content_length = res.headers.get('Content-Length')
            # Ceiling division: a trailing partial chunk still counts as one step.
            total_chunks = -(-int(content_length) // chunk_size) if content_length else None
            with open(target_path, 'wb') as f:
                for chunk in track(res.iter_content(chunk_size), total=total_chunks, description='Download...'):
                    f.write(chunk)
        else:
            # Best effort: report the failure with its status code and continue.
            print(f'Error getting file!!! (HTTP {res.status_code})')
print('Done...')

Output()

Getting file air_quality-13t20_97t106_20240910_20241211.csv.bz2


Getting file weather-13t20_97t106_20240910_20241211.csv.bz2


Output()

Done...


In [4]:
# Read the two downloaded CSV files into separate DataFrames
# (pandas infers the bz2 compression from the file suffix by default).
air_quality_df = pd.read_csv(
    '../data/environment/air_quality-13t20_97t106_20240910_20241211.csv.bz2'
)
weather_df = pd.read_csv(
    '../data/environment/weather-13t20_97t106_20240910_20241211.csv.bz2'
)

#### Clean data

##### Process Weather Dataset

In [7]:
# count missing values per column in the weather data
weather_df.isna().sum()

date                         0
latitude                     0
longitude                    0
temperature_2m           30096
relative_humidity_2m     30096
dew_point_2m             30096
pressure_msl             30096
surface_pressure         30096
visibility              363888
wind_speed_10m           30096
wind_speed_80m           30096
wind_speed_100m         363888
wind_speed_120m          30096
wind_speed_180m          30096
wind_direction_10m       30096
wind_direction_80m       30096
wind_direction_100m     363888
wind_direction_120m      30096
wind_direction_180m      30096
temperature_80m          30096
temperature_120m         30096
temperature_180m         30096
type                         0
dtype: int64

In [8]:
# Clean the weather data in one chain:
#   1. drop the unused columns (errors='ignore' keeps the cell re-runnable),
#   2. drop the rows that have no temperature_2m value,
#   3. parse the ISO 8601 'date' strings into UTC datetimes.
weather_df = (
    weather_df
    .drop(columns=['type', 'visibility', 'wind_speed_100m',
                   'wind_direction_100m'], errors='ignore')
    .dropna(subset=['temperature_2m'])
    .assign(date=lambda frame: pd.to_datetime(frame['date'], utc=True, format='ISO8601'))
)

# confirm that no missing values remain
weather_df.isna().sum()

date                    0
latitude                0
longitude               0
temperature_2m          0
relative_humidity_2m    0
dew_point_2m            0
pressure_msl            0
surface_pressure        0
wind_speed_10m          0
wind_speed_80m          0
wind_speed_120m         0
wind_speed_180m         0
wind_direction_10m      0
wind_direction_80m      0
wind_direction_120m     0
wind_direction_180m     0
temperature_80m         0
temperature_120m        0
temperature_180m        0
dtype: int64

##### Process Air Quality Dataset

In [10]:
# count missing values per column in the air quality data
air_quality_df.isna().sum()

date                      0
latitude                  0
longitude                 0
pm10                      0
pm2_5                     0
carbon_monoxide           0
carbon_dioxide      3020544
nitrogen_dioxide          0
sulphur_dioxide           0
ozone                     0
dust                      0
uv_index                  0
type                      0
dtype: int64

In [11]:
# Clean the air quality data in one chain:
#   1. drop the unused columns (errors='ignore' keeps the cell re-runnable),
#   2. parse the ISO 8601 'date' strings into UTC datetimes.
air_quality_df = (
    air_quality_df
    .drop(columns=['type', 'carbon_dioxide'], errors='ignore')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], utc=True, format='ISO8601'))
)

In [12]:
# re-count missing values per column after cleaning the air quality data
air_quality_df.isna().sum()

date                0
latitude            0
longitude           0
pm10                0
pm2_5               0
carbon_monoxide     0
nitrogen_dioxide    0
sulphur_dioxide     0
ozone               0
dust                0
uv_index            0
dtype: int64

In [13]:
# summarize the weather timestamps (count, min, max, quartiles)
weather_df.loc[:, 'date'].describe()

count                                6076656
mean     2024-10-26 17:00:00.000001024+00:00
min                2024-09-10 11:00:00+00:00
25%                2024-10-03 14:00:00+00:00
50%                2024-10-26 17:00:00+00:00
75%                2024-11-18 20:00:00+00:00
max                2024-12-11 23:00:00+00:00
Name: date, dtype: object

In [14]:
# summarize the air quality timestamps (count, min, max, quartiles)
air_quality_df.loc[:, 'date'].describe()

count                                6106752
mean     2024-10-26 11:29:59.999999232+00:00
min                2024-09-10 00:00:00+00:00
25%                2024-10-03 05:45:00+00:00
50%                2024-10-26 11:30:00+00:00
75%                2024-11-18 17:15:00+00:00
max                2024-12-11 23:00:00+00:00
Name: date, dtype: object

In [15]:
# keep only the weather rows inside the time window used for both datasets
# (between() includes both bounds by default)
weather_df = weather_df[
    weather_df['date'].between('2024-09-10 11:00:00+00:00',
                               '2024-12-05 11:00:00+00:00')
]

In [16]:
# keep only the air quality rows inside the same time window as the weather data
# (between() includes both bounds by default)
air_quality_df = air_quality_df[
    air_quality_df['date'].between('2024-09-10 11:00:00+00:00',
                                   '2024-12-05 11:00:00+00:00')
]

In [17]:
# Index both frames by (date, latitude, longitude) so they can be aligned.
# set_index returns a new frame, so weather_df / air_quality_df stay untouched.
join_keys = ['date', 'latitude', 'longitude']
weather_df1 = weather_df.set_index(join_keys, drop=True)
air_quality_df1 = air_quality_df.set_index(join_keys, drop=True)

# Combine weather and air quality side by side. concat(axis=1) aligns rows on
# the index and, by default, keeps the union of keys from both frames.
env_df = pd.concat([weather_df1, air_quality_df1], axis=1)

In [18]:
# show sample combined data
env_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,temperature_2m,relative_humidity_2m,dew_point_2m,pressure_msl,surface_pressure,wind_speed_10m,wind_speed_80m,wind_speed_120m,wind_speed_180m,wind_direction_10m,...,temperature_120m,temperature_180m,pm10,pm2_5,carbon_monoxide,nitrogen_dioxide,sulphur_dioxide,ozone,dust,uv_index
date,latitude,longitude,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2024-09-10 11:00:00+00:00,13.3,97.1,27.75,88.0,25.579264,1006.6,1006.6,21.175835,24.248613,24.933735,25.729298,252.18102,...,26.1,25.3,32.5,15.7,106.0,0.1,0.2,42.0,8.0,0.05
2024-09-10 11:00:00+00:00,13.3,97.25,27.7,88.0,25.530062,1006.7,1006.7,20.49156,23.452555,23.795427,24.1388,251.56496,...,26.0,25.2,32.5,15.7,106.0,0.1,0.2,42.0,8.0,0.05
2024-09-10 11:00:00+00:00,13.3,97.4,27.7,87.0,25.337757,1006.7,1006.7,20.49156,22.768398,23.565567,23.906818,251.56496,...,26.0,25.2,31.0,15.1,107.0,0.1,0.2,41.0,7.0,0.05
2024-09-10 11:00:00+00:00,13.3,97.55,27.7,87.0,25.337757,1006.8,1006.8,19.694992,21.9718,22.314194,23.110205,251.89613,...,26.0,25.2,31.0,15.1,107.0,0.1,0.2,41.0,7.0,0.05
2024-09-10 11:00:00+00:00,13.3,97.7,27.7,87.0,25.337757,1006.8,1006.8,18.671474,20.833395,21.288757,21.629978,250.86626,...,26.0,25.2,31.0,15.1,107.0,0.1,0.2,41.0,7.0,0.05


In [19]:
# Persist the combined frame in parquet format.
# NOTE(review): the file name says 20240911_20241211, but the filter cells keep
# 2024-09-10 11:00 through 2024-12-05 11:00 (UTC) - confirm which is intended.
env_df.to_parquet(path='../data/environment/combine_data-20240911_20241211.parquet')