In [1]:
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.subplots as sp

In [2]:
# Read the full air-quality table from the Parquet file into a DataFrame.
df = pd.read_parquet(path='air_quality_data.parquet')

In [3]:
# Keep a handle on the column index and print it as a quick schema check.
columns = df.columns
print(columns)

Index(['lon', 'time', 'lat', 'DUEXTTAU', 'BCFLUXU', 'OCFLUXV', 'BCANGSTR',
       'SUFLUXV', 'SSSMASS25', 'SSSMASS', 'OCSMASS', 'BCCMASS', 'BCSMASS',
       'SO4CMASS', 'SSFLUXU', 'DUCMASS', 'SSEXTTAU', 'SO2CMASS', 'DUSCATAU',
       'OCANGSTR', 'OCCMASS', 'TOTEXTTAU', 'DUSCAT25', 'TOTANGSTR', 'DMSCMASS',
       'SSEXTT25', 'DUANGSTR', 'DMSSMASS', 'BCEXTTAU', 'SSSCATAU', 'DUFLUXV',
       'DUFLUXU', 'SUEXTTAU', 'SSFLUXV', 'BCSCATAU', 'DUCMASS25', 'OCEXTTAU',
       'SUANGSTR', 'SSSCAT25', 'SSCMASS25', 'SO4SMASS', 'DUSMASS', 'SUFLUXU',
       'BCFLUXV', 'DUSMASS25', 'SSCMASS', 'SUSCATAU', 'SO2SMASS', 'SSANGSTR',
       'DUEXTT25', 'OCFLUXU', 'OCSCATAU', 'TOTSCATAU', 'source_file',
       'PM25_MERRA2', 'PM25_ug_m3', 'class'],
      dtype='object')


In [4]:
# Bounding box of the observations: extreme latitude and longitude values
# present in the frame.
lat_min = df['lat'].min()
lat_max = df['lat'].max()
lon_min = df['lon'].min()
lon_max = df['lon'].max()

In [5]:
# Partition the observed lat/lon bounding box into a grid_size x grid_size lattice
# of equal-width bins (grid_size + 1 edges per axis).
grid_size = 20
lat_bins = np.linspace(lat_min, lat_max, grid_size + 1)
lon_bins = np.linspace(lon_min, lon_max, grid_size + 1)

# Assign grid positions: integer bin index (0 .. grid_size - 1) per row on each axis.
# include_lowest=True closes the first interval on the left so rows sitting exactly
# on the minimum edge still receive a bin; rows with a missing coordinate yield NaN.
lat_positions = pd.cut(df['lat'], bins=lat_bins, labels=False, include_lowest=True)
lon_positions = pd.cut(df['lon'], bins=lon_bins, labels=False, include_lowest=True)

# Create location ID (0-399 for 20x20 grid), row-major: lat index * grid_size + lon index
df['location'] = lat_positions * grid_size + lon_positions

# Rows that could not be binned are still folded into cell 0 (same result as before),
# but report how many so they are not silently mistaken for genuine cell-0 samples.
unbinned_rows = int(df['location'].isna().sum())
if unbinned_rows:
    print(f"Warning: {unbinned_rows} rows have no grid cell (missing lat/lon); assigning location 0")

# Ensure location is integer
df['location'] = df['location'].fillna(0).astype(int)

In [6]:
# Binarize the air-quality label: 1 for 'Good'/'Moderate', 0 for everything else.
# Guarded so re-running the cell is a no-op: once the column already holds integers,
# applying isin() against the string labels again would silently zero out every row.
if not pd.api.types.is_integer_dtype(df['class']):
    df['class'] = df['class'].isin(['Good', 'Moderate']).astype(int)

In [7]:
# Parse the timestamp column into pandas datetimes.
df['time'] = pd.to_datetime(df['time'])

# Break the timestamp into calendar components, one new column per component,
# added in the order year, month, day, hour.
timestamps = df['time'].dt
for part in ('year', 'month', 'day', 'hour'):
    df[part] = getattr(timestamps, part)


In [8]:
# Disabled filter: uncommenting the next line would keep only rows whose 'year' is 2025.
#df = df[df['year'] == 2025]

In [9]:
# DataFrame.info() writes its summary to stdout itself and returns None, so wrapping
# it in print() only appends a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14006400 entries, 0 to 14006399
Data columns (total 62 columns):
 #   Column       Dtype         
---  ------       -----         
 0   lon          float32       
 1   time         datetime64[ns]
 2   lat          float32       
 3   DUEXTTAU     float32       
 4   BCFLUXU      float32       
 5   OCFLUXV      float32       
 6   BCANGSTR     float32       
 7   SUFLUXV      float32       
 8   SSSMASS25    float32       
 9   SSSMASS      float32       
 10  OCSMASS      float32       
 11  BCCMASS      float32       
 12  BCSMASS      float32       
 13  SO4CMASS     float32       
 14  SSFLUXU      float32       
 15  DUCMASS      float32       
 16  SSEXTTAU     float32       
 17  SO2CMASS     float32       
 18  DUSCATAU     float32       
 19  OCANGSTR     float32       
 20  OCCMASS      float32       
 21  TOTEXTTAU    float32       
 22  DUSCAT25     float32       
 23  TOTANGSTR    float32       
 24  DMSCMASS     float32  

In [10]:
# Remove the raw coordinate, provenance, and timestamp columns in place; 'location'
# and the year/month/day/hour columns derived from them earlier remain in the frame.
df.drop(['lat', 'lon', 'source_file', 'time'], axis='columns', inplace=True)

In [11]:
# List the remaining columns to confirm the drop and the newly added feature columns.
print(df.columns)

Index(['DUEXTTAU', 'BCFLUXU', 'OCFLUXV', 'BCANGSTR', 'SUFLUXV', 'SSSMASS25',
       'SSSMASS', 'OCSMASS', 'BCCMASS', 'BCSMASS', 'SO4CMASS', 'SSFLUXU',
       'DUCMASS', 'SSEXTTAU', 'SO2CMASS', 'DUSCATAU', 'OCANGSTR', 'OCCMASS',
       'TOTEXTTAU', 'DUSCAT25', 'TOTANGSTR', 'DMSCMASS', 'SSEXTT25',
       'DUANGSTR', 'DMSSMASS', 'BCEXTTAU', 'SSSCATAU', 'DUFLUXV', 'DUFLUXU',
       'SUEXTTAU', 'SSFLUXV', 'BCSCATAU', 'DUCMASS25', 'OCEXTTAU', 'SUANGSTR',
       'SSSCAT25', 'SSCMASS25', 'SO4SMASS', 'DUSMASS', 'SUFLUXU', 'BCFLUXV',
       'DUSMASS25', 'SSCMASS', 'SUSCATAU', 'SO2SMASS', 'SSANGSTR', 'DUEXTT25',
       'OCFLUXU', 'OCSCATAU', 'TOTSCATAU', 'PM25_MERRA2', 'PM25_ug_m3',
       'class', 'location', 'year', 'month', 'day', 'hour'],
      dtype='object')


In [12]:
# Persist the processed frame as Parquet via pyarrow, without writing the index.
df.to_parquet(
    "air_quality_data_all.parquet",
    index=False,
    engine="pyarrow",
)