In [1]:
import pandas as pd
import numpy as np
import os
import pyarrow.parquet as pq
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.subplots as sp

In [2]:
df = pd.read_parquet('air_quality_data_all.parquet')
df.drop(columns=['PM25_ug_m3', 'DUSMASS25', 'DUSCATAU', 'DUEXTT25', 'DUSCAT25', 'SO4CMASS', 'SSCMASS25', 'SUEXTTAU', 'DUSMASS', 'TOTEXTTAU', 'DUEXTTAU', 'TOTSCATAU', 'DUCMASS25', 'BCEXTTAU', 'BCSCATAU'], inplace=True)

In [None]:
columns = df.columns
print(columns)

Index(['BCFLUXU', 'OCFLUXV', 'BCANGSTR', 'SUFLUXV', 'SSSMASS25', 'SSSMASS',
       'OCSMASS', 'BCCMASS', 'BCSMASS', 'SSFLUXU', 'DUCMASS', 'SSEXTTAU',
       'SO2CMASS', 'OCANGSTR', 'OCCMASS', 'TOTANGSTR', 'DMSCMASS', 'SSEXTT25',
       'DUANGSTR', 'DMSSMASS', 'SSSCATAU', 'DUFLUXV', 'DUFLUXU', 'SSFLUXV',
       'OCEXTTAU', 'SUANGSTR', 'SSSCAT25', 'SO4SMASS', 'SUFLUXU', 'BCFLUXV',
       'SSCMASS', 'SUSCATAU', 'SO2SMASS', 'SSANGSTR', 'OCFLUXU', 'OCSCATAU',
       'PM25_MERRA2', 'class', 'location', 'year', 'month', 'day', 'hour'],
      dtype='object')


In [3]:
# Get numerical features (exclude non-numerical columns)
numerical_features = df.select_dtypes(include=[np.number]).columns.tolist()
numerical_features = [col for col in numerical_features if col != 'class']

correlation_matrix = df[numerical_features + ['class']].corr()
class_correlations = correlation_matrix['class'].abs().sort_values(ascending=False)
print("Top 15 features most correlated with class:")
print(class_correlations[1:])

Top 15 features most correlated with class:
PM25_MERRA2    0.622601
DUCMASS        0.475727
TOTANGSTR      0.437979
location       0.217848
DUFLUXV        0.215625
SSFLUXV        0.163407
DUFLUXU        0.154075
BCCMASS        0.144977
SSSMASS25      0.129199
SUFLUXV        0.124042
SSSMASS        0.120738
BCFLUXV        0.117109
SSCMASS        0.113901
SUSCATAU       0.108830
DUANGSTR       0.104356
OCCMASS        0.100649
OCEXTTAU       0.100447
SSSCAT25       0.100437
SSEXTT25       0.100437
OCSCATAU       0.100164
OCSMASS        0.089674
SSSCATAU       0.083229
SSEXTTAU       0.083229
OCFLUXV        0.082289
BCSMASS        0.081202
SUFLUXU        0.066260
SO4SMASS       0.060553
BCFLUXU        0.047332
year           0.043398
DMSCMASS       0.034007
SUANGSTR       0.032301
SSFLUXU        0.029181
SSANGSTR       0.024237
month          0.016871
SO2SMASS       0.014225
DMSSMASS       0.012404
SO2CMASS       0.011333
BCANGSTR       0.009506
hour           0.006348
OCFLUXU        0.005

In [4]:
selected_columns = list(class_correlations.head(10).index) + ['year', 'month', 'day', 'hour']
df_final = df[selected_columns]
df_final.head()

Unnamed: 0,class,PM25_MERRA2,DUCMASS,TOTANGSTR,location,DUFLUXV,SSFLUXV,DUFLUXU,BCCMASS,SSSMASS25,year,month,day,hour
0,1,2.542502e-08,0.000155,0.528115,0,-0.000726,-3.1e-05,-0.000417,5.4106e-07,8.753887e-10,2021,10,1,0
1,1,2.691525e-08,0.000176,0.468545,20,-0.000949,-2.7e-05,-0.000443,5.326781e-07,7.853487e-10,2021,10,1,0
2,1,3.182292e-08,0.000221,0.378091,40,-0.001284,-2.2e-05,-0.000523,5.371019e-07,5.966285e-10,2021,10,1,0
3,0,5.008213e-08,0.000303,0.287515,60,-0.001858,-2.9e-05,-0.000738,5.599193e-07,6.477876e-10,2021,10,1,0
4,0,8.30815e-08,0.000361,0.248208,80,-0.002246,-4.3e-05,-0.000807,5.743548e-07,1.155058e-09,2021,10,1,0


In [5]:
df_final = df_final.sort_values(by=['location', 'year', 'month', 'day', 'hour'])
df_final.head()

Unnamed: 0,class,PM25_MERRA2,DUCMASS,TOTANGSTR,location,DUFLUXV,SSFLUXV,DUFLUXU,BCCMASS,SSSMASS25,year,month,day,hour
12854400,1,3.495467e-08,0.0003,0.304043,0,-0.001363,-2.4e-05,-0.000863,5.269066e-07,6.83258e-10,2021,6,3,0
12854420,1,3.391171e-08,0.000282,0.313958,0,-0.001303,-2.2e-05,-0.000621,5.202225e-07,6.930362e-10,2021,6,3,1
12854440,1,3.360293e-08,0.000261,0.319685,0,-0.00123,-2e-05,-0.000517,5.089607e-07,6.821222e-10,2021,6,3,2
12854460,1,3.42109e-08,0.000241,0.322932,0,-0.001146,-1.8e-05,-0.000529,4.947988e-07,6.714345e-10,2021,6,3,3
12854480,0,3.639861e-08,0.000223,0.339931,0,-0.001064,-1.6e-05,-0.000593,4.79808e-07,6.800748e-10,2021,6,3,4


In [6]:
df_final.tail()

Unnamed: 0,class,PM25_MERRA2,DUCMASS,TOTANGSTR,location,DUFLUXV,SSFLUXV,DUFLUXU,BCCMASS,SSSMASS25,year,month,day,hour
12854319,0,5.888581e-08,0.000331,0.475502,399,0.000891,-3.637964e-06,0.002411,8.029993e-07,1.875378e-09,2025,6,1,19
12854339,0,5.184527e-08,0.000332,0.479311,399,0.000935,1.06309e-06,0.002461,8.166112e-07,1.746685e-09,2025,6,1,20
12854359,0,4.524903e-08,0.000343,0.476115,399,0.000796,3.892928e-07,0.002668,8.312599e-07,1.683475e-09,2025,6,1,21
12854379,0,3.939474e-08,0.000352,0.470348,399,0.000502,-2.450863e-06,0.002859,8.375557e-07,1.673016e-09,2025,6,1,22
12854399,1,3.397934e-08,0.000359,0.463809,399,0.000199,-5.697249e-06,0.00302,8.368235e-07,1.668013e-09,2025,6,1,23


In [7]:
df_final.to_parquet('air_quality_all_preproccessed.parquet', index=False)