In [5]:
import dask.dataframe as dd
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from joblib import Parallel, delayed
import warnings

In [6]:
# Silence every warning category for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): this also hides deprecation warnings raised by later cells - consider narrowing it.
warnings.filterwarnings(action='ignore')

# Read the air-quality CSV through Dask rather than pandas (downstream cells call .compute()).
data = dd.read_csv(urlpath='air_quality_data.csv')

In [10]:
# Data Overview
# info() writes its summary straight to stdout and returns None, so wrapping it in
# print() emitted a stray "None" after the dtype line. Call it directly instead.
data.info()
# The info() summary does not end with a newline; terminate the line so the
# describe() table below starts on a fresh line.
print()
# Summary statistics (count/mean/std/quartiles) for every column, materialized from Dask.
print(data.describe().compute())
# First rows of the dataset as a quick sanity check of the parsed values.
print(data.head())

<class 'dask.dataframe.core.DataFrame'>
Columns: 6 entries, PM2.5 to O3
dtypes: float64(6)None
            PM2.5        PM10         NO2         SO2          CO          O3
count  100.000000  100.000000  100.000000  100.000000  100.000000  100.000000
mean    48.442302   80.446092   30.648963   10.534201    0.983199   24.077555
std     13.622526   19.073379   10.842829    4.420507    0.319119    7.388300
min     10.703823   41.624576   -2.412673   -0.619479    0.309424    5.226844
25%     40.986415   63.886790   23.445565    7.164903    0.732495   19.763736
50%     48.095656   81.682143   30.976957   10.250787    0.977230   24.112972
75%     56.089281   90.763409   37.044374   13.420004    1.195435   29.065111
max     77.784173  134.403383   68.527315   20.949015    1.923664   43.165543
       PM2.5       PM10        NO2        SO2        CO         O3
0  57.450712  51.692585  33.577874   5.855025  0.521672  32.409420
1  47.926035  71.587094  35.607845   7.199095  0.820187  40.275333
2 

In [13]:
# Handle missing values efficiently

# Forward fill: each missing value is replaced by the previous valid value in its column.
# ffill() is the dedicated method; the fillna(method='ffill') spelling is deprecated in
# recent pandas releases. Missing values at the very start of a column have no
# predecessor and therefore stay missing.
data = data.ffill()
# Alternative strategies (use one of these instead of the forward fill above):
#   drop rows that contain missing values:  data = data.dropna()
#   fill with the per-column mean:          data = data.fillna(data.mean())

In [16]:
# Exploratory Data Analysis (EDA) using Plotly for interactivity
# Only the plotted column is materialized from Dask; computing the whole frame just to
# draw a single-column histogram would pull every column into memory for no benefit.
pm25_frame = data[['PM2.5']].compute()
fig = px.histogram(pm25_frame, x='PM2.5', nbins=50, title='Distribution of PM2.5')
fig.show()

In [19]:
# Correlation Heatmap
# Pairwise column correlations are evaluated by Dask; the resulting matrix
# (one row and one column per dataset column) is materialized for plotting.
corr_matrix = data.corr().compute()

# text_auto=True prints each coefficient inside its heatmap cell.
fig = px.imshow(
    corr_matrix,
    title='Correlation Heatmap',
    text_auto=True,
)
fig.show()

In [20]:
# Data Preprocessing
# Standardize the pollutant columns so that each contributes on a comparable scale
# to the PCA and clustering steps that follow.
scaler = StandardScaler()
features = [
    'PM2.5',
    'PM10',
    'NO2',
    'SO2',
    'CO',
    'O3',
]
# The selected columns are materialized from Dask before being handed to scikit-learn.
scaled_data = scaler.fit_transform(
    data[features].compute()
)

In [21]:
# Dimensionality Reduction with PCA
# Project the standardized features onto the first two principal components,
# which gives a 2-D representation used for clustering and plotting below.
pca = PCA(
    n_components=2,
)
pca_data = pca.fit_transform(X=scaled_data)

In [22]:
# Clustering with MiniBatchKMeans for large datasets
# Three clusters are fitted on the 2-D PCA projection in mini-batches of 100 rows;
# the fixed random_state keeps the cluster assignment reproducible across runs.
kmeans = MiniBatchKMeans(
    n_clusters=3,
    batch_size=100,
    random_state=42,
)
# fit_predict() fits the model and returns one integer cluster label per row.
clusters = kmeans.fit_predict(X=pca_data)

In [23]:
# Visualization of Clusters using Plotly
# Cluster ids are categorical, but Plotly Express maps numeric colour values onto a
# continuous colour scale. Casting the labels to strings yields one discrete colour and
# one legend entry per cluster instead of a gradient colour bar.
cluster_labels = clusters.astype(str)
fig = px.scatter(x=pca_data[:, 0], y=pca_data[:, 1], color=cluster_labels,
                 labels={'x': 'Principal Component 1', 'y': 'Principal Component 2',
                         'color': 'Cluster'},
                 title='Air Quality Clusters')
fig.show()

In [24]:
# Explained variance of the PCA projection
# explained_variance_ratio_ holds, for each retained principal component, the fraction
# of total variance it captures. It describes components, not the original features.
explained_variance = pca.explained_variance_ratio_
print(
    f'Explained variance by components: {explained_variance}'
)


Explained variance by components: [0.25068495 0.20897311]
