In [20]:
import pandas as pd
from sklearn.cluster import DBSCAN, KMeans
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('~/Desktop/AirQualityUCI.csv', delimiter=';')

# Preprocess the dataset
df = df.replace(',', '.', regex=True)  # Replace commas with periods

# Select the features for clustering
features = ['CO(GT)', 'PT08.S1(CO)', 'NMHC(GT)', 'C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)', 'PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']

df[features] = df[features].apply(pd.to_numeric)

# Handle NaN values
imputer = SimpleImputer(strategy='mean')  # Use mean imputation
df[features] = imputer.fit_transform(df[features])

df.head()

Unnamed: 0,Date,Time,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
0,10/03/2004,18.00.00,2.6,1360.0,150.0,11.9,1046.0,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,,
1,10/03/2004,19.00.00,2.0,1292.0,112.0,9.4,955.0,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,,
2,10/03/2004,20.00.00,2.2,1402.0,88.0,9.0,939.0,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,,
3,10/03/2004,21.00.00,2.2,1376.0,80.0,9.2,948.0,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,,
4,10/03/2004,22.00.00,1.6,1272.0,51.0,6.5,836.0,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,,


In [21]:
df.describe()

Unnamed: 0,CO(GT),PT08.S1(CO),NMHC(GT),C6H6(GT),PT08.S2(NMHC),NOx(GT),PT08.S3(NOx),NO2(GT),PT08.S4(NO2),PT08.S5(O3),T,RH,AH,Unnamed: 15,Unnamed: 16
count,9471.0,9471.0,9471.0,9471.0,9471.0,9471.0,9471.0,9471.0,9471.0,9471.0,9471.0,9471.0,9471.0,0.0,0.0
mean,-34.207524,1048.990061,-159.090093,1.865683,894.595276,168.616971,794.990168,58.148873,1391.479641,975.072032,9.778305,39.48538,-6.837604,,
std,77.188336,327.841433,138.945154,41.130385,340.266507,255.879678,320.049602,126.174086,464.389469,454.179543,42.942793,50.906941,38.741359,,
min,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,-200.0,,
25%,0.6,923.0,-200.0,3.9,713.0,51.0,639.0,54.0,1189.0,703.0,10.7,34.25,0.6768,,
50%,1.5,1050.0,-200.0,7.8,894.595276,144.0,794.990168,95.0,1440.0,949.0,17.0,48.1,0.9711,,
75%,2.6,1218.0,-200.0,13.5,1102.0,281.5,957.0,132.0,1658.0,1250.0,24.0,61.7,1.2915,,
max,11.9,2040.0,1189.0,63.7,2214.0,1479.0,2683.0,340.0,2775.0,2523.0,44.6,88.7,2.231,,


In [22]:
#Prepare the data
data = df[features]

# Apply DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(data)

# Apply K-means clustering
kmeans = KMeans(n_clusters=3)
kmeans_labels = kmeans.fit_predict(data)

# Compare the results
print("DBSCAN Clusters:", len(set(dbscan_labels)))
print("K-means Clusters:", len(set(kmeans_labels)))

DBSCAN Clusters: 4
K-means Clusters: 3
