In [1]:
# 1. Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# 2. Read the dataset into a DataFrame
# `url` is handed straight to pd.read_csv; change it to wherever your copy of the data lives.
# NOTE(review): this value has no file extension and sits under /content/drive — presumably a
# Colab Drive mount is required first; confirm the exact file name before running.
url = "/content/drive/MyDrive/ml-practical/dataset"
data = pd.read_csv(filepath_or_buffer=url)

# Preview the top of the table as a quick check that the load worked
print("First 5 rows of dataset:")
display(data.head(n=5))

# 3. Pick the two columns the clustering is based on
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
X = data[feature_columns]

# 4. Standardize the selected features before handing them to DBSCAN
# (the scaler is fitted on X and then applied to that same X)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# 5. Cluster the standardized features with DBSCAN
# eps is the neighborhood radius; min_samples is the minimum number of points needed to form a cluster
dbscan = DBSCAN(min_samples=5, eps=0.3).fit(X_scaled)
clusters = dbscan.labels_

# Store each row's cluster label on the dataset as a 'Cluster' column
data['Cluster'] = clusters

# 6. Report how many rows received each cluster label
print("\nCluster Label Distribution:", data['Cluster'].value_counts(), sep="\n")

# The label -1 marks noise/outliers, so it is left out when counting clusters
n_clusters = len({label for label in clusters if label != -1})
print(f"\nNumber of clusters found (excluding noise): {n_clusters}")

# 7. Visualize the clusters
# Cluster labels are nominal identifiers (-1 = noise), not quantities. Passed as a
# numeric hue, seaborn maps them onto a continuous colour scale and may shorten the
# legend to a sample of evenly spaced values instead of the real labels. Casting to a
# categorical (on a plotting copy only; `data` itself is not modified) and requesting
# a full legend gives every label its own colour and its own legend entry.
plot_data = data.assign(Cluster=data['Cluster'].astype('category'))

fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    hue='Cluster',
    palette='viridis',
    data=plot_data,
    s=80,
    edgecolor='k',
    legend='full',
    ax=ax
)
ax.set_title("DBSCAN Clustering - Mall Customers (Income vs Spending Score)")
ax.set_xlabel("Annual Income (k$)")
ax.set_ylabel("Spending Score (1–100)")
ax.legend(title="Cluster")
plt.show()

# 8. Score the clustering with the silhouette coefficient, leaving noise points out
mask = data['Cluster'] != -1
labels_without_noise = data.loc[mask, 'Cluster']
if labels_without_noise.nunique() <= 1:
    # The score is only computed when at least two distinct clusters remain
    print("\nNot enough clusters for silhouette evaluation.")
else:
    sil_score = silhouette_score(X_scaled[mask], labels_without_noise)
    print(f"\nSilhouette Score (excluding noise): {sil_score:.3f}")

# 9. Plot the noise points apart from the clustered points
# Split the rows once: label -1 is noise, everything else belongs to a cluster
is_noise = data['Cluster'] == -1
noise_points = data[is_noise]
clustered_points = data[~is_noise]

plt.figure(figsize=(8, 6))

# Noise: larger red markers, the only series that carries a legend label
sns.scatterplot(
    data=noise_points,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    color='red',
    s=100,
    edgecolor='k',
    label='Noise/Outliers',
)

# Clustered rows: smaller markers coloured by cluster label on the viridis colormap
plt.scatter(
    clustered_points['Annual Income (k$)'],
    clustered_points['Spending Score (1-100)'],
    c=clustered_points['Cluster'],
    cmap='viridis',
    s=60,
    edgecolor='k',
)

plt.title("Outlier Detection using DBSCAN")
plt.xlabel("Annual Income (k$)")
plt.ylabel("Spending Score (1–100)")
plt.legend()
plt.show()

# 10. Cluster insights: mean of each feature per cluster label, rounded to one decimal
# (rows are grouped by every value in 'Cluster', so the noise label -1 gets its own row if present)
summary_columns = ['Annual Income (k$)', 'Spending Score (1-100)']
per_cluster_means = data.groupby('Cluster')[summary_columns].mean()
cluster_summary = per_cluster_means.round(1)
print("\nCluster Summary (Mean values):")
display(cluster_summary)


HTTPError: HTTP Error 404: Not Found