In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# --- Configuration: everything a reader might want to tune ---
# NOTE(review): the recorded run of this cell failed with FileNotFoundError for
# this path. Confirm the file name and its location relative to the notebook's
# working directory (the public copy of this dataset is often distributed as
# 'Mall_Customers.csv' -- verify before changing).
DATA_PATH = 'mall_customer.csv'
FEATURE_COLS = ['Annual Income (k$)', 'Spending Score (1-100)']
IQR_MULTIPLIER = 1.5   # Tukey fence width used for outlier removal
N_CLUSTERS = 5
SEED = 42

# Load dataset
df = pd.read_csv(DATA_PATH)

# Select the two features used for clustering (copy so `df` is never mutated)
features = df[FEATURE_COLS].copy()

# Standardize to zero mean / unit variance so both features contribute equally
# to the Euclidean distances KMeans minimizes.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)
# Preserve the original index so filtered rows can still be traced back to `df`.
scaled_df = pd.DataFrame(scaled_features, columns=features.columns, index=features.index)

# Outlier removal using IQR (computed per column on the standardized values)
Q1 = scaled_df.quantile(0.25)
Q3 = scaled_df.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - IQR_MULTIPLIER * IQR
upper_fence = Q3 + IQR_MULTIPLIER * IQR

# A row is an outlier if ANY feature falls outside its fences.
is_outlier = ((scaled_df < lower_fence) | (scaled_df > upper_fence)).any(axis=1)

# Explicit .copy(): a column is added below, and assigning into the result of
# boolean indexing without a copy triggers pandas' SettingWithCopyWarning.
filtered_df = scaled_df.loc[~is_outlier].copy()

# Apply KMeans. n_init is pinned because its default changed across
# scikit-learn releases; leaving it implicit makes results version-dependent.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=SEED)
clusters = kmeans.fit_predict(filtered_df[FEATURE_COLS])

# Add cluster info for plotting
filtered_df['Cluster'] = clusters

# Plot. The values shown are z-scores, so the axis labels say so explicitly
# instead of inheriting the raw-unit column names.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(
    data=filtered_df,
    x=FEATURE_COLS[0],
    y=FEATURE_COLS[1],
    hue='Cluster',
    palette='Set2',
    ax=ax,
)
ax.set_title("KMeans Clustering after Standardization & Outlier Removal")
ax.set_xlabel('Annual Income (standardized)')
ax.set_ylabel('Spending Score (standardized)')
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'mall_customer.csv'