In [None]:
#Reading the dataset
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the mall-customer CSV into `ds` and list its column names.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
csv_path = r"C:\Users\Zunnurain.Badar\Mall_Customers.csv"
ds = pd.read_csv(csv_path)
print(ds.columns)

In [None]:
# Outlier detection: one box plot per numeric feature, drawn on a shared axis.
boxplot_cols = ['Age', 'Annual Income (k$)', 'Spending Score (1-100)']
plt.figure(figsize=(8, 4))
sns.boxplot(data=ds[boxplot_cols])
plt.show()

In [None]:
# Visual skewness check: a 20-bin histogram for every column that
# DataFrame.hist chooses to plot.
ds.hist(bins=20, figsize=(8, 4))
plt.show()

In [None]:
# Numeric skewness check for the three features used in clustering.
skew_cols = ['Annual Income (k$)', 'Age', 'Spending Score (1-100)']
skewness = ds[skew_cols].skew()
print(skewness)


In [None]:
# Select the three numeric columns used for clustering.
# .copy() makes `ds_selected` an independent frame rather than a slice of `ds`,
# so adding a column to it later does not raise SettingWithCopyWarning and
# never touches the raw data.
ds_selected = ds[['Annual Income (k$)','Age','Spending Score (1-100)']].copy()
ds_selected

In [None]:
# Standardise the selected features before K-Means and keep the result as a
# DataFrame carrying the same column names as `ds_selected`.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_ds = pd.DataFrame(
    scaler.fit_transform(ds_selected),
    columns=ds_selected.columns,
)
print(scaled_ds.head())

In [None]:
# Summary statistics of the scaled features. `describe` has to be called:
# printing the bare attribute only shows the bound-method repr, not the stats.
print(scaled_ds.describe())

In [None]:
# Elbow method: fit K-Means for K = 1..10 on the scaled data, record the
# inertia of each fit, and plot inertia against K.
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

k_range = range(1, 11)
inertia_values = []

for k in k_range:
    # fit() returns the estimator itself, so construction and fitting chain.
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(scaled_ds)
    inertia_values.append(kmeans.inertia_)

plt.plot(k_range, inertia_values, marker='o')
plt.title("Elbow Method")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.show()

In [None]:
# Train the final K-Means model (K = 4) on the scaled features and attach the
# resulting label of every row to `ds_selected` as a "clusters" column.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters = 4 , random_state = 42, n_init = 10)
kmeans.fit(scaled_ds)
cluster_labels = kmeans.labels_
# assign() builds a new frame instead of writing into one that was derived
# from `ds`, which avoids SettingWithCopyWarning. Rebinding the same name keeps
# the "clusters" column available to the cells that follow, and re-running this
# cell simply overwrites the column.
ds_selected = ds_selected.assign(clusters=cluster_labels)
print(ds_selected.head(200))


In [None]:
# Data visualization: 3D scatter of income / age / spending score, with each
# point coloured by its cluster label.
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

income = ds_selected["Annual Income (k$)"]
age = ds_selected["Age"]
spending = ds_selected["Spending Score (1-100)"]

fig = plt.figure(figsize=(15, 15))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    income,
    age,
    spending,
    c=ds_selected["clusters"],
    cmap='rainbow',
    s=60,
)
ax.set_title("customer segmentation")
ax.set_xlabel("Annual Income (k$)")
ax.set_ylabel("Age")
ax.set_zlabel("Spending Score (1-100)")
plt.show()



In [None]:
# Per-cluster mean of every column; the markdown insights below quote these.
clusters_grouped = ds_selected.groupby("clusters")
clusters_grouped.mean()

In [None]:
# Share of customers in each cluster, expressed as a percentage.
cluster_share = ds_selected['clusters'].value_counts(normalize=True)
cluster_share * 100

## 🔍 Insights from Customer Segmentation

### 📌 Cluster 0: Older Customers, Low Spending
- **Average Income:** 47.7k  
- **Average Age:** 53.9 years  
- **Spending Score:** 39.9  
-  Budget deals and discounts should be offered to them

### 📌 Cluster 1: Young, High Spending Customers
- **Average Income:** 86.1k  
- **Average Age:** 32.8 years  
- **Spending Score:** 81.5  
-  Expensive and premium products should be offered to them

### 📌 Cluster 2: Middle-Class, Balanced Spending
- **Average Income:** 40k  
- **Average Age:** 25.4 years  
- **Spending Score:** 60.2  
-  Exclusive deals should be offered to them

### 📌 Cluster 3: High Income, Low Spending
- **Average Income:** 86.5k  
- **Average Age:** 39.3 years  
- **Spending Score:** 19.5  
-  Luxurious and premium products should be offered to them


In [None]:
pip install streamlit


In [None]:
 %%writefile app.py
import streamlit as st
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

st.title("Customer Segmentation using K-Means Clustering")

# Upload CSV File
uploaded_file = st.file_uploader("Upload your dataset (CSV format)", type=["csv"])

if uploaded_file is not None:
    df = pd.read_csv(uploaded_file)
    st.write("### Preview of Uploaded Dataset:")
    st.write(df.head())

    # Select Features
    st.write("### Select Features for Clustering:")
    selected_features = st.multiselect("Choose columns", df.columns)

    if len(selected_features) >= 2:
        df_selected = df[selected_features].dropna()

        # Select Number of Clusters
        k = st.slider("Select number of clusters (K)", min_value=2, max_value=10, value=4)
        kmeans = KMeans(n_clusters=k, random_state=42)
        df_selected["Cluster"] = kmeans.fit_predict(df_selected)

        # Cluster Means
        cluster_means = df_selected.groupby("Cluster").mean()

        # Assigning Labels
        cluster_names = {}
        for i, row in cluster_means.iterrows():
            if row["Spending Score (1-100)"] > 60 and row["Age"] < 35:
                cluster_names[i] = "Young High Spenders"
            elif row["Spending Score (1-100)"] < 40 and row["Annual Income (k$)"] > 60:
                cluster_names[i] = "Rich Low Spenders"
            elif row["Annual Income (k$)"] < 40:
                cluster_names[i] = "Low Income Group"
            else:
                cluster_names[i] = "Average Spenders"

        df_selected["Cluster Name"] = df_selected["Cluster"].map(cluster_names)

        # Show Updated Data
        st.write("### Clustered Data with Labels:")
        st.write(df_selected)

        # Show Cluster Summary
        st.write("### Cluster Insights:")
        for i, name in cluster_names.items():
            st.write(f"**Cluster {i}: {name}** - {len(df_selected[df_selected['Cluster'] == i])} Customers")

        # Visualization
        if len(selected_features) == 2:
            st.write("### Cluster Visualization:")
            fig, ax = plt.subplots()
            scatter = ax.scatter(df_selected.iloc[:, 0], df_selected.iloc[:, 1], c=df_selected["Cluster"], cmap="viridis")
            ax.set_xlabel(selected_features[0])
            ax.set_ylabel(selected_features[1])
            plt.colorbar(scatter)
            st.pyplot(fig)

    else:
        st.warning("Please select at least 2 features for clustering.")


In [None]:
%system streamlit run app.py
