In [2]:
# ===============================
# IMPORT LIBRARIES
# ===============================
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib

# ===============================
# CONFIGURATION
# ===============================
# The dataset path is relative to the notebook's working directory, so the
# notebook runs on any machine that keeps the CSV next to it.
# NOTE(review): place the CSV beside the notebook, or point DATA_PATH at
# wherever the file lives in your checkout.
DATA_PATH = "Mall_Customers (3).csv"
N_CLUSTERS = 5       # number of customer segments requested from KMeans
RANDOM_STATE = 42    # shared seed for KMeans, the train/test split and the tree

# ===============================
# LOAD DATASET (RELATIVE PATH ✅)
# ===============================
raw_df = pd.read_csv(DATA_PATH)

# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
raw_df.info()
print(raw_df.head())

# ===============================
# DROP UNWANTED COLUMNS
# ===============================
# Non-mutating drop: raw_df stays intact for inspection, while `df` holds only
# the columns that are fed to the scaler and the clustering step.
df = raw_df.drop(columns=["Gender", "CustomerID"])

# ===============================
# SCALING
# ===============================
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df)

# ===============================
# KMEANS CLUSTERING
# ===============================
# n_init is pinned explicitly because its default differs between
# scikit-learn releases; pinning keeps the clustering comparable across
# environments.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(X_scaled)

# Save clustered (UNSCALED) data for visualization
clustered_df = df.assign(Cluster=clusters)
clustered_df.to_csv("clustered_mall_customers.csv", index=False)

# ===============================
# SUPERVISED LEARNING
# ===============================
# The decision tree is trained to reproduce the KMeans cluster labels from the
# scaled features, so new rows must go through `scaler` before prediction.
X = X_scaled
y = clusters

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

dt_model = DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)
dt_model.fit(X_train, y_train)

# ===============================
# EVALUATION
# ===============================
# Accuracy here measures agreement with the KMeans labels on the held-out
# rows, not agreement with any external ground truth.
dt_pred = dt_model.predict(X_test)
dt_acc = accuracy_score(y_test, dt_pred)
print("Decision Tree Accuracy:", dt_acc)

# ===============================
# SAVE MODEL & SCALER
# ===============================
joblib.dump(dt_model, "dt_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("✅ Model, scaler, and clustered data saved successfully")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 5 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   CustomerID              200 non-null    int64 
 1   Gender                  200 non-null    object
 2   Age                     200 non-null    int64 
 3   Annual Income (k$)      200 non-null    int64 
 4   Spending Score (1-100)  200 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 7.9+ KB
None
   CustomerID  Gender  Age  Annual Income (k$)  Spending Score (1-100)
0           1    Male   19                  15                      39
1           2    Male   21                  15                      81
2           3  Female   20                  16                       6
3           4  Female   23                  16                      77
4           5  Female   31                  17                      40




Decision Tree Accuracy: 0.96
✅ Model, scaler, and clustered data saved successfully
