# Fraud Detection (Classification)

Goal: Predict whether a transaction is fraudulent (IsFraud) from its amount, hour, and location.

Dataset: transactions_fraud_data.csv  
Columns we care about for classification:
- Amount
- TransactionHour
- LocationID
- IsFraud (target)

Steps:
1. Load & explore
2. Preprocess (check missing, select features, train/test split)
3. Train classification models (Logistic Regression, Random Forest)
4. Evaluate the models
5. Interpret the results

# Imports

In [33]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Load and Explore the Dataset

In [34]:
# Read the raw transactions CSV into a DataFrame
# NOTE(review): absolute path, presumably a Colab upload location — adjust when running elsewhere
data_path = "/content/transactions_fraud_data.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [35]:
# Preview the first ten transactions to sanity-check columns and values
display(df.head(n=10))

Unnamed: 0,TransactionID,Amount,TransactionHour,LocationID,IsFraud
0,T0001,34,10,2,0
1,T0002,58,14,3,0
2,T0003,120,11,5,0
3,T0004,89,16,1,0
4,T0005,45,9,4,0
5,T0006,210,18,2,0
6,T0007,850,2,9,1
7,T0008,76,13,3,0
8,T0009,59,15,6,0
9,T0010,140,12,2,0


In [36]:
# Summary statistics (count, mean, std, min, quartiles, max) for the two numeric columns
display(df.loc[:, ["Amount", "TransactionHour"]].describe())

Unnamed: 0,Amount,TransactionHour
count,200.0,200.0
mean,169.595,13.175
std,181.328157,4.682307
min,33.0,0.0
25%,66.0,10.0
50%,113.0,13.0
75%,190.0,17.0
max,980.0,23.0


In [37]:
# How many transactions fall in each IsFraud class
fraud_counts = df["IsFraud"].value_counts()
display(fraud_counts)

Unnamed: 0_level_0,count
IsFraud,Unnamed: 1_level_1
0,182
1,18


In [38]:
# One-row overview: fraud rate (in percent) plus the mean of Amount and TransactionHour
summary = pd.DataFrame([
    {
        "Fraud rate (%)": df["IsFraud"].mean() * 100,
        "Average amount": df["Amount"].mean(),
        "Average transaction hour": df["TransactionHour"].mean(),
    }
])

display(summary)

Unnamed: 0,Fraud rate (%),Average amount,Average transaction hour
0,9.0,169.595,13.175


# Preprocess the Data

In [39]:
# Count NaN entries per column and show them as a one-column table
missing_values = df.isna().sum().rename("Missing count").to_frame()
display(missing_values)

Unnamed: 0,Missing count
TransactionID,0
Amount,0
TransactionHour,0
LocationID,0
IsFraud,0


In [40]:
# Column roles: numeric columns get scaled; LocationID holds numeric codes but is
# treated as a category and one-hot encoded
numeric_features = ["Amount", "TransactionHour"]
categorical_features = ["LocationID"]

# Model inputs and the fraud label
X = df[numeric_features + categorical_features]
y = df["IsFraud"]

# Unfitted preprocessing step: standardize the numeric columns, one-hot encode the location
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

# Hold out 25% of the rows; stratifying on y keeps the fraud proportion similar in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

# Row counts of the two partitions
split_summary = pd.DataFrame({"Train size": [len(X_train)], "Test size": [len(X_test)]})
display(split_summary)

Unnamed: 0,Train size,Test size
0,150,50


# Train Classification Models


In [41]:
# Each pipeline gets its own unfitted copy of the preprocessor. Pipeline does not clone
# its steps when fitting, so handing the same ColumnTransformer object to both pipelines
# would make them share one fitted instance: fitting the second pipeline would silently
# refit the preprocessing used by the first.

# Logistic Regression pipeline
logreg_pipeline = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("model", LogisticRegression(max_iter=500, random_state=42))
])

logreg_pipeline.fit(X_train, y_train)

# Random Forest pipeline (trees don't need scaling, but we reuse the same preprocessing
# definition for consistency)
rf_pipeline = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("model", RandomForestClassifier(
        n_estimators=120,
        random_state=42
    ))
])

rf_pipeline.fit(X_train, y_train)

# Evaluate the Models

In [42]:
def evaluate(name, model, Xtr, Xte, ytr, yte):
    """Display accuracy and the test-set confusion matrix for a fitted binary classifier.

    Accuracy alone can look good on imbalanced labels, so read it together with
    the confusion matrix.

    Parameters
    ----------
    name : str
        Label shown in the "Model" column of the accuracy table.
    model : fitted estimator
        Any object exposing ``predict`` (here: a fitted pipeline).
    Xtr, Xte
        Feature tables of the training and test partitions.
    ytr, yte
        True 0/1 labels matching ``Xtr`` and ``Xte``.

    Returns
    -------
    None
        Both tables are rendered with ``display``; nothing is returned.
    """
    y_pred_train = model.predict(Xtr)
    y_pred_test = model.predict(Xte)

    acc_train = accuracy_score(ytr, y_pred_train)
    acc_test = accuracy_score(yte, y_pred_test)

    # Accuracy summary as DataFrame
    acc_summary = pd.DataFrame({
        "Model": [name],
        "Train Accuracy": [acc_train],
        "Test Accuracy": [acc_test]
    })
    display(acc_summary)

    # Confusion matrix as DataFrame.
    # Pin the label order to [0, 1]: without it confusion_matrix only includes labels that
    # occur in yte / y_pred_test, so a test split containing a single class would yield a
    # 1x1 matrix and the fixed 2x2 row/column labels below would raise.
    cm = confusion_matrix(yte, y_pred_test, labels=[0, 1])
    cm_df = pd.DataFrame(
        cm,
        index=["True: 0", "True: 1"],
        columns=["Pred: 0", "Pred: 1"]
    )
    display(cm_df)


# Report train/test accuracy and the test confusion matrix for each fitted pipeline
for model_name, fitted_pipeline in [
    ("Logistic Regression", logreg_pipeline),
    ("Random Forest", rf_pipeline),
]:
    evaluate(model_name, fitted_pipeline, X_train, X_test, y_train, y_test)

# Names of the transformed columns (used later for coefficients / feature importances):
# the numeric columns come first, followed by the one-hot columns of the fitted encoder
ohe = logreg_pipeline.named_steps["preprocess"].named_transformers_["cat"]
location_feature_names = ohe.get_feature_names_out(["LocationID"])
all_feature_names = [*numeric_features, *location_feature_names]


Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Logistic Regression,0.986667,0.98


Unnamed: 0,Pred: 0,Pred: 1
True: 0,46,0
True: 1,1,3


Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Random Forest,1.0,1.0


Unnamed: 0,Pred: 0,Pred: 1
True: 0,46,0
True: 1,0,4


# Interpret the Results

In [43]:
# Pull the fitted estimators out of their pipelines
logreg_model = logreg_pipeline.named_steps["model"]
rf_model = rf_pipeline.named_steps["model"]

# Logistic regression: first row of coef_, one weight per transformed feature
logreg_coef = logreg_model.coef_[0]

# Random forest: one importance score per transformed feature
rf_feature_importances = rf_model.feature_importances_

# Pair each value with its feature name and rank from largest to smallest
coef_df = pd.DataFrame(
    {"feature": all_feature_names, "logreg_coefficient": logreg_coef}
).sort_values("logreg_coefficient", ascending=False)

rf_importances_df = pd.DataFrame(
    {"feature": all_feature_names, "rf_importance": rf_feature_importances}
).sort_values("rf_importance", ascending=False)

# Show both tables, coefficients first
display(coef_df, rf_importances_df)

Unnamed: 0,feature,logreg_coefficient
0,Amount,1.984997
9,LocationID_9,0.891873
10,LocationID_10,0.582396
8,LocationID_8,0.155053
7,LocationID_6,-0.169214
2,LocationID_1,-0.246935
6,LocationID_5,-0.267365
5,LocationID_4,-0.297755
4,LocationID_3,-0.305754
3,LocationID_2,-0.34112


Unnamed: 0,feature,rf_importance
0,Amount,0.365117
1,TransactionHour,0.32364
9,LocationID_9,0.145171
8,LocationID_8,0.080187
10,LocationID_10,0.077779
4,LocationID_3,0.002663
3,LocationID_2,0.002651
2,LocationID_1,0.001366
5,LocationID_4,0.000895
6,LocationID_5,0.000516


# Interpretation