# Expense Categorization using MiniLM Embeddings, SVM, XGBoost & Ensemble

## 1. Project Overview

This notebook implements an **expense categorization system** that predicts categories such as `Food`, `Transport`, `Shopping`, etc. from raw transaction descriptions (e.g., `"Uber ride to office"`, `"Starbucks latte"`).

The pipeline uses:

- **MiniLM Sentence Embeddings** (`all-MiniLM-L6-v2`) from `sentence-transformers`
- **SVM (RBF kernel)** with hyperparameter tuning
- **XGBoost** classifier
- **Ensemble model** that averages the SVM and XGBoost class probabilities (soft voting)
- **Model evaluation** (accuracy, classification report, confusion matrix)
- **Error / mistake analysis** to inspect misclassified examples

The goal is to build a **practical, accurate, and explainable** classifier suitable for a personal finance assistant or agentic AI system.


In [ ]:
# 2. Install dependencies (run once per environment)
!pip install -q sentence-transformers xgboost

## 3. Imports & Configuration

In [ ]:
import pandas as pd
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

import matplotlib.pyplot as plt

# Configuration
# DATA_PATH: CSV file read with pd.read_csv in the loading cell; it must
# provide the "Description" and "Category" columns used by the pipeline.
DATA_PATH = "expense_data.csv"  # Update path if needed
# RANDOM_STATE: seed passed to train_test_split, SVC and XGBClassifier so
# that repeated runs use the same random seed.
RANDOM_STATE = 42

## 4. Load & Clean Dataset

In [ ]:
# Load the raw transactions from the CSV configured in DATA_PATH
df = pd.read_csv(DATA_PATH)

# Basic schema check. Raise explicitly rather than using `assert`: assertions
# are stripped when Python runs with -O, which would let a malformed CSV
# through and fail later with a far less helpful KeyError.
required_columns = {"Description", "Category"}
if not required_columns.issubset(df.columns):
    raise ValueError("CSV must contain 'Description' and 'Category' columns.")

# Drop rows that are missing either the text or its label
df = df.dropna(subset=["Description", "Category"]).reset_index(drop=True)

print("Original category distribution:")
print(df["Category"].value_counts())

# Remove extremely rare categories (< 2 samples). The later train/test split
# is stratified on the label, and a class with a single sample cannot be
# stratified.
counts = df["Category"].value_counts()
rare = counts[counts < 2].index
print("\nDropping rare categories:", list(rare))

df = df[~df["Category"].isin(rare)].reset_index(drop=True)

print("\nUpdated category distribution:")
print(df["Category"].value_counts())

## 5. Label Encoding & Train/Test Split

In [ ]:
# Map every category string to an integer id and store it on the frame
le = LabelEncoder()
df["label"] = le.fit_transform(df["Category"])

# Model inputs: descriptions as plain Python strings, targets as an id array
X_text = df["Description"].astype(str).to_list()
y = df["label"].to_numpy()

# Hold out 20% of the rows for testing; stratifying on y keeps the label
# distribution comparable between the two partitions.
split_options = {
    "test_size": 0.2,
    "random_state": RANDOM_STATE,
    "stratify": y,
}
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, **split_options
)

print(f"Train size: {len(X_train_text)}, Test size: {len(X_test_text)}")
print("Classes:", list(le.classes_))

# Only the label ids that occur in the test partition, plus their names,
# so later reports do not list classes with no test rows.
test_labels = np.unique(y_test)
test_label_names = le.inverse_transform(test_labels)

## 6. MiniLM Embeddings

In [ ]:
# Instantiate the MiniLM sentence encoder
print("Loading MiniLM encoder...")
encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Both partitions are encoded with identical settings, so define them once
encode_options = dict(
    batch_size=32,
    convert_to_numpy=True,
    show_progress_bar=True,
)

print("\nEncoding training descriptions...")
X_train_emb = encoder.encode(X_train_text, **encode_options)

print("\nEncoding test descriptions...")
X_test_emb = encoder.encode(X_test_text, **encode_options)

print("\nTrain embedding shape:", X_train_emb.shape)
print("Test embedding shape:", X_test_emb.shape)

## 7. Feature Scaling for SVM

In [ ]:
# Standardize the embeddings for the SVM, which is sensitive to feature scale.
# The scaler statistics come from the training embeddings only and are then
# applied unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train_emb)
X_train_scaled = scaler.transform(X_train_emb)
X_test_scaled = scaler.transform(X_test_emb)

## 8. Train SVM with Hyperparameter Tuning (GridSearchCV)

In [ ]:
# Candidate hyperparameters explored by the grid search
param_grid = {"C": [0.5, 1, 3, 5], "gamma": ["scale", 0.1, 0.01]}

# probability=True is required so the SVM exposes predict_proba for the ensemble
svm = SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE)

print("Running GridSearchCV for SVM...")

# 3-fold cross-validated search scored by accuracy, using all available cores
grid_svm = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1,
)
grid_svm.fit(X_train_scaled, y_train)

# Keep the best estimator found by the search
best_svm = grid_svm.best_estimator_
print("\nBest SVM parameters:", grid_svm.best_params_)

## 9. Train XGBoost Classifier

In [ ]:
print("Training XGBoost classifier...")

# Fixed (untuned) boosting configuration
xgb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "eval_metric": "mlogloss",
    "random_state": RANDOM_STATE,
}
xgb = XGBClassifier(**xgb_params)

# XGBoost is fit on the unscaled embeddings
xgb.fit(X_train_emb, y_train)

## 10. Model Evaluation: SVM, XGBoost, and Ensemble

In [ ]:
def _show_performance(banner, y_pred):
    """Print the banner, accuracy and per-class report for one model.

    Returns the accuracy of ``y_pred`` against ``y_test``.
    """
    accuracy = accuracy_score(y_test, y_pred)
    print(banner)
    print(f"Accuracy: {accuracy:.4f}")
    print(classification_report(
        y_test,
        y_pred,
        labels=test_labels,
        target_names=test_label_names,
    ))
    return accuracy


# ---- SVM: scored on the scaled test embeddings ----
y_pred_svm = best_svm.predict(X_test_scaled)
acc_svm = _show_performance(
    "\n=================== SVM PERFORMANCE ===================",
    y_pred_svm,
)

# ---- XGBoost: scored on the unscaled test embeddings ----
y_pred_xgb = xgb.predict(X_test_emb)
acc_xgb = _show_performance(
    "\n=================== XGBOOST PERFORMANCE ===================",
    y_pred_xgb,
)

# ---- Ensemble: unweighted mean of the two probability matrices ----
proba_svm = best_svm.predict_proba(X_test_scaled)
proba_xgb = xgb.predict_proba(X_test_emb)
proba_ensemble = (proba_svm + proba_xgb) / 2.0

# The winning column index is used directly as the predicted label id.
# NOTE(review): assumes both models order their probability columns by
# label id 0..n_classes-1 - confirm against each model's classes_.
y_pred_ens = proba_ensemble.argmax(axis=1)
acc_ens = _show_performance(
    "\n=================== ENSEMBLE (SVM + XGBoost) PERFORMANCE ===================",
    y_pred_ens,
)

## 11. Confusion Matrix (Ensemble)

In [ ]:
# Confusion counts for the ensemble, restricted to labels seen in the test set
cm = confusion_matrix(y_test, y_pred_ens, labels=test_labels)

plt.figure(figsize=(10, 8))
plt.imshow(cm, cmap="Blues")
plt.title("Confusion Matrix - Ensemble (SVM + XGBoost)")
plt.colorbar()
plt.xlabel("Predicted label")
plt.ylabel("True label")

# One tick per class, labelled with the category name
tick_marks = np.arange(len(test_label_names))
plt.xticks(tick_marks, test_label_names, rotation=45, ha="right")
plt.yticks(tick_marks, test_label_names)

# Annotate each cell with its count; cells above half the maximum count get
# white text, the rest black.
thresh = cm.max() / 2.0
for i, j in np.ndindex(*cm.shape):
    text_color = "white" if cm[i, j] > thresh else "black"
    plt.text(j, i, cm[i, j], ha="center", va="center", color=text_color)

plt.tight_layout()
plt.show()

## 12. Mistake Analysis

In [ ]:
# One row per test description with the numeric prediction of each model
results_df = pd.DataFrame({
    "Description": X_test_text,
    "True_label_id": y_test,
    "Pred_svm_id": y_pred_svm,
    "Pred_xgb_id": y_pred_xgb,
    "Pred_ens_id": y_pred_ens,
})

# Decode every id column back into its category name
decoded_columns = (
    ("True_label", "True_label_id"),
    ("Pred_svm", "Pred_svm_id"),
    ("Pred_xgb", "Pred_xgb_id"),
    ("Pred_ens", "Pred_ens_id"),
)
for name_col, id_col in decoded_columns:
    results_df[name_col] = le.inverse_transform(results_df[id_col])

# Rows where the ensemble disagrees with the ground truth
is_wrong = results_df["True_label"] != results_df["Pred_ens"]
mistakes = results_df[is_wrong]

print("Number of misclassified examples (ensemble):", len(mistakes))
print("\nSample misclassifications:")
mistakes[[
    "Description", "True_label", "Pred_svm", "Pred_xgb", "Pred_ens"
]].head(15)

## 13. Helper Function for Single Prediction

In [ ]:
def predict_category(description: str):
    """Return the ensemble's predicted category name for one description.

    The description is embedded with the notebook's ``encoder``. The SVM is
    given the embedding after ``scaler.transform``; XGBoost is given the raw
    embedding. Their class probabilities are averaged and the top-scoring
    label id is decoded with ``le``.
    """
    raw_vec = encoder.encode([description], convert_to_numpy=True)

    svm_probs = best_svm.predict_proba(scaler.transform(raw_vec))
    xgb_probs = xgb.predict_proba(raw_vec)
    averaged = (svm_probs + xgb_probs) / 2.0

    # Single input row, so take the first (only) argmax entry
    best_id = np.argmax(averaged, axis=1)[0]
    decoded = le.inverse_transform([best_id])
    return decoded[0]

# Quick demonstration of predict_category on a made-up transaction
example = "Starbucks coffee and sandwich"
print("Description:", example)
predicted = predict_category(example)
print("Predicted category:", predicted)