# Automated Purchase Order (PO) Classification

In [None]:
import os
import glob
import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score
from xgboost import XGBClassifier
from scipy.sparse import hstack
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import plotly.express as px

In [2]:
# Folder containing your CSV files
dataset_path = "dataset/"
all_files = glob.glob(os.path.join(dataset_path, "*.csv"))

df_list = []

# Function to normalize column names
def normalize_columns(df):
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
    return df

# Load and normalize all files
for f in all_files:
    try:
        temp_df = pd.read_csv(f, encoding='utf-8', index_col=False)
    except UnicodeDecodeError:
        temp_df = pd.read_csv(f, encoding='ISO-8859-1', index_col=False)
    
    temp_df = normalize_columns(temp_df)
    df_list.append(temp_df)

# Combine all CSVs into one DataFrame
df = pd.concat(df_list, ignore_index=True)

# Ensure important columns exist
for col in ['account_name', 'supplier_name', 'total_value']:
    if col not in df.columns:
        df[col] = ""
    df[col] = df[col].fillna("").astype(str).str.strip()

# Create PO description for NLP
df['po_description'] = df['account_name'] + " | " + df['supplier_name'] + " | " + df['total_value']

# Check result
print(df[['po_description']].head())

                                      po_description
0       RENTS | M N PROPERTY CONSULTANTS | £8,875.00
1      MINOR BUILDING WORKS | C E S LTD | £54,065.77
2  PROPERTY - WORKS ELEMENT | ELECTRIC CENTER | £...
3  MINOR BUILDING WORKS | OPS ENVIRONMENTAL SERVI...
4             YOUNG PEOPLES RENTS | HOST | £6,006.00


In [3]:
def clean_text(text):
    text = text.lower()  # lowercase
    text = re.sub(r'[^a-z0-9\s]', ' ', text)  # remove punctuation/special chars
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    return text.strip()

df['po_description_clean'] = df['po_description'].apply(clean_text)

# Quick look
print(df[['po_description_clean', 'account_name']].head())

                                po_description_clean              account_name
0            rents m n property consultants 8 875 00                     RENTS
1           minor building works c e s ltd 54 065 77      MINOR BUILDING WORKS
2   property works element electric center 10 000 00  PROPERTY - WORKS ELEMENT
3  minor building works ops environmental service...      MINOR BUILDING WORKS
4                  young peoples rents host 6 006 00       YOUNG PEOPLES RENTS


In [4]:
# Keep only labels with at least 2 occurrences
label_counts = df['account_name'].value_counts()
valid_labels = label_counts[label_counts >= 2].index
df_filtered = df[df['account_name'].isin(valid_labels)]

X = df_filtered['po_description_clean']
y = df_filtered['account_name']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)

Train size: (2208,)
Test size: (553,)


In [5]:
# Pipeline: TF-IDF vectorizer + Logistic Regression
model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
    ('clf', LogisticRegression(max_iter=1000))
])

# Train the model
model.fit(X_train, y_train)

# Predict on test set
y_pred = model.predict(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\n=== Overall Accuracy: {accuracy:.2%} ===\n")

# Classification report (suppress warnings for rare classes)
report = classification_report(y_test, y_pred, zero_division=0)
print("=== Detailed Class Performance ===")
print(report)


=== Overall Accuracy: 91.86% ===

=== Detailed Class Performance ===
                                        precision    recall  f1-score   support

                         ACCOMMODATION       1.00      1.00      1.00        10
                            ACTIVITIES       0.70      1.00      0.82        16
                           ADAPTATIONS       0.00      0.00      0.00         1
                     ADOPTION PAYMENTS       1.00      1.00      1.00         3
                           ADVERTISING       0.00      0.00      0.00         1
                          AGENCY STAFF       0.96      1.00      0.98        22
                     AGILISYS CONTRACT       1.00      1.00      1.00         1
                                  AGMA       0.00      0.00      0.00         1
                            ALLOWANCES       0.00      0.00      0.00         1
      ASSET MANAGEMENT PLANS & SURVEYS       0.00      0.00      0.00         1
                          BANK CHARGES       1.00

In [6]:
# Count labels
label_counts = df['account_name'].value_counts()

# Define threshold (e.g., less than 5 occurrences = rare)
threshold = 5
rare_labels = label_counts[label_counts < threshold].index

# Replace rare labels with 'Other'
df['account_name_clean'] = df['account_name'].apply(lambda x: 'Other' if x in rare_labels else x)

# Check counts
print(df['account_name_clean'].value_counts())

account_name_clean
MINOR BUILDING WORKS              302
TAXI HIRE                         238
Other                             183
FOSTER PARENTS BASIC ALLOWANCE    124
AGENCY STAFF                      109
                                 ... 
NEW CONSTRUCTION                    5
BANK CHARGES                        5
PURCHASE OF ICT SOFTWARE            5
TREES, SHRUBS, PLANTS               5
SITE INVESTIGATION                  5
Name: count, Length: 81, dtype: int64


In [7]:
# Extract numeric value from total_value
df['amount'] = df['total_value'].str.replace(r'[^0-9.]', '', regex=True).astype(float)

# Flag if the PO description mentions "rent" or "salary"
df['keyword_rent'] = df['po_description_clean'].apply(lambda x: 1 if 'rent' in x else 0)
df['keyword_salary'] = df['po_description_clean'].apply(lambda x: 1 if 'salary' in x else 0)

# Quick check
print(df[['po_description_clean', 'amount', 'keyword_rent', 'keyword_salary']].head())

                                po_description_clean    amount  keyword_rent  \
0            rents m n property consultants 8 875 00   8875.00             1   
1           minor building works c e s ltd 54 065 77  54065.77             0   
2   property works element electric center 10 000 00  10000.00             0   
3  minor building works ops environmental service...  15800.00             0   
4                  young peoples rents host 6 006 00   6006.00             1   

   keyword_salary  
0               0  
1               0  
2               0  
3               0  
4               0  


In [8]:
# Split dataset
X = df['po_description_clean']
y = df['account_name_clean']  # cleaned labels with rare classes handled
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Engineered features
X_train_extra = np.stack([
    df.loc[X_train.index, 'amount'].fillna(0),
    df.loc[X_train.index, 'keyword_rent'].fillna(0),
    df.loc[X_train.index, 'keyword_salary'].fillna(0)
], axis=1)
X_test_extra = np.stack([
    df.loc[X_test.index, 'amount'].fillna(0),
    df.loc[X_test.index, 'keyword_rent'].fillna(0),
    df.loc[X_test.index, 'keyword_salary'].fillna(0)
], axis=1)

# Combine TF-IDF + engineered features
X_train_combined = hstack([X_train_tfidf, X_train_extra])
X_test_combined = hstack([X_test_tfidf, X_test_extra])

# Encode labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc = le.transform(y_test)

# Train XGBoost classifier
xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
xgb_model.fit(X_train_combined, y_train_enc)

# Predict & decode labels
y_pred_enc = xgb_model.predict(X_test_combined)
y_pred = le.inverse_transform(y_pred_enc)

# Evaluate
print(f"\n=== Accuracy: {accuracy_score(y_test, y_pred):.2%} ===\n")
print(classification_report(y_test, y_pred, zero_division=0))

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



=== Accuracy: 98.39% ===

                                        precision    recall  f1-score   support

                         ACCOMMODATION       1.00      1.00      1.00        10
                            ACTIVITIES       1.00      1.00      1.00        16
                     ADOPTION PAYMENTS       1.00      1.00      1.00         3
                          AGENCY STAFF       1.00      1.00      1.00        22
                     AGILISYS CONTRACT       1.00      1.00      1.00         1
      ASSET MANAGEMENT PLANS & SURVEYS       0.00      0.00      0.00         1
                          BANK CHARGES       1.00      1.00      1.00         1
                        BASIC SALARIES       1.00      1.00      1.00         1
    CAPITAL - STAFFING AGENCY PAYMENTS       1.00      1.00      1.00         2
                      CAPITAL SALARIES       1.00      1.00      1.00         6
 COMMUNICATION AND COMPUTING - GENERAL       1.00      1.00      1.00         2
            

In [None]:
# Distribution of predicted labels
pred_counts = pd.Series(y_pred).value_counts().sort_values(ascending=False)
fig = px.bar(
    x=pred_counts.index,
    y=pred_counts.values,
    text=pred_counts.values,
    labels={'x': 'PO Category', 'y': 'Count'},
    title='Predicted PO Category Distribution'
)

fig.update_traces(textposition='outside')
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [None]:
# Get feature names from TF-IDF
feature_names = tfidf.get_feature_names_out()
# Get average importance per feature (approximation using model coefficients)
# Only works if using linear model; for tree-based models, we need feature_importances_
# Here we use XGBoost feature importance
importances = xgb_model.feature_importances_[:len(feature_names)]
top_idx = np.argsort(importances)[-20:]  # top 20
top_features = [feature_names[i] for i in top_idx]
top_importances = importances[top_idx]

fig = px.bar(
    x=top_features,
    y=top_importances,
    labels={'x': 'Feature', 'y': 'Importance'},
    title='Top 20 TF-IDF Features'
)
fig.update_layout(xaxis_tickangle=-45)
fig.show()

In [None]:
# 1️⃣ Predictions
# Logistic Regression
y_pred_lr = model.predict(X_test)  # your pipeline

# XGBoost
y_pred_xgb_enc = xgb_model.predict(X_test_combined)
y_pred_xgb = le.inverse_transform(y_pred_xgb_enc)

# 2️⃣ Compute metrics
metrics = {
    'Model': ['Logistic Regression']*3 + ['XGBoost']*3,
    'Metric': ['Accuracy', 'Macro F1', 'Weighted F1']*2,
    'Value': [
        accuracy_score(y_test, y_pred_lr),
        f1_score(y_test, y_pred_lr, average='macro', zero_division=0),
        f1_score(y_test, y_pred_lr, average='weighted', zero_division=0),
        accuracy_score(y_test, y_pred_xgb),
        f1_score(y_test, y_pred_xgb, average='macro', zero_division=0),
        f1_score(y_test, y_pred_xgb, average='weighted', zero_division=0)
    ]
}

df_metrics = pd.DataFrame(metrics)
df_metrics['Value_percent'] = df_metrics['Value'] * 100  # convert to %

# 3️⃣ Plot comparison
fig = px.bar(
    df_metrics,
    x='Metric',
    y='Value_percent',
    color='Model',
    barmode='group',
    text='Value_percent',
    labels={'Value_percent': 'Score (%)'},
    title='Model Performance Comparison: Logistic Regression vs XGBoost'
)

fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.update_layout(yaxis=dict(range=[0, 105]))
fig.show()

In [None]:
# Features and labels
X_text = df['po_description_clean'].tolist()
y_labels = df['account_name_clean'].tolist()

# Split dataset
X_train_text, X_test_text, y_train_labels, y_test_labels = train_test_split(
    X_text, y_labels, test_size=0.2, random_state=42, stratify=y_labels
)

In [None]:
label_encoder_bert = LabelEncoder()
y_train_enc_bert = label_encoder_bert.fit_transform(y_train_labels)
y_test_enc_bert = label_encoder_bert.transform(y_test_labels)
num_classes_bert = len(label_encoder_bert.classes_)

In [None]:
tokenizer_bert = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

train_encodings_bert = tokenizer_bert(
    X_train_text,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt'
)

test_encodings_bert = tokenizer_bert(
    X_test_text,
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt'
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
class POCDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}, torch.tensor(self.labels[idx])

train_dataset_bert = POCDataset(train_encodings_bert, y_train_enc_bert)
test_dataset_bert = POCDataset(test_encodings_bert, y_test_enc_bert)

In [None]:
model_bert = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=num_classes_bert
)

In [None]:
train_loader_bert = DataLoader(train_dataset_bert, batch_size=16, shuffle=True)
test_loader_bert = DataLoader(test_dataset_bert, batch_size=16)

optimizer_bert = AdamW(model_bert.parameters(), lr=5e-5)

# Use Metal backend on Mac M2 if available
device_bert = torch.device('mps') if torch.backends.mps.is_available() else torch.device('cpu')
model_bert.to(device_bert)

In [None]:
model_bert.train()
for epoch in range(2):  # Increase to 3-5 for better accuracy
    loop = tqdm(train_loader_bert, leave=True)
    for batch in loop:
        inputs_batch = {key: val.to(device_bert) for key, val in batch[0].items()}
        labels_batch = batch[1].to(device_bert)

        optimizer_bert.zero_grad()
        outputs_batch = model_bert(**inputs_batch, labels=labels_batch)
        loss_batch = outputs_batch.loss
        loss_batch.backward()
        optimizer_bert.step()

        loop.set_description(f'Epoch {epoch+1}')
        loop.set_postfix(loss=loss_batch.item())

In [None]:
model_bert.eval()
all_preds_bert = []

with torch.no_grad():
    for batch in test_loader_bert:
        inputs_batch = {key: val.to(device_bert) for key, val in batch[0].items()}
        outputs_batch = model_bert(**inputs_batch)
        preds_batch = torch.argmax(outputs_batch.logits, dim=1)
        all_preds_bert.extend(preds_batch.cpu().numpy())

accuracy_bert = accuracy_score(y_test_enc_bert, all_preds_bert)
print(f"\n=== DistilBERT Accuracy: {accuracy_bert:.2%} ===\n")
print(classification_report(y_test_enc_bert, all_preds_bert, target_names=label_encoder_bert.classes_, zero_division=0))