# Lab Session 5 - Task C: Document Classification

### 1. Imports and Setup

In [1]:
import os
import re
import numpy as np
import pandas as pd
from collections import Counter
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Ensure NLTK data is available
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("Setup complete.")

Setup complete.


### 2. Data Loading and Preprocessing

In [2]:
def preprocess_text(text):
    """Cleans and tokenizes text, returning a single string."""
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text) 
    tokens = word_tokenize(text)
    return " ".join([word for word in tokens if word not in stop_words and len(word) > 2])

def load_labeled_data(folder_path):
    """Loads documents and their corresponding category labels."""
    all_docs = []
    all_labels = []
    print(f"Loading data from: {folder_path}")
    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category)
        if not os.path.isdir(category_path): continue
        for filename in os.listdir(category_path):
            file_path = os.path.join(category_path, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                all_docs.append(preprocess_text(f.read()))
                all_labels.append(category)
    return all_docs, all_labels

BBC_FOLDER = 'bbc'
documents, labels = load_labeled_data(BBC_FOLDER)
print(f"\nLoaded {len(documents)} documents.")
print(f"Class distribution: {Counter(labels)}")

Loading data from: bbc

Loaded 2225 documents.
Class distribution: Counter({'sport': 511, 'business': 510, 'politics': 417, 'tech': 401, 'entertainment': 386})


### 3. Feature Extraction (TF-IDF)

In [3]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(documents) 
y = np.array(labels) 

print(f"Shape of the TF-IDF feature matrix: {X.shape}")

Shape of the TF-IDF feature matrix: (2225, 5000)


### 4. Data Splitting and Cross-Validation Setup

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} documents")
print(f"Test set size: {X_test.shape[0]} documents")

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

Training set size: 1780 documents
Test set size: 445 documents


### 5. Task C.a: Train & Evaluate Logistic Regression

In [5]:
def evaluate_model(model, X, y, cv):
    """A helper function to perform 5-fold cross-validation and print results."""
    scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring_metrics)
    
    print("--- Cross-Validation Results per Fold ---")
    for i in range(cv.get_n_splits()):
        print(f"Fold {i+1}: Accuracy={scores['test_accuracy'][i]:.4f}, F1-Score={scores['test_f1_macro'][i]:.4f}")
    
    print("\n--- Average Cross-Validation Scores ---")
    avg_scores = {
        'Avg Accuracy': np.mean(scores['test_accuracy']),
        'Avg Precision': np.mean(scores['test_precision_macro']),
        'Avg Recall': np.mean(scores['test_recall_macro']),
        'Avg F1-Score': np.mean(scores['test_f1_macro'])
    }
    for metric, value in avg_scores.items():
        print(f"{metric}: {value:.4f}")
    
    return avg_scores

print("--- Evaluating Logistic Regression ---")
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)
lr_avg_scores = evaluate_model(lr_classifier, X_train, y_train, cv_strategy)

--- Evaluating Logistic Regression ---
--- Cross-Validation Results per Fold ---
Fold 1: Accuracy=0.9803, F1-Score=0.9799
Fold 2: Accuracy=0.9663, F1-Score=0.9653
Fold 3: Accuracy=0.9635, F1-Score=0.9636
Fold 4: Accuracy=0.9747, F1-Score=0.9748
Fold 5: Accuracy=0.9860, F1-Score=0.9861

--- Average Cross-Validation Scores ---
Avg Accuracy: 0.9742
Avg Precision: 0.9754
Avg Recall: 0.9729
Avg F1-Score: 0.9739


### 6. Task C.b: Train & Evaluate Support Vector Machine (SVM)

In [6]:
print("--- Evaluating Support Vector Machine (SVM) ---")
svm_classifier = SVC(random_state=42)
svm_avg_scores = evaluate_model(svm_classifier, X_train, y_train, cv_strategy)

--- Evaluating Support Vector Machine (SVM) ---
--- Cross-Validation Results per Fold ---
Fold 1: Accuracy=0.9775, F1-Score=0.9767
Fold 2: Accuracy=0.9719, F1-Score=0.9708
Fold 3: Accuracy=0.9691, F1-Score=0.9688
Fold 4: Accuracy=0.9747, F1-Score=0.9745
Fold 5: Accuracy=0.9860, F1-Score=0.9861

--- Average Cross-Validation Scores ---
Avg Accuracy: 0.9758
Avg Precision: 0.9766
Avg Recall: 0.9747
Avg F1-Score: 0.9754


### 7. Compare Models and Discuss

We now directly compare the average scores from both models to fulfill the final requirement of the task.

In [7]:
comparison_df = pd.DataFrame([lr_avg_scores, svm_avg_scores],
                             index=['Logistic Regression', 'SVM'])

print("--- Side-by-Side Model Comparison (Average CV Scores) ---")
display(comparison_df)

print("\n--- Discussion ---")
best_f1_model = comparison_df['Avg F1-Score'].idxmax()
print(f"Based on the average F1-score, the {best_f1_model} performed slightly better. ")
print("Both models achieved excellent performance (>97% accuracy), suggesting that TF-IDF features are highly effective for this topic classification task.")
print("The slight edge for SVM is common in high-dimensional text classification, as SVMs are powerful at finding optimal separating hyperplanes in such feature spaces.")

--- Side-by-Side Model Comparison (Average CV Scores) ---


Unnamed: 0,Avg Accuracy,Avg Precision,Avg Recall,Avg F1-Score
Logistic Regression,0.974157,0.975409,0.972905,0.97394
SVM,0.975843,0.976554,0.974666,0.975386



--- Discussion ---
Based on the average F1-score, the SVM performed slightly better. 
Both models achieved excellent performance (>97% accuracy), suggesting that TF-IDF features are highly effective for this topic classification task.
The slight edge for SVM is common in high-dimensional text classification, as SVMs are powerful at finding optimal separating hyperplanes in such feature spaces.


### 8. Save Final Models for Future Use


In [8]:
lr_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('logistic_regression_model.pkl', 'wb') as f:
    pickle.dump(lr_classifier, f)

with open('svm_model.pkl', 'wb') as f:
    pickle.dump(svm_classifier, f)

print("Vectorizer and both final models have been saved successfully to disk.")

Vectorizer and both final models have been saved successfully to disk.
