# Task C: Document Classification

1. Imports and Setup

In [1]:
import os
import re
import numpy as np
import pandas as pd
from collections import Counter

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

print("Setup complete.")

Setup complete.


2. Data Loading and Preprocessing

In [2]:
def preprocess_text(text):
    """Cleans and tokenizes text."""
    stop_words = set(stopwords.words('english'))
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    return " ".join([word for word in tokens if word not in stop_words and len(word) > 2])

def load_labeled_data(folder_path):
    """Loads documents and their corresponding labels."""
    all_docs = []
    all_labels = []
    print(f"Loading data from: {folder_path}")
    for category in os.listdir(folder_path):
        category_path = os.path.join(folder_path, category)
        if not os.path.isdir(category_path):
            continue
        for filename in os.listdir(category_path):
            file_path = os.path.join(category_path, filename)
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                all_docs.append(preprocess_text(f.read()))
                all_labels.append(category)
    return all_docs, all_labels

BBC_FOLDER = "bbc"  
documents, labels = load_labeled_data(BBC_FOLDER)

print(f"\nLoaded {len(documents)} documents.")
print(f"Example document: '{documents[0][:100]}...'")
print(f"Label for example document: {labels[0]}")
print(f"Class distribution: {Counter(labels)}")

Loading data from: bbc

Loaded 2225 documents.
Example document: 'sales boost time warner profit quarterly profits media giant timewarner jumped three months december...'
Label for example document: business
Class distribution: Counter({'sport': 511, 'business': 510, 'politics': 417, 'tech': 401, 'entertainment': 386})


3. Feature Extraction (TF-IDF)

In [3]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(documents)  
y = np.array(labels)

print(f"Shape of the TF-IDF feature matrix: {X.shape}")

Shape of the TF-IDF feature matrix: (2225, 5000)


4. Data Splitting & CV Setup

In [4]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

print(f"Training set size: {X_train.shape[0]} documents")
print(f"Test set size: {X_test.shape[0]} documents")

cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

Training set size: 1780 documents
Test set size: 445 documents


5. Helper Function

In [7]:
def evaluate_model(model, X, y, cv):
    """Performs 5-fold cross-validation and prints results."""
    scoring_metrics = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
    scores = cross_validate(model, X, y, cv=cv, scoring=scoring_metrics)

    print(" Cross-Validation Results per Fold ")
    for i in range(cv.get_n_splits()):
        print(f"Fold {i+1}:")
        print(f"  Accuracy: {scores['test_accuracy'][i]:.4f}")
        print(f"  Precision: {scores['test_precision_macro'][i]:.4f}")
        print(f"  Recall: {scores['test_recall_macro'][i]:.4f}")
        print(f"  F1-Score: {scores['test_f1_macro'][i]:.4f}")

    avg_scores = {
        'Avg Accuracy': np.mean(scores['test_accuracy']),
        'Avg Precision': np.mean(scores['test_precision_macro']),
        'Avg Recall': np.mean(scores['test_recall_macro']),
        'Avg F1-Score': np.mean(scores['test_f1_macro']),
    }

    print("\n Average Cross-Validation Scores ")
    for metric, value in avg_scores.items():
        print(f"{metric}: {value:.4f}")

    return avg_scores

6. Task C.a: Logistic Regression

In [8]:
print("\n Evaluating Logistic Regression ")
lr_classifier = LogisticRegression(random_state=42, max_iter=1000)
lr_avg_scores = evaluate_model(lr_classifier, X_train, y_train, cv_strategy)

print("\n Final Performance on Hold-Out Test Set ")
lr_classifier.fit(X_train, y_train)
y_pred_lr = lr_classifier.predict(X_test)
print(classification_report(y_test, y_pred_lr))


 Evaluating Logistic Regression 
 Cross-Validation Results per Fold 
Fold 1:
  Accuracy: 0.9803
  Precision: 0.9803
  Recall: 0.9796
  F1-Score: 0.9799
Fold 2:
  Accuracy: 0.9663
  Precision: 0.9663
  Recall: 0.9644
  F1-Score: 0.9653
Fold 3:
  Accuracy: 0.9635
  Precision: 0.9644
  Recall: 0.9630
  F1-Score: 0.9636
Fold 4:
  Accuracy: 0.9747
  Precision: 0.9777
  Recall: 0.9731
  F1-Score: 0.9748
Fold 5:
  Accuracy: 0.9860
  Precision: 0.9884
  Recall: 0.9844
  F1-Score: 0.9861

 Average Cross-Validation Scores 
Avg Accuracy: 0.9742
Avg Precision: 0.9754
Avg Recall: 0.9729
Avg F1-Score: 0.9739

 Final Performance on Hold-Out Test Set 
               precision    recall  f1-score   support

     business       1.00      0.98      0.99       102
entertainment       0.99      1.00      0.99        77
     politics       0.99      0.98      0.98        84
        sport       0.99      1.00      1.00       102
         tech       0.99      1.00      0.99        80

     accuracy          

7. Task C.b: SVM Classifier

In [9]:
print("\n Evaluating Support Vector Machine (SVM) ")
svm_classifier = SVC(random_state=42)
svm_avg_scores = evaluate_model(svm_classifier, X_train, y_train, cv_strategy)

print("\n Final Performance on Hold-Out Test Set ")
svm_classifier.fit(X_train, y_train)
y_pred_svm = svm_classifier.predict(X_test)
print(classification_report(y_test, y_pred_svm))


 Evaluating Support Vector Machine (SVM) 
 Cross-Validation Results per Fold 
Fold 1:
  Accuracy: 0.9775
  Precision: 0.9771
  Recall: 0.9765
  F1-Score: 0.9767
Fold 2:
  Accuracy: 0.9719
  Precision: 0.9721
  Recall: 0.9701
  F1-Score: 0.9708
Fold 3:
  Accuracy: 0.9691
  Precision: 0.9685
  Recall: 0.9695
  F1-Score: 0.9688
Fold 4:
  Accuracy: 0.9747
  Precision: 0.9767
  Recall: 0.9729
  F1-Score: 0.9745
Fold 5:
  Accuracy: 0.9860
  Precision: 0.9884
  Recall: 0.9844
  F1-Score: 0.9861

 Average Cross-Validation Scores 
Avg Accuracy: 0.9758
Avg Precision: 0.9766
Avg Recall: 0.9747
Avg F1-Score: 0.9754

 Final Performance on Hold-Out Test Set 
               precision    recall  f1-score   support

     business       1.00      0.98      0.99       102
entertainment       0.99      1.00      0.99        77
     politics       0.98      0.98      0.98        84
        sport       0.99      0.99      0.99       102
         tech       0.99      1.00      0.99        80

     accuracy 

8. Comparison and Discussion

In [13]:
comparison_df = pd.DataFrame(
    [lr_avg_scores, svm_avg_scores],
    index=["Logistic Regression", "SVM"]
)

print("\n Side-by-Side Model Comparison (Average CV Scores) ")
print(comparison_df)

print("\nDiscussion ")
if lr_avg_scores['Avg F1-Score'] > svm_avg_scores['Avg F1-Score']:
    print("  Based on the average F1-score, Logistic Regression performed slightly better.")
elif svm_avg_scores['Avg F1-Score'] > lr_avg_scores['Avg F1-Score']:
    print("  Based on the average F1-score, the Support Vector Machine (SVM) performed slightly better.")
else:
    print("  Both models performed equally well based on the average F1-score.")

print("  Both models achieved very high performance (typically >95% accuracy), "
      "indicating that the BBC news dataset is well-separated and that TF-IDF "
      "features are very effective for this topic classification task.")



 Side-by-Side Model Comparison (Average CV Scores) 
                     Avg Accuracy  Avg Precision  Avg Recall  Avg F1-Score
Logistic Regression      0.974157       0.975409    0.972905      0.973940
SVM                      0.975843       0.976554    0.974666      0.975386

Discussion 
  Based on the average F1-score, the Support Vector Machine (SVM) performed slightly better.
  Both models achieved very high performance (typically >95% accuracy), indicating that the BBC news dataset is well-separated and that TF-IDF features are very effective for this topic classification task.
