Task 2: Stemming and Lemmatization of the documents using NLTK.

In [4]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2025.7.34-cp310-cp310-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   -------------------- ------------------- 0.8/1.5 MB 4.2 MB/s eta 0:00:01
   ---------------------------------------- 1.5/1.5 MB 4.4 MB/s eta 0:00:00
Downloading regex-2025.7.34-cp310-cp310-win_amd64.whl (276 kB)
Installing collected packages: regex, nltk

   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [n

In [18]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [19]:
import nltk

# Resource path checked with nltk.data.find -> package name passed to
# nltk.download when that resource is missing.
# NOTE(review): 'punkt' is not used by the code visible in this notebook
# section; it is kept because it was requested originally — confirm whether
# a later cell needs it.
_REQUIRED_NLTK_RESOURCES = {
    'tokenizers/punkt': 'punkt',
    'corpora/wordnet': 'wordnet',
    'corpora/omw-1.4': 'omw-1.4',
}

# Check every resource on its own so that one missing package does not
# trigger a re-download of the ones that are already installed.
for _resource_path, _package in _REQUIRED_NLTK_RESOURCES.items():
    try:
        nltk.data.find(_resource_path)
    except LookupError:
        # nltk.download returns False when the download fails; surface that
        # instead of letting a later cell fail with a less obvious error.
        if not nltk.download(_package, quiet=True):
            print(f"Warning: could not download NLTK package '{_package}'.")


In [20]:
def load_documents_from_directory(directory_path):
    """Read every ``.txt`` file in a directory, ordered by the number in its name.

    Files are sorted by the first run of digits found in the filename
    (so ``2.txt`` comes before ``10.txt``). ``.txt`` files whose name holds
    no digits are placed after the numbered ones, in alphabetical order.
    Entries that are not ``.txt`` files, or are not regular files, are ignored.

    Args:
        directory_path: Path of the directory to scan.

    Returns:
        A ``(documents, filenames)`` tuple of two parallel lists: the text of
        each file and the matching filename. ``([], [])`` is returned, after
        printing an error message, when the directory cannot be listed.
    """
    try:
        entries = os.listdir(directory_path)
    except OSError:
        print(f"Error: Could not find or read files in directory '{directory_path}'.")
        return [], []

    def _sort_key(name):
        # Numbered files first (by numeric value), then un-numbered ones;
        # the name itself breaks ties so the order is deterministic.
        match = re.search(r'(\d+)', name)
        if match is None:
            return (1, 0, name)
        return (0, int(match.group(1)), name)

    # Filter BEFORE sorting: a stray entry without digits (e.g. a hidden
    # system file) must not abort loading of the whole directory.
    text_files = sorted(
        (
            name for name in entries
            if name.endswith(".txt")
            and os.path.isfile(os.path.join(directory_path, name))
        ),
        key=_sort_key,
    )

    documents = []
    filenames = []
    for filename in text_files:
        filepath = os.path.join(directory_path, filename)
        # errors='ignore' drops bytes that are not valid UTF-8 instead of failing.
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            documents.append(f.read())
        filenames.append(filename)
    return documents, filenames

def base_preprocess(text):
    """Normalise raw text and split it into whitespace-separated tokens.

    The text is lowercased, then e-mail addresses, ``YYYY-MM-DD`` /
    ``YYYY/MM/DD`` dates and numbers are replaced by the upper-case
    placeholders ``EMAIL``, ``DATE`` and ``NUM`` (in that order, so a date
    is not consumed by the number rule). Finally every character that is
    neither a word character nor whitespace is deleted.

    Args:
        text: The raw document string.

    Returns:
        The list of tokens obtained by splitting the cleaned text on whitespace.
    """
    # (pattern, replacement) pairs, applied strictly in this order.
    substitutions = (
        (r'\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b', 'EMAIL'),
        (r'\b\d{4}[-/]\d{2}[-/]\d{2}\b', 'DATE'),
        (r'\b\d+[\d.,-]*\b', 'NUM'),
        (r'[^\w\s]', ''),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.split()

def compute_tf(preprocessed_corpus):
    """Compute the term frequency of every token in every document.

    Args:
        preprocessed_corpus: Iterable of documents, each a list of tokens.

    Returns:
        A list with one dict per document, mapping each token to its count
        divided by the document length. An empty document yields ``{}``.
    """
    def _relative_frequencies(tokens):
        total = len(tokens)
        if not total:
            return {}
        return {
            term: occurrences / total
            for term, occurrences in Counter(tokens).items()
        }

    return [_relative_frequencies(tokens) for tokens in preprocessed_corpus]

def compute_idf(preprocessed_corpus, vocabulary):
    """Compute the inverse document frequency of each vocabulary term.

    ``idf(word) = log(N / df(word))`` where ``N`` is the number of documents
    and ``df(word)`` is the number of documents containing the word.

    Args:
        preprocessed_corpus: Sequence of documents, each a list of tokens.
        vocabulary: Iterable of terms to score.

    Returns:
        A dict mapping each vocabulary term that occurs in at least one
        document to its IDF, in vocabulary order. Terms that occur in no
        document are omitted. ``{}`` is returned when either argument is empty.
    """
    if not preprocessed_corpus or not vocabulary:
        return {}
    n_docs = len(preprocessed_corpus)

    # One pass over the corpus: each document contributes at most 1 to the
    # document frequency of a term, hence the set(). This replaces a
    # list-membership scan per (term, document) pair.
    doc_freq = Counter()
    for doc_tokens in preprocessed_corpus:
        doc_freq.update(set(doc_tokens))

    # dict.fromkeys keeps vocabulary order while ignoring repeated terms, so
    # a duplicated vocabulary entry cannot inflate its document frequency.
    return {
        word: np.log(n_docs / doc_freq[word])
        for word in dict.fromkeys(vocabulary)
        if doc_freq[word] > 0
    }

def compute_tfidf(tf_scores, idf_scores):
    """Combine per-document term frequencies with corpus-wide IDF values.

    Args:
        tf_scores: List of dicts, one per document, mapping term -> TF.
        idf_scores: Dict mapping term -> IDF.

    Returns:
        A list of dicts, one per document, mapping each term of that document
        to ``tf * idf``. A term without an IDF entry is weighted by 0.
    """
    idf_of = idf_scores.get
    return [
        {term: frequency * idf_of(term, 0) for term, frequency in doc_tf.items()}
        for doc_tf in tf_scores
    ]


In [21]:
# Module-level NLTK instances, created once and shared by
# preprocess_with_stemming and preprocess_with_lemmatization.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_with_stemming(text):
    """Tokenise ``text`` with ``base_preprocess`` and Porter-stem every token.

    Args:
        text: The raw document string.

    Returns:
        The list of stemmed tokens, in document order.
    """
    tokens = base_preprocess(text)
    # base_preprocess has already lowercased the text, so the only upper-case
    # tokens are its EMAIL/DATE/NUM placeholders. Turning off the stemmer's
    # own lowercasing keeps them distinct from the ordinary words
    # "email"/"date"/"num", consistent with the lemmatized pipeline.
    return [stemmer.stem(token, to_lowercase=False) for token in tokens]

def preprocess_with_lemmatization(text):
    """Tokenise ``text`` with ``base_preprocess`` and lemmatize every token.

    Each token is passed to the shared ``WordNetLemmatizer`` with its default
    arguments.

    Args:
        text: The raw document string.

    Returns:
        The list of lemmatized tokens, in document order.
    """
    lemmas = []
    for word in base_preprocess(text):
        lemmas.append(lemmatizer.lemmatize(word))
    return lemmas

In [22]:
def _build_vocabulary(tokenized_corpus):
    """Return the sorted list of distinct tokens across all documents."""
    return sorted(set(token for doc in tokenized_corpus for token in doc))


def _build_tfidf(tokenized_corpus, vocabulary, index):
    """Run the TF -> IDF -> TF-IDF pipeline for one tokenized corpus.

    Returns:
        ``(tf, idf, tfidf, frame)`` where ``frame`` is the TF-IDF DataFrame
        (one row per document, one column per term, absent terms filled with
        0) whose rows are labelled with ``index``.
    """
    tf = compute_tf(tokenized_corpus)
    idf = compute_idf(tokenized_corpus, vocabulary)
    tfidf = compute_tfidf(tf, idf)
    frame = pd.DataFrame(tfidf).fillna(0)
    frame.index = index
    return tf, idf, tfidf, frame


# Directory holding the .txt documents, relative to the notebook's working directory.
bbc_folder_path = 'BBC'
corpus, doc_names = load_documents_from_directory(bbc_folder_path)

if corpus:
    # Three views of the same corpus: plain tokens, stems, and lemmas.
    original_processed_corpus = [base_preprocess(doc) for doc in corpus]
    stemmed_corpus = [preprocess_with_stemming(doc) for doc in corpus]
    lemmatized_corpus = [preprocess_with_lemmatization(doc) for doc in corpus]

    print("Preprocessing complete.\n")

    vocab_original = _build_vocabulary(original_processed_corpus)
    vocab_lemmatized = _build_vocabulary(lemmatized_corpus)
    vocab_stemmed = _build_vocabulary(stemmed_corpus)

    print("Vocabulary Size Comparison:")
    print(f"Original:     {len(vocab_original)}")
    print(f"Lemmatized:   {len(vocab_lemmatized)}")
    print(f"Stemmed:      {len(vocab_stemmed)}\n")

    print("TF-IDF (Lemmatized Corpus):")
    tf_lemmatized, idf_lemmatized, tfidf_lemmatized, df_tfidf_lemmatized = _build_tfidf(
        lemmatized_corpus, vocab_lemmatized, doc_names
    )
    # Only a 10x10 corner is shown; the full matrix is too wide to print.
    print(df_tfidf_lemmatized.iloc[:10, :10])
    print("\n" + "="*50 + "\n")

    print("TF-IDF (Stemmed Corpus):")
    tf_stemmed, idf_stemmed, tfidf_stemmed, df_tfidf_stemmed = _build_tfidf(
        stemmed_corpus, vocab_stemmed, doc_names
    )
    print(df_tfidf_stemmed.iloc[:10, :10])

else:
    print("No documents loaded.")


Preprocessing complete.

Vocabulary Size Comparison:
Original:     1247
Lemmatized:   1172
Stemmed:      1078

TF-IDF (Lemmatized Corpus):
          claxton   hunting     first     major     medal   british   hurdler  \
001.txt  0.099858  0.014265  0.022992  0.018068  0.019997  0.009998  0.010965   
002.txt  0.000000  0.000000  0.011177  0.000000  0.000000  0.000000  0.000000   
003.txt  0.000000  0.000000  0.000000  0.000000  0.002792  0.000000  0.000000   
004.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
005.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
006.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
007.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
008.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.004486  0.000000   
009.txt  0.000000  0.000000  0.004191  0.000000  0.002734  0.002734  0.005996   
010.txt  0.000000  0.000000  0.000000  0.000000  0.