Task 1: Loading Documents and Computing TF, IDF, TF-IDF 

In [2]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [3]:
def load_documents_from_directory(directory_path):
    """Read every ``.txt`` file in a directory, ordered by the number in its name.

    Entries that are not ``.txt`` files are ignored *before* sorting, so stray
    files such as ``.DS_Store`` or a README no longer abort the whole load.

    Args:
        directory_path: Path of the folder to scan (not recursive).

    Returns:
        A ``(documents, filenames)`` tuple of two index-aligned lists: the text
        of each file and the matching file name. Both lists are empty when the
        directory cannot be listed; in that case an error line is printed.
    """
    try:
        entries = os.listdir(directory_path)
    except OSError:
        print(f"Error: Could not find or read files in directory '{directory_path}'.")
        return [], []

    def numeric_order(name):
        # Numbered files sort by their first run of digits (2 before 10);
        # names without digits go last, alphabetically, instead of crashing.
        match = re.search(r'(\d+)', name)
        if match:
            return (0, int(match.group(1)), name)
        return (1, 0, name)

    file_list = sorted(
        (name for name in entries if name.endswith(".txt")),
        key=numeric_order,
    )

    documents = []
    filenames = []
    for filename in file_list:
        filepath = os.path.join(directory_path, filename)
        if not os.path.isfile(filepath):
            # A sub-directory that happens to end in ".txt" is not a document.
            continue
        # errors='ignore' is a deliberate best-effort: undecodable bytes are dropped.
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            documents.append(f.read())
        filenames.append(filename)
    return documents, filenames

# Folder holding the article text files; the path is relative to the working directory.
bbc_folder_path = 'BBC'
# corpus[i] is the text of the file named doc_names[i]; both are empty lists if loading failed.
corpus, doc_names = load_documents_from_directory(bbc_folder_path)


In [4]:
def preprocess_text(text):
    """Normalise a raw document and split it into tokens.

    The text is lower-cased, then rewritten by an ordered series of regex
    substitutions: e-mail addresses become ``EMAIL``, ``YYYY-MM-DD`` /
    ``YYYY/MM/DD`` dates become ``DATE``, remaining digit runs become ``NUM``,
    and finally every character that is neither a word character nor
    whitespace is removed. The placeholders stay upper-case because they are
    inserted after lower-casing.

    Args:
        text: The document as a single string.

    Returns:
        The list of whitespace-separated tokens.
    """
    # Order matters: dates must be caught before the generic number rule,
    # and punctuation is stripped last so '@', '-', '/' are still present above.
    substitutions = (
        (r'\b[a-z0-9._%+-]+@[a-z0-9.-]+\.[a-z]{2,}\b', 'EMAIL'),
        (r'\b\d{4}[-/]\d{2}[-/]\d{2}\b', 'DATE'),
        (r'\b\d+[\d.,-]*\b', 'NUM'),
        (r'[^\w\s]', ''),
    )
    normalized = text.lower()
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized.split()

# Tokenise each loaded article; when nothing was loaded the token list stays empty
# and no summary is printed.
if not corpus:
    preprocessed_corpus = []
else:
    preprocessed_corpus = list(map(preprocess_text, corpus))
    print("--- Preprocessing Complete ---")
    print(f"Loaded and preprocessed {len(preprocessed_corpus)} documents.")
    print(f"Example tokens from first document ({doc_names[0]}): {preprocessed_corpus[0][:20]}\n")

# The vocabulary is the alphabetically sorted set of distinct tokens across all documents.
if not preprocessed_corpus:
    vocabulary = []
    vocabulary_size = 0
else:
    all_tokens = [tok for tokens in preprocessed_corpus for tok in tokens]
    vocabulary = sorted(set(all_tokens))
    vocabulary_size = len(vocabulary)
    print(f"--- Vocabulary Size ---")
    print(f"Total number of unique words (vocabulary size): {vocabulary_size}\n")


--- Preprocessing Complete ---
Loaded and preprocessed 20 documents.
Example tokens from first document (001.txt): ['claxton', 'hunting', 'first', 'major', 'medal', 'british', 'hurdler', 'sarah', 'claxton', 'is', 'confident', 'she', 'can', 'win', 'her', 'first', 'major', 'medal', 'at', 'next']

--- Vocabulary Size ---
Total number of unique words (vocabulary size): 1247



In [5]:
def compute_tf(preprocessed_corpus):
    """Compute per-document term frequencies.

    Args:
        preprocessed_corpus: Iterable of documents, each a list of tokens.

    Returns:
        A list with one dict per document mapping each token to
        ``count / number_of_tokens_in_document``. An empty document
        yields an empty dict.
    """
    def term_frequencies(tokens):
        total = len(tokens)
        if not total:
            # Avoid dividing by zero for a document with no tokens.
            return {}
        return {term: occurrences / total
                for term, occurrences in Counter(tokens).items()}

    return [term_frequencies(tokens) for tokens in preprocessed_corpus]

def compute_idf(preprocessed_corpus, vocabulary):
    """Compute unsmoothed inverse document frequencies, ``ln(N / df)``.

    Args:
        preprocessed_corpus: List of documents, each a list of tokens.
        vocabulary: Words to score. Repeated entries are scored once.

    Returns:
        A dict mapping each vocabulary word that occurs in at least one
        document to ``np.log(N / df)``, where ``N`` is the number of documents
        and ``df`` the number of documents containing the word. Keys follow
        the order of ``vocabulary``. Returns ``{}`` when either argument is
        empty.
    """
    if not preprocessed_corpus or not vocabulary:
        return {}

    n_docs = len(preprocessed_corpus)

    # Reduce each document to its distinct tokens once, so a word is counted
    # at most once per document and no list is rescanned per vocabulary word.
    doc_freq = Counter()
    for doc_tokens in preprocessed_corpus:
        doc_freq.update(set(doc_tokens))

    # dict.fromkeys keeps first-seen order while dropping duplicate vocabulary
    # entries, which would otherwise inflate a word's document frequency.
    return {
        word: np.log(n_docs / doc_freq[word])
        for word in dict.fromkeys(vocabulary)
        if doc_freq[word] > 0
    }

def compute_tfidf(tf_scores, idf_scores):
    """Combine term frequencies with IDF weights.

    Args:
        tf_scores: List of per-document dicts mapping word -> term frequency.
        idf_scores: Dict mapping word -> IDF weight.

    Returns:
        A list of per-document dicts mapping word -> ``tf * idf``. A word
        missing from ``idf_scores`` is weighted by 0.
    """
    def weighted(term, tf_value):
        return tf_value * idf_scores.get(term, 0)

    return [
        {term: weighted(term, tf_value) for term, tf_value in doc_tf.items()}
        for doc_tf in tf_scores
    ]


In [6]:
# Compare the hand-written TF-IDF with scikit-learn's on the same token stream.
if preprocessed_corpus:
    # --- Hand-written TF, IDF and TF-IDF ---
    tf_scores_custom = compute_tf(preprocessed_corpus)
    idf_scores_custom = compute_idf(preprocessed_corpus, vocabulary)
    tfidf_scores_custom = compute_tfidf(tf_scores_custom, idf_scores_custom)

    # One row per document, one column per word; a word absent from a document scores 0.
    df_tfidf_custom = pd.DataFrame(tfidf_scores_custom).fillna(0)
    df_tfidf_custom.index = doc_names
    print("--- Custom TF-IDF Matrix (Top 10 words) ---")
    print(df_tfidf_custom.iloc[:10, :10])
    print("\n")

    # --- scikit-learn reference ---
    processed_docs_str = [' '.join(tokens) for tokens in preprocessed_corpus]
    # TfidfVectorizer's defaults would lowercase the EMAIL/DATE/NUM placeholders and
    # discard one-character tokens, so its vocabulary would not match the custom one
    # and the IDF table below would show NaN for those words. Keep case and accept
    # tokens of any length so both implementations score the same words.
    tfidf_vec = TfidfVectorizer(
        smooth_idf=True,
        norm='l2',
        lowercase=False,
        token_pattern=r'(?u)\b\w+\b',
    )
    tfidf_sklearn_matrix = tfidf_vec.fit_transform(processed_docs_str)
    sklearn_vocab = tfidf_vec.get_feature_names_out()

    df_tfidf_sklearn = pd.DataFrame(tfidf_sklearn_matrix.toarray(), columns=sklearn_vocab, index=doc_names)
    print("--- Scikit-learn TF-IDF Matrix (Top 10 words) ---")
    # Show the same leading columns as the custom matrix, skipping any word sklearn lacks.
    common_vocab_slice = [word for word in df_tfidf_custom.columns[:10] if word in sklearn_vocab]
    if common_vocab_slice:
        print(df_tfidf_sklearn.iloc[:10][common_vocab_slice])
    else:
        print("No common vocabulary in the slice to display.")
    print("\n")

    # --- IDF side by side, rows in sklearn's feature order ---
    df_idf_sklearn = pd.DataFrame(tfidf_vec.idf_, index=sklearn_vocab, columns=['Sklearn IDF'])
    df_idf_custom = pd.DataFrame.from_dict(idf_scores_custom, orient='index', columns=['Custom IDF'])
    df_idf_compare = df_idf_custom.join(df_idf_sklearn).reindex(df_idf_sklearn.index).head(10)
    print("--- IDF Score Comparison (Custom vs. Sklearn) ---")
    print("Note: Sklearn uses a smoothed formula: log((N+1)/(df+1)) + 1")
    print(df_idf_compare)
    print("\n")

    # --- Highest-scoring words of the first document under each implementation ---
    doc_to_check = doc_names[0]
    top_words_custom = df_tfidf_custom.loc[doc_to_check].sort_values(ascending=False).head(10)
    top_words_sklearn = df_tfidf_sklearn.loc[doc_to_check].sort_values(ascending=False).head(10)
    print(f"--- Top 10 TF-IDF Words for Document: {doc_to_check} ---")
    print("\nCustom Implementation:")
    print(top_words_custom)
    print("\nScikit-learn Implementation:")
    print(top_words_sklearn)
else:
    print("Execution halted because no documents were loaded.")


--- Custom TF-IDF Matrix (Top 10 words) ---
          claxton   hunting     first     major     medal   british   hurdler  \
001.txt  0.099858  0.014265  0.022992  0.018068  0.019997  0.009998  0.010965   
002.txt  0.000000  0.000000  0.011177  0.000000  0.000000  0.000000  0.000000   
003.txt  0.000000  0.000000  0.000000  0.000000  0.002792  0.000000  0.000000   
004.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
005.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
006.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
007.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
008.txt  0.000000  0.000000  0.000000  0.000000  0.000000  0.004486  0.000000   
009.txt  0.000000  0.000000  0.004191  0.000000  0.002734  0.002734  0.005996   
010.txt  0.000000  0.000000  0.000000  0.000000  0.004199  0.004199  0.000000   

            sarah        is  confident  
001.txt  0.014265  0.00