In [48]:
#libraries import

import os
import re
import math
from math import sqrt
import matplotlib.pyplot as plt
from collections import defaultdict

In [78]:
#all the queries in a list
# Evaluation queries. run_all_queries_and_average() pairs the i-th query with
# the i-th non-blank line of Relevant.txt, so the order here matters.
QUERIES = [
    "How effective are inhalations of mucolytic agents in the treatment of CF patients",
    "What is the role of aerosols in the treatment of lung disease in CF patients",
    "What is the role of bacterial phagocytosis by alveolar macrophages or polymorphonuclear leukocytes in lung disease in CF patients",
    "What is the relationship between Haemophilus influenzae and Pseudomonas aeruginosa in CF patients",
    "Do CF patients ever develop infection in organs other than the lung If so in what organs",
    # The trailing phrase used to be repeated twice, which doubled the weight of
    # "lung", "disease", "cf" and "patients" in this query's vector.
    "What is the role of bacteria other than Pseudomonas aeruginosa Staphylococcus aureus or Haemophilus influenzae in the pathogenesis of lung disease in CF patients",
    "What is the role of fungi in the pathogenesis of lung disease in CF patients",
    "What is the role of viral infection in the lung disease of CF patients",
    "What is the epidemiology of Pseudomonas aeruginosa in CF patients",
    "What factors are responsible for the appearance of mucoid strains of Pseudomonas aeruginosa in CF patients",
    "What are the unusual manifestations of CF other than lung disease or exocrine pancreatic insufficiency",
    "What is the prognosis for survival of patients with CF",
    "Do CF patients have normal intelligence",
    "What animal models are available which are relevant to CF",
    "What abnormalities of skeletal muscle function or structure have been found in CF patients",
    "Is there an increased incidence of dental problems eg caries or periodontal disease in CF patients",
    "Is oxygen transport by red blood cells abnormal in CF patients",
    "What are the effects of CF on the development and or function of the brain and central nervous system",
    "Are there abnormalities of taste in CF patients",
    "What is the incidence of and treatment for hypertrophic osteoarthropathy in CF patients"
]


In [79]:
#relevant.txt file locating
# All inputs are resolved relative to the directory the kernel was started in:
# the relevance judgements file and the folder holding the document files.
base_dir = os.getcwd()
relevant_file_path = os.path.join(base_dir, "Relevant.txt")
docs_folder = os.path.join(base_dir, "docs")


In [51]:
#stopwords that are gonna be ignored afterwards
# Lower-case words that preprocess() drops from documents and queries.
# Being a set literal, repeated entries (e.g. "no", "just") collapse harmlessly.
stopwords = {
    # determiners and pronouns
    "a", "an", "the", "this", "that", "these", "those", "such",
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    "one", "ones", "someone", "somebody", "everyone", "everybody", "noone", "nobody", "anyone", "anybody",
    "anything", "everything", "nothing", "something",
    # auxiliary and modal verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "do", "does", "did", "doing", "have", "has", "had", "having",
    "can", "could", "may", "might", "shall", "should", "will", "would", "must", "ought",
    # negations, conjunctions and prepositions
    "not", "no", "nor", "neither", "none", "never", "without",
    "and", "or", "but", "if", "while", "although", "though", "because", "since", "so", "than",
    "as", "at", "by", "for", "from", "in", "into", "of", "off", "on", "out", "over", "through", "to", "toward",
    "under", "until", "up", "upon", "with", "within", "between", "before", "after", "during", "about", "around",
    # adverbs and quantifiers
    "also", "very", "too", "just", "even", "still", "yet", "already", "here", "there", "when", "where",
    "how", "why", "once", "then", "ever", "always", "often", "sometimes", "rarely", "seldom",
    "all", "any", "both", "each", "either", "few", "many", "much", "more", "most", "some", "several",
    "various", "enough", "less", "least", "lot", "lots", "plenty",
    "now", "today", "tonight", "tomorrow", "yesterday", "ago", "soon", "later",
    "therefore", "thus", "however", "hence", "meanwhile",
    # conversational fillers
    "okay", "ok", "yes", "no", "well", "oh", "hey", "hi", "hello", "thanks", "thank", "please",
    "like", "just", "really", "actually", "basically", "literally",
    # contractions: these entries contain an apostrophe, so they can only match
    # a token that still carries its apostrophe at the moment of the lookup
    "aren't", "isn't", "wasn't", "weren't", "hasn't", "haven't", "hadn't",
    "doesn't", "don't", "didn't", "won't", "wouldn't", "can't", "couldn't", "shouldn't", "mustn't",
    "i'm", "you're", "he's", "she's", "it's", "we're", "they're",
    "i've", "you've", "we've", "they've",
    "i'll", "you'll", "he'll", "she'll", "we'll", "they'll",
    "i'd", "you'd", "he'd", "she'd", "we'd", "they'd",
    "that's", "there's", "what's", "who's", "where's", "when's", "why's", "how's",
    "let's", "ain't", "y'all", "ya", "gonna", "wanna", "gotta",
    # miscellaneous
    "own", "same", "else", "per", "via", "etc", "among", "beside", "despite",
    "towards", "regarding", "including", "excluding", "whether", "whichever", "whatever",
}


In [52]:
#normalize a single token: lowercase it, keep only the letters a-z, and drop stopwords and single-letter tokens
def preprocess(word, stopword_set=None):
    """Normalize one token for indexing, or reject it.

    The token is lower-cased and reduced to the letters a-z. Stopwords and
    tokens of fewer than two letters are rejected.

    Args:
        word: Raw token (a single word, possibly with punctuation).
        stopword_set: Optional collection of stopwords to filter against.
            Defaults to the module-level ``stopwords`` set.

    Returns:
        The cleaned token, or ``None`` when the token is filtered out.
    """
    if stopword_set is None:
        stopword_set = stopwords

    # Trim only leading/trailing non-letters first, so that contractions such
    # as "don't" are compared against the stopword list while they still have
    # their apostrophe. Stripping everything up front would turn "don't" into
    # "dont", which no stopword entry matches.
    token = re.sub(r"^[^a-z]+|[^a-z]+$", "", word.lower())
    if token in stopword_set:
        return None

    w = re.sub(r"[^a-z]", "", token)
    if len(w) > 1 and w not in stopword_set:
        return w
    return None

#append each word in a form of documents[doc_id] = words(including duplicates)

def load_and_preprocess_documents(docs_dir=None, first_id=1, last_id=1239):
    """Read the numbered document files and return their cleaned tokens.

    Each document is expected in a file named after its zero-padded five-digit
    id (``00001``, ``00002``, ...). Every whitespace-separated token in the
    file is passed through ``preprocess``; rejected tokens are dropped and
    duplicates are kept.

    Args:
        docs_dir: Folder containing the document files. Defaults to the
            module-level ``docs_folder``.
        first_id: First document id to load (inclusive).
        last_id: Last document id to load (inclusive).

    Returns:
        dict mapping doc_id -> list of cleaned tokens. An id whose file is
        missing is still present, mapped to an empty list, so it also counts
        towards ``len(documents)``.
    """
    if docs_dir is None:
        docs_dir = docs_folder

    documents = {}
    for doc_id in range(first_id, last_id + 1):
        file_path = os.path.join(docs_dir, f"{doc_id:05d}")
        cleaned_words = []
        if os.path.exists(file_path):
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    # Split on whitespace so a line holding several words is
                    # not fused into one bogus term; one-word-per-line files
                    # behave exactly as before, and blank lines yield nothing.
                    for raw_word in line.split():
                        p = preprocess(raw_word)
                        if p:
                            cleaned_words.append(p)
        documents[doc_id] = cleaned_words

    print(f"Loaded and preprocessed {len(documents)} documents.")
    return documents

In [53]:
#custom inverted index
def build_inverted_index(documents):
    """Build a term -> {doc_id: occurrence count} index.

    Args:
        documents: dict mapping doc_id -> list of tokens (duplicates allowed).

    Returns:
        Nested defaultdict: outer keys are terms, inner keys are doc ids and
        inner values are how often the term occurs in that document.
    """
    postings_by_term = defaultdict(lambda: defaultdict(int))
    for doc_id in documents:
        for token in documents[doc_id]:
            term_postings = postings_by_term[token]
            term_postings[doc_id] = term_postings[doc_id] + 1
    print("Inverted index created.")
    return postings_by_term

#term frequency definition
def TF(inverted_index, word, doc):
    """Raw term frequency: occurrences of `word` in `doc`, 0 if absent."""
    postings = inverted_index.get(word, {})
    return postings.get(doc, 0)

#inverse document frequency
def IDF(inverted_index, word, N_docs):
    """Inverse document frequency, log10(N_docs / df).

    Returns 0.0 when the term appears in no document (df == 0).
    """
    document_frequency = len(inverted_index.get(word, {}))
    if document_frequency == 0:
        return 0.0
    return math.log10(N_docs / document_frequency)


def TF_IDF(inverted_index, word, doc, N_docs):
    """TF-IDF weight of `word` in `doc`: TF(...) multiplied by IDF(...)."""
    term_frequency = TF(inverted_index, word, doc)
    inverse_doc_frequency = IDF(inverted_index, word, N_docs)
    return term_frequency * inverse_doc_frequency

#tfidfindex: same as documents[doc_id] but with each distinct word as a tfidf value
def build_tf_idf_index(inverted_index, documents):
    """Compute the TF-IDF weight of every (term, document) posting.

    Args:
        inverted_index: term -> {doc_id: count} mapping.
        documents: doc_id -> tokens mapping; only its size is used, as the
            collection size N for IDF.

    Returns:
        Nested defaultdict term -> {doc_id: tf-idf weight}, with the same
        postings as `inverted_index`.
    """
    collection_size = len(documents)
    weighted_index = defaultdict(lambda: defaultdict(float))
    for term, postings in inverted_index.items():
        for doc_id in postings:
            weighted_index[term][doc_id] = TF_IDF(inverted_index, term, doc_id, collection_size)
    print("TF-IDF index created.")
    return weighted_index

#L2 norm of each document's TF-IDF vector, computed from TF_IDF_INDEX and stored separately
def build_doc_norms(TF_IDF_INDEX):
    """Compute the Euclidean (L2) length of each document's TF-IDF vector.

    Args:
        TF_IDF_INDEX: term -> {doc_id: weight} mapping.

    Returns:
        defaultdict doc_id -> norm. A zero norm is replaced by 1.0 so callers
        can divide by it safely.
    """
    lengths = defaultdict(float)
    for postings in TF_IDF_INDEX.values():
        for doc_id, weight in postings.items():
            lengths[doc_id] += weight * weight
    # At this point `lengths` holds squared norms; take the root in place.
    for doc_id in list(lengths):
        lengths[doc_id] = sqrt(lengths[doc_id]) or 1.0
    print("Document norms computed.")
    return lengths

In [54]:
#preprocess a query and convert it into a tfidf vector + normalization
def _query_vector(terms, TF_IDF_INDEX, inverted_index, N_docs, weight="idf"):
    """Turn raw query terms into a unit-length weight vector.

    Terms are cleaned with `preprocess`; terms that are filtered out or are
    not present in `TF_IDF_INDEX` are ignored. Each remaining occurrence adds
    the term's IDF (``weight == "idf"``) or 1.0 (any other value).

    Returns:
        dict term -> normalized weight, or an empty dict when no term survives.
    """
    use_idf = weight == "idf"
    accumulated = defaultdict(float)
    for raw_term in terms:
        cleaned = preprocess(raw_term)
        if not cleaned or cleaned not in TF_IDF_INDEX:
            continue
        if use_idf:
            accumulated[cleaned] += IDF(inverted_index, cleaned, N_docs)
        else:
            accumulated[cleaned] += 1.0

    if not accumulated:
        return {}

    # Fall back to 1.0 when every weight is zero, to avoid dividing by zero.
    length = math.sqrt(sum(value * value for value in accumulated.values())) or 1.0
    return {term: value / length for term, value in accumulated.items()}


#calculate cosine similarity between document vector and query vector. for every document
def rank_query(terms, TF_IDF_INDEX, DOC_NORM, inverted_index, documents, k=100, weight="idf"):
    """Rank documents by cosine similarity to the query.

    Only documents sharing at least one term with the query are scored.

    Args:
        terms: Raw query tokens.
        TF_IDF_INDEX: term -> {doc_id: weight} mapping.
        DOC_NORM: doc_id -> L2 norm of the document vector.
        inverted_index: term -> {doc_id: count} mapping (used for IDF).
        documents: doc_id -> tokens mapping; its size is the collection size.
        k: Maximum number of results, or None for all scored documents.
        weight: Query weighting scheme, forwarded to `_query_vector`.

    Returns:
        List of (doc_id, cosine) tuples in descending score order; empty when
        the query vector is empty.
    """
    query_weights = _query_vector(terms, TF_IDF_INDEX, inverted_index, len(documents), weight=weight)
    if not query_weights:
        return []

    # Accumulate the dot product term by term over the postings lists.
    dot_products = defaultdict(float)
    for term, query_weight in query_weights.items():
        postings = TF_IDF_INDEX.get(term, {})
        for doc_id, doc_weight in postings.items():
            dot_products[doc_id] += query_weight * doc_weight

    # The query vector is already unit length, so only the document norm remains.
    cosines = [(doc_id, dot / DOC_NORM[doc_id]) for doc_id, dot in dot_products.items()]
    cosines.sort(key=lambda pair: pair[1], reverse=True)
    if k is None:
        return cosines
    return cosines[:k]

#relevant docs loading. (20 lines)
def load_all_relevant_lists():
    """Read the relevance judgements from `relevant_file_path`.

    Every non-blank line is parsed as whitespace-separated integer doc ids.

    Returns:
        List of lists of ints, one per non-blank line, in file order. Empty
        when the file does not exist (a message is printed in that case).
    """
    all_rels = []
    if not os.path.exists(relevant_file_path):
        print("File not found:", relevant_file_path)
        return all_rels

    with open(relevant_file_path, "r", encoding="utf-8") as handle:
        non_blank = [stripped for stripped in (raw.strip() for raw in handle) if stripped]

    for entry in non_blank:
        all_rels.append([int(token) for token in entry.split()])

    print(f"Loaded {len(all_rels)} relevance sets from Relevant.txt")
    return all_rels

In [55]:
def TP(list_a, list_b):
    """True positives: number of distinct items present in both lists."""
    return len(set(list_a) & set(list_b))


def FP(list_a, list_b):
    """False positives: number of distinct items in list_a missing from list_b."""
    return len(set(list_a) - set(list_b))


def FN(list_a, list_b):
    """False negatives: number of distinct items in list_b missing from list_a."""
    return len(set(list_b) - set(list_a))


def precision(retrieved, relevant):
    """Fraction of retrieved documents that are relevant (0.0 if none retrieved)."""
    hits = TP(retrieved, relevant)
    retrieved_total = hits + FP(retrieved, relevant)
    if not retrieved_total:
        return 0.0
    return hits / retrieved_total


def recall(retrieved, relevant):
    """Fraction of relevant documents that were retrieved (0.0 if none relevant)."""
    hits = TP(retrieved, relevant)
    relevant_total = hits + FN(retrieved, relevant)
    if not relevant_total:
        return 0.0
    return hits / relevant_total

#precision@k definition
def precision_at_k(ranked, relevant, k):
    """Precision over the first k ranked documents; 0.0 when k is not positive.

    The denominator is always k, even if fewer than k documents were ranked.
    """
    if k > 0:
        return TP(ranked[:k], relevant) / k
    return 0.0

#recall@k definition
def recall_at_k(ranked, relevant, k):
    """Recall over the first k ranked documents.

    Returns 0.0 when there are no relevant documents or k is not positive.
    The denominator is ``len(relevant)``.
    """
    if k > 0 and relevant:
        return TP(ranked[:k], relevant) / len(relevant)
    return 0.0

#f1score definition
def f1_score(p, r):
    """Harmonic mean of precision `p` and recall `r`; 0.0 when both are zero."""
    denominator = p + r
    if denominator == 0:
        return 0.0
    return 2 * p * r / denominator


def f1_at_k(ranked, relevant, k):
    """F1 score computed from precision@k and recall@k."""
    return f1_score(
        precision_at_k(ranked, relevant, k),
        recall_at_k(ranked, relevant, k),
    )


def average_precision(ranked, relevant):
    """Average precision of a ranking.

    Sums precision at each rank where a relevant document appears and divides
    by the number of distinct relevant documents. Returns 0.0 when `relevant`
    is empty.
    """
    relevant_ids = set(relevant)
    if len(relevant_ids) == 0:
        return 0.0

    hits = 0
    precision_sum = 0.0
    position = 0
    for doc_id in ranked:
        position += 1
        if doc_id not in relevant_ids:
            continue
        hits += 1
        precision_sum += hits / position
    return precision_sum / len(relevant_ids)


In [56]:

#evaluates a single query: ranks documents by cosine similarity and returns evaluation metrics, with k = number of relevant docs listed for that query in Relevant.txt
def evaluate_single_query(query, relevant_docs, documents, inverted_index, TF_IDF_INDEX, DOC_NORM,
                          retrieval_depth=100):
    """Rank documents for one query and compute its evaluation metrics.

    Args:
        query: Query text; it is split on whitespace into terms.
        relevant_docs: Doc ids judged relevant for this query.
        documents: doc_id -> tokens mapping (its size is the collection size).
        inverted_index: term -> {doc_id: count} mapping.
        TF_IDF_INDEX: term -> {doc_id: weight} mapping.
        DOC_NORM: doc_id -> L2 norm of the document vector.
        retrieval_depth: Number of top-ranked documents the "overall" metrics
            and average precision are computed on. ``None`` means every
            scored document.

    Returns:
        ``None`` when `relevant_docs` is empty; otherwise a dict with keys
        ``k``, ``precision_at_k``, ``recall_at_k``, ``f1_at_k``,
        ``precision_overall``, ``recall_overall``, ``f1_overall`` and
        ``average_precision``, where k = len(relevant_docs).
    """
    if not relevant_docs:
        return None

    k = len(relevant_docs)
    terms = query.split()

    # The @k metrics need k ranked documents. With a fixed cut-off of 100 and
    # k > 100, precision@k / recall@k were computed on only 100 documents and
    # came out too low, so retrieve at least k.
    depth = None if retrieval_depth is None else max(retrieval_depth, k)
    results = rank_query(terms, TF_IDF_INDEX, DOC_NORM, inverted_index, documents, k=depth)
    ranked = [int(doc_id) for doc_id, _ in results]

    # The "overall" metrics and AP keep their original meaning: they are
    # measured on the first `retrieval_depth` results only.
    retrieved = ranked if retrieval_depth is None else ranked[:retrieval_depth]

    # compute metrics at k
    p_at_k = precision_at_k(ranked, relevant_docs, k)
    r_at_k = recall_at_k(ranked, relevant_docs, k)
    f1_k = f1_at_k(ranked, relevant_docs, k)

    # overall metrics (use the retrieved cut)
    p_overall = precision(retrieved, relevant_docs)
    r_overall = recall(retrieved, relevant_docs)
    f1_overall = f1_score(p_overall, r_overall)

    ap = average_precision(retrieved, relevant_docs)

    return {
        "k": k,
        "precision_at_k": p_at_k,
        "recall_at_k": r_at_k,
        "f1_at_k": f1_k,
        "precision_overall": p_overall,
        "recall_overall": r_overall,
        "f1_overall": f1_overall,
        "average_precision": ap
    }

In [57]:
#run the evaluate_single_query function for all queries
def run_all_queries_and_average():
    """Run the full pipeline: index the corpus, evaluate every query, print results.

    Builds the inverted index, TF-IDF index and document norms, loads the
    relevance lists, evaluates each (query, relevance list) pair and prints
    per-query metrics followed by their averages over the evaluated queries.
    Returns nothing.
    """
    documents = load_and_preprocess_documents()
    inverted_index = build_inverted_index(documents)
    TF_IDF_INDEX = build_tf_idf_index(inverted_index, documents)
    DOC_NORM = build_doc_norms(TF_IDF_INDEX)

    all_rels = load_all_relevant_lists()
    if not all_rels:
        print("No relevance data found. Stopping.")
        return

    num_queries = min(len(QUERIES), len(all_rels))
    if len(QUERIES) != len(all_rels):
        print(
            f"Warning: {len(QUERIES)} queries but {len(all_rels)} relevance lines. "
            f"Using first {num_queries} pairs."
        )

    # Running totals, one per metric, for the averages printed at the end.
    totals = {
        "precision_at_k": 0.0,
        "recall_at_k": 0.0,
        "f1_at_k": 0.0,
        "precision_overall": 0.0,
        "recall_overall": 0.0,
        "f1_overall": 0.0,
        "average_precision": 0.0,
    }
    valid_count = 0
    divider = "=" * 60

    # zip() stops at the shorter sequence, i.e. after num_queries pairs.
    for i, (query, rel_docs) in enumerate(zip(QUERIES, all_rels)):
        print("\n" + divider)
        print(f"Query #{i+1}")
        print(f"Text: {query}")
        print(f"Relevant docs: {rel_docs}")

        metrics = evaluate_single_query(
            query,
            rel_docs,
            documents,
            inverted_index,
            TF_IDF_INDEX,
            DOC_NORM,
        )

        if metrics is None:
            print("No relevant docs for this query, skipping.")
            continue

        k = metrics["k"]
        print(f"\nk (|relevant|) = {k}")
        print(f"Precision@{k}      = {metrics['precision_at_k']:.4f}")
        print(f"Recall@{k}         = {metrics['recall_at_k']:.4f}")
        print(f"F1@{k}             = {metrics['f1_at_k']:.4f}")
        print(f"Precision (overall) = {metrics['precision_overall']:.4f}")
        print(f"Recall (overall)    = {metrics['recall_overall']:.4f}")
        print(f"F1 (overall)        = {metrics['f1_overall']:.4f}")
        print(f"Average Precision   = {metrics['average_precision']:.4f}")

        for key in totals:
            totals[key] += metrics[key]
        valid_count += 1

    if valid_count == 0:
        print("No valid queries evaluated.")
        return

    # compute averages
    print("\n" + divider)
    print(f"AVERAGE METRICS OVER {valid_count} QUERIES")
    print(divider)
    print(f"Avg Precision@k      = {totals['precision_at_k'] / valid_count:.4f}")
    print(f"Avg Recall@k         = {totals['recall_at_k'] / valid_count:.4f}")
    print(f"Avg F1@k             = {totals['f1_at_k'] / valid_count:.4f}")
    print(f"Avg Precision (all)  = {totals['precision_overall'] / valid_count:.4f}")
    print(f"Avg Recall (all)     = {totals['recall_overall'] / valid_count:.4f}")
    print(f"Avg F1 (all)         = {totals['f1_overall'] / valid_count:.4f}")
    print(f"Mean Average Prec    = {totals['average_precision'] / valid_count:.4f}")

In [58]:
#printing all ranked cosines(used for report)
def print_ranked_cosines(query, TF_IDF_INDEX, DOC_NORM, inverted_index, documents, top_n=50, weight="idf"):
    """Print the best-scoring documents for a query with their cosine scores.

    The ranking is requested with k=100, so at most 100 documents are
    considered; the first `top_n` of them are printed.
    """
    ranked = rank_query(
        query.split(), TF_IDF_INDEX, DOC_NORM, inverted_index, documents,
        k=100, weight=weight
    )

    if len(ranked) == 0:
        print("No results (query vector empty after preprocessing).")
        return

    print(f"Query: {query}")
    print(f"Total ranked docs: {len(ranked)}")
    print(f"Showing top {min(top_n, len(ranked))}:\n")

    position = 0
    for doc_id, score in ranked[:top_n]:
        position += 1
        print(f"{position:4d}. doc_id={doc_id:5d}   cosine={score:.6f}")


def print_doc_vector(tf_idf_index, doc_id, top_k=None):
    """Print the TF-IDF weights of one document, heaviest terms first.

    Args:
        tf_idf_index: term -> {doc_id: weight} mapping.
        doc_id: Document id; converted to int when possible, otherwise used
            as given.
        top_k: If not None, print only the first top_k terms.
    """
    try:
        doc_id = int(doc_id)
    except (TypeError, ValueError):
        # Best effort: fall back to looking the id up unchanged.
        pass

    weights = {}
    for term, postings in tf_idf_index.items():
        if doc_id in postings:
            weights[term] = postings[doc_id]

    if not weights:
        print(f"No terms found for doc_id={doc_id}")
        return

    ordered = sorted(weights.items(), key=lambda pair: pair[1], reverse=True)
    shown = ordered if top_k is None else ordered[:top_k]

    print(f"TF-IDF vector for doc_id={doc_id} (total terms={len(weights)}):")
    for term, w in shown:
        print(f"  {term:25s} {w:.6f}")


def plot_ranked_cosines(query, TF_IDF_INDEX, DOC_NORM, inverted_index, documents, top_n=100, weight="idf"):
    """Plot cosine similarity against rank for the top results of a query.

    The ranking is requested with k=100; when `top_n` is not None only the
    first `top_n` of those results are drawn.
    """
    results = rank_query(
        query.split(), TF_IDF_INDEX, DOC_NORM, inverted_index, documents,
        k=100, weight=weight
    )

    if not results:
        print("No results to plot.")
        return

    shown = results if top_n is None else results[:top_n]

    cosine_values = [pair[1] for pair in shown]
    positions = list(range(1, len(cosine_values) + 1))

    plt.figure(figsize=(10, 6))
    plt.plot(positions, cosine_values, marker='o', markersize=3)
    plt.xlabel("Rank")
    plt.ylabel("Cosine Similarity")
    plt.title(f"Ranked Cosine Similarities (Top {len(shown)})")
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()


In [59]:
# Run the whole evaluation with whichever TF / TF_IDF / build_tf_idf_index
# definitions are currently bound in the kernel.
if __name__ == "__main__":
    run_all_queries_and_average()

Loaded and preprocessed 1239 documents.
Inverted index created.
TF-IDF index created.
Document norms computed.
Loaded 20 relevance sets from Relevant.txt

Query #1
Text: How effective are inhalations of mucolytic agents in the treatment of CF patients
Relevant docs: [31, 90, 93, 195, 253, 321, 326, 359, 370, 438, 542, 546, 550, 551, 586, 592, 729, 731, 734, 941, 944, 990, 1092, 1134, 1232, 1234]

k (|relevant|) = 26
Precision@26      = 0.3077
Recall@26         = 0.3077
F1@26             = 0.3077
Precision (overall) = 0.0900
Recall (overall)    = 0.3462
F1 (overall)        = 0.1429
Average Precision   = 0.2086

Query #2
Text: What is the role of aerosols in the treatment of lung disease in CF patients
Relevant docs: [25, 31, 90, 93, 148, 152, 159, 189, 195, 197, 238, 253, 297, 321, 326, 327, 330, 331, 333, 359, 370, 427, 438, 458, 542, 543, 546, 550, 551, 586, 592, 722, 724, 729, 731, 734, 826, 837, 845, 879, 882, 889, 904, 912, 937, 941, 944, 952, 963, 990, 1000, 1003, 1040, 1078, 1092

In [68]:
#logarithmic term frequency alternative testing
def TF(inverted_index, word, doc, max_tf=None):
    """Logarithmic term frequency: 1 + log10(tf), or 0.0 for an absent term.

    Args:
        inverted_index: term -> {doc_id: count} mapping.
        word: Term to look up.
        doc: Document id.
        max_tf: Unused by this variant. Kept (now optional) so the function can
            be called with either the three- or four-argument TF signature
            used elsewhere in the notebook.

    Returns:
        ``1 + log10(count)`` when the term occurs in the document, else 0.0.
    """
    raw_tf = inverted_index.get(word, {}).get(doc, 0)
    # log10(0) raises ValueError, so a missing term must be handled explicitly.
    if raw_tf <= 0:
        return 0.0
    return 1 + math.log10(raw_tf)

In [69]:
# Re-run the evaluation after the cell above rebinds TF to the logarithmic variant.
# NOTE(review): that TF takes four parameters, while the TF_IDF defined earlier
# in this file calls TF with three arguments - confirm which TF_IDF /
# build_tf_idf_index definitions are live in the kernel before trusting this run.
if __name__ == "__main__":
    run_all_queries_and_average()

Loaded and preprocessed 1239 documents.
Inverted index created.
Document norms computed.
Loaded 20 relevance sets from Relevant.txt

Query #1
Text: How effective are inhalations of mucolytic agents in the treatment of CF patients
Relevant docs: [31, 90, 93, 195, 253, 321, 326, 359, 370, 438, 542, 546, 550, 551, 586, 592, 729, 731, 734, 941, 944, 990, 1092, 1134, 1232, 1234]

k (|relevant|) = 26
Precision@26      = 0.2308
Recall@26         = 0.2308
F1@26             = 0.2308
Precision (overall) = 0.1000
Recall (overall)    = 0.3846
F1 (overall)        = 0.1587
Average Precision   = 0.1999

Query #2
Text: What is the role of aerosols in the treatment of lung disease in CF patients
Relevant docs: [25, 31, 90, 93, 148, 152, 159, 189, 195, 197, 238, 253, 297, 321, 326, 327, 330, 331, 333, 359, 370, 427, 438, 458, 542, 543, 546, 550, 551, 586, 592, 722, 724, 729, 731, 734, 826, 837, 845, 879, 882, 889, 904, 912, 937, 941, 944, 952, 963, 990, 1000, 1003, 1040, 1078, 1092, 1150, 1188, 1227, 12

In [70]:
#augmented term frequency alternative testing

def compute_max_tf(inverted_index):
    """Find, for each document, the highest raw count of any single term.

    Args:
        inverted_index: term -> {doc_id: count} mapping.

    Returns:
        defaultdict doc_id -> largest term count seen in that document.
    """
    highest = defaultdict(int)
    for postings in inverted_index.values():
        for doc_id in postings:
            highest[doc_id] = max(highest[doc_id], postings[doc_id])
    return highest


def TF(inverted_index, word, doc, max_tf):
    """Augmented term frequency: 0.5 + 0.5 * tf / max_tf[doc].

    Returns 0.0 when the term does not occur in the document. `max_tf` maps
    doc_id -> the largest term count in that document.
    """
    raw_count = inverted_index.get(word, {}).get(doc, 0)
    if raw_count != 0:
        return 0.5 + 0.5 * (raw_count / max_tf[doc])
    return 0.0


def TF_IDF(inverted_index, word, doc, N_docs, max_tf):
    """TF-IDF weight using the four-argument TF (which receives `max_tf`)."""
    term_frequency = TF(inverted_index, word, doc, max_tf)
    inverse_doc_frequency = IDF(inverted_index, word, N_docs)
    return term_frequency * inverse_doc_frequency

def build_tf_idf_index(inverted_index, documents):
    """Compute TF-IDF weights for every posting, passing max-TF data to TF_IDF.

    Args:
        inverted_index: term -> {doc_id: count} mapping.
        documents: doc_id -> tokens mapping; only its size is used, as the
            collection size N for IDF.

    Returns:
        Nested defaultdict term -> {doc_id: tf-idf weight}.
    """
    N_docs = len(documents)
    # Per-document maximum term count, needed by the augmented TF formula.
    max_tf = compute_max_tf(inverted_index)

    TF_IDF_INDEX = defaultdict(lambda: defaultdict(float))
    for term, postings in inverted_index.items():
        for doc_id in postings:
            TF_IDF_INDEX[term][doc_id] = TF_IDF(
                inverted_index, term, doc_id, N_docs, max_tf
            )
    # Same progress line as the other build_* steps, so the run log shows that
    # the index really was rebuilt with this variant.
    print("TF-IDF index created.")
    return TF_IDF_INDEX


In [71]:
# Re-run the evaluation after the cell above redefines TF, TF_IDF and
# build_tf_idf_index for the augmented term-frequency variant.
if __name__ == "__main__":
    run_all_queries_and_average()

Loaded and preprocessed 1239 documents.
Inverted index created.
Document norms computed.
Loaded 20 relevance sets from Relevant.txt

Query #1
Text: How effective are inhalations of mucolytic agents in the treatment of CF patients
Relevant docs: [31, 90, 93, 195, 253, 321, 326, 359, 370, 438, 542, 546, 550, 551, 586, 592, 729, 731, 734, 941, 944, 990, 1092, 1134, 1232, 1234]

k (|relevant|) = 26
Precision@26      = 0.2308
Recall@26         = 0.2308
F1@26             = 0.2308
Precision (overall) = 0.1000
Recall (overall)    = 0.3846
F1 (overall)        = 0.1587
Average Precision   = 0.1978

Query #2
Text: What is the role of aerosols in the treatment of lung disease in CF patients
Relevant docs: [25, 31, 90, 93, 148, 152, 159, 189, 195, 197, 238, 253, 297, 321, 326, 327, 330, 331, 333, 359, 370, 427, 438, 458, 542, 543, 546, 550, 551, 586, 592, 722, 724, 729, 731, 734, 826, 837, 845, 879, 882, 889, 904, 912, 937, 941, 944, 952, 963, 990, 1000, 1003, 1040, 1078, 1092, 1150, 1188, 1227, 12

In [75]:
#precision-recall curve points creation to be used for latex pgplot
def precision_recall_curve(ranked_docs, relevant_docs):
    """Compute (recall, precision) after each position of a ranking.

    Args:
        ranked_docs: Doc ids in ranked order, best first.
        relevant_docs: Doc ids judged relevant.

    Returns:
        List of (recall, precision) tuples, one per ranked document. When
        `relevant_docs` is empty, recall is reported as 0.0 at every point
        rather than dividing by zero.
    """
    relevant_set = set(relevant_docs)
    total_relevant = len(relevant_set)
    tp = 0
    pr_points = []

    for i, doc_id in enumerate(ranked_docs, start=1):
        if doc_id in relevant_set:
            tp += 1

        # Local names deliberately differ from the module-level precision()
        # and recall() functions.
        precision_here = tp / i
        recall_here = tp / total_relevant if total_relevant else 0.0

        pr_points.append((recall_here, precision_here))

    return pr_points


In [77]:
#documents stats - used for report
def analyze_collection_statistics(inverted_index, documents):
    """Print summary statistics about document lengths and term frequencies.

    Args:
        inverted_index: term -> {doc_id: count} mapping.
        documents: doc_id -> list of tokens mapping.

    Prints the document count and length statistics, the distribution of
    per-posting term frequencies, and the per-document maximum term frequency.
    The medians are the element at index ``len // 2`` of the sorted values.
    An empty collection or an empty index is reported with a short message
    instead of raising. Returns nothing.
    """
    doc_lengths = [len(terms) for terms in documents.values()]

    # Single pass over the postings: collect every term frequency and track
    # the largest one seen for each document.
    all_freqs = []
    max_tf = defaultdict(int)
    for postings in inverted_index.values():
        for doc_id, freq in postings.items():
            all_freqs.append(freq)
            if freq > max_tf[doc_id]:
                max_tf[doc_id] = freq
    max_freq_per_doc = list(max_tf.values())

    rule = "="*60
    print(rule)
    print("COLLECTION STATISTICS")
    print(rule)
    print(f"Total documents: {len(documents)}")
    if not doc_lengths:
        # Nothing to average; avoid dividing by zero / min() of an empty list.
        print("No documents to analyze.")
        print(rule)
        return
    print(f"Avg doc length: {sum(doc_lengths)/len(doc_lengths):.1f} words")
    print(f"Min doc length: {min(doc_lengths)} words")
    print(f"Max doc length: {max(doc_lengths)} words")
    print()
    if not all_freqs:
        print("No term occurrences to analyze.")
        print(rule)
        return

    n_postings = len(all_freqs)
    print("TERM FREQUENCY DISTRIBUTION:")
    print(f"  Avg TF: {sum(all_freqs)/n_postings:.2f}")
    print(f"  Median TF: {sorted(all_freqs)[n_postings//2]}")
    print(f"  Max TF: {max(all_freqs)}")
    print(f"  % of TF=1: {100*all_freqs.count(1)/n_postings:.1f}%")
    print(f"  % of TF=2: {100*all_freqs.count(2)/n_postings:.1f}%")
    print(f"  % of TF>5: {100*sum(1 for f in all_freqs if f>5)/n_postings:.1f}%")
    print(f"  % of TF>10: {100*sum(1 for f in all_freqs if f>10)/n_postings:.1f}%")
    print()
    print("MAX FREQUENCY PER DOCUMENT:")
    print(f"  Avg max TF: {sum(max_freq_per_doc)/len(max_freq_per_doc):.2f}")
    print(f"  Median max TF: {sorted(max_freq_per_doc)[len(max_freq_per_doc)//2]}")
    print(rule)

# Load the corpus and build the inverted index at notebook scope, then print
# the collection statistics that appear in the output below.
documents = load_and_preprocess_documents()
inverted_index = build_inverted_index(documents)
analyze_collection_statistics(inverted_index, documents)

Loaded and preprocessed 1239 documents.
Inverted index created.
COLLECTION STATISTICS
Total documents: 1239
Avg doc length: 80.6 words
Min doc length: 0 words
Max doc length: 301 words

TERM FREQUENCY DISTRIBUTION:
  Avg TF: 1.46
  Median TF: 1
  Max TF: 25
  % of TF=1: 75.5%
  % of TF=2: 14.3%
  % of TF>5: 1.5%
  % of TF>10: 0.1%

MAX FREQUENCY PER DOCUMENT:
  Avg max TF: 5.42
  Median max TF: 5
