In [None]:
#required labraries import
import os
import re
from collections import defaultdict
from itertools import product
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
#all the queries in a list
QUERIES = [
    "How effective are inhalations of mucolytic agents in the treatment of CF patients",
    "What is the role of aerosols in the treatment of lung disease in CF patients",
    "What is the role of bacterial phagocytosis by alveolar macrophages or polymorphonuclear leukocytes in lung disease in CF patients",
    "What is the relationship between Haemophilus influenzae and Pseudomonas aeruginosa in CF patients",
    "Do CF patients ever develop infection in organs other than the lung If so in what organs",
    "What is the role of bacteria other than Pseudomonas aeruginosa Staphylococcus aureus or Haemophilus influenzae in the pathogenesis of lung disease in CF patients of lung disease in CF patients",
    "What is the role of fungi in the pathogenesis of lung disease in CF patients",
    "What is the role of viral infection in the lung disease of CF patients",
    "What is the epidemiology of Pseudomonas aeruginosa in CF patients",
    "What factors are responsible for the appearance of mucoid strains of Pseudomonas aeruginosa in CF patients",
    "What are the unusual manifestations of CF other than lung disease or exocrine pancreatic insufficiency",
    "What is the prognosis for survival of patients with CF",
    "Do CF patients have normal intelligence",
    "What animal models are available which are relevant to CF",
    "What abnormalities of skeletal muscle function or structure have been found in CF patients",
    "Is there an increased incidence of dental problems eg caries or periodontal disease in CF patients",
    "Is oxygen transport by red blood cells abnormal in CF patients",
    "What are the effects of CF on the development and or function of the brain and central nervous system",
    "Are there abnormalities of taste in CF patients",
    "What is the incidence of and treatment for hypertrophic osteoarthropathy in CF patients"
]


In [None]:
#relevant.txt file path finder
base_dir = os.getcwd() 
docs_folder = os.path.join(base_dir, "docs")
relevant_file_path = os.path.join(base_dir, "Relevant.txt")

In [None]:
#stopwords for removal afterwards
stopwords = {
    "a", "an", "the", "this", "that", "these", "those", "such",
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself",
    "she", "her", "hers", "herself",
    "it", "its", "itself",
    "they", "them", "their", "theirs", "themselves",
    "one", "ones", "someone", "somebody", "everyone", "everybody", "noone", "nobody", "anyone", "anybody",
    "anything", "everything", "nothing", "something",
    "am", "is", "are", "was", "were", "be", "been", "being",
    "do", "does", "did", "doing", "have", "has", "had", "having",
    "can", "could", "may", "might", "shall", "should", "will", "would", "must", "ought",
    "not", "no", "nor", "neither", "none", "never", "without",
    "and", "or", "but", "if", "while", "although", "though", "because", "since", "so", "than",
    "as", "at", "by", "for", "from", "in", "into", "of", "off", "on", "out", "over", "through", "to", "toward",
    "under", "until", "up", "upon", "with", "within", "between", "before", "after", "during", "about", "around",
    "also", "very", "too", "just", "even", "still", "yet", "already", "here", "there", "when", "where",
    "how", "why", "once", "then", "ever", "always", "often", "sometimes", "rarely", "seldom",
    "all", "any", "both", "each", "either", "few", "many", "much", "more", "most", "some", "several",
    "various", "enough", "less", "least", "lot", "lots", "plenty",
    "now", "today", "tonight", "tomorrow", "yesterday", "ago", "soon", "later",
    "therefore", "thus", "however", "hence", "meanwhile",
    "okay", "ok", "yes", "no", "well", "oh", "hey", "hi", "hello", "thanks", "thank", "please",
    "like", "just", "really", "actually", "basically", "literally",
    "aren't", "isn't", "wasn't", "weren't", "hasn't", "haven't", "hadn't",
    "doesn't", "don't", "didn't", "won't", "wouldn't", "can't", "couldn't", "shouldn't", "mustn't",
    "i'm", "you're", "he's", "she's", "it's", "we're", "they're",
    "i've", "you've", "we've", "they've",
    "i'll", "you'll", "he'll", "she'll", "we'll", "they'll",
    "i'd", "you'd", "he'd", "she'd", "we'd", "they'd",
    "that's", "there's", "what's", "who's", "where's", "when's", "why's", "how's",
    "let's", "ain't", "y'all", "ya", "gonna", "wanna", "gotta",
    "own", "same", "else", "per", "via", "etc", "among", "beside", "despite",
    "towards", "regarding", "including", "excluding", "whether", "whichever", "whatever",
}


In [None]:
#stopwords removal
def preprocess(word: str):
    w = re.sub(r"[^a-z]", "", word.lower())
    if len(w) > 1 and w not in stopwords:
        return w
    return None


#read all 1239 documents & append them in list "document"
def load_and_preprocess_documents():
    documents = {}

    for doc_id in range(1, 1240):  # 1..1239
        file_path = os.path.join(docs_folder, f"{doc_id:05d}")
        if os.path.exists(file_path):
            with open(file_path, "r", encoding="utf-8") as f:
                words = [line.strip().lower() for line in f if line.strip()]
        else:
            words = []

        cleaned_words = []
        for w in words:
            p = preprocess(w)
            if p:
                cleaned_words.append(p)
        documents[doc_id] = cleaned_words

    print("Διαβάστηκαν", len(documents), "έγγραφα.")

    doc_ids = []
    texts = []
    for doc_id in sorted(documents.keys()):
        tokens = documents[doc_id]
        text = " ".join(tokens)
        doc_ids.append(doc_id)
        texts.append(text)

    print("Δημιουργήθηκαν", len(doc_ids), "κείμενα για TF-IDF.")
    return doc_ids, texts


#relevant.txt lines loaded in "all_rels" list
def load_all_relevant_lists():
    all_rels = []
    if not os.path.exists(relevant_file_path):
        print("File not found:", relevant_file_path)
        return all_rels

    with open(relevant_file_path, "r", encoding="utf-8") as f:
        lines = [line.strip().lower() for line in f if line.strip()]

    for line in lines:
        words = line.split()
        nums = [int(x) for x in words]
        all_rels.append(nums)

    print("Loaded", len(all_rels), "relevance sets from Relevant.txt")
    return all_rels

In [6]:
def TP(list_a, list_b):
    set_a = set(list_a)
    set_b = set(list_b)
    return len(set_a.intersection(set_b))


def FP(list_a, list_b):
    set_a = set(list_a)
    set_b = set(list_b)
    return len(set_a.difference(set_b))


def FN(list_a, list_b):
    set_a = set(list_a)
    set_b = set(list_b)
    return len(set_b.difference(set_a))


def precision(a, b):
    tp = TP(a, b)
    fp = FP(a, b)
    return tp / (tp + fp) if (tp + fp) else 0.0


def recall(a, b):
    tp = TP(a, b)
    fn = FN(a, b)
    return tp / (tp + fn) if (tp + fn) else 0.0


def precision_at_k(list_a, list_b, k):
    if k <= 0:
        return 0.0
    return TP(list_a[:k], list_b) / k


def F1_SCORE(list_a, list_b):
    p = precision(list_a, list_b)
    r = recall(list_a, list_b)
    return 0.0 if (p + r) == 0 else 2 * p * r / (p + r)


def average_precision(ranked, relevant):
    rel = set(relevant)
    if not rel:
        return 0.0
    tp = 0
    ap = 0.0
    for i, d in enumerate(ranked, 1):
        if d in rel:
            tp += 1
            ap += tp / i
    return ap / len(rel)


def f1_at_k(ranked, relevant, k):
    rel = set(relevant)
    k = min(k, len(ranked))
    if k == 0:
        return 0.0
    topk = ranked[:k]
    tp = sum(1 for d in topk if d in rel)
    p = tp / k
    r = tp / len(rel) if rel else 0.0
    return 0.0 if (p + r) == 0 else 2 * p * r / (p + r)


In [None]:
#vectorizer building using sklearn livrary
def build_vectorizer(params):
    return TfidfVectorizer(
        ngram_range=params["ngram_range"],
        sublinear_tf=params["sublinear_tf"],
        min_df=params["min_df"],
        max_df=params["max_df"],
        norm=params["norm"],
    )

#vectorizing document set & query of the user --> finding the cos. similarity using "metrics" package from sklearn
def rank_documents_with_params(params, query_text, doc_ids, texts):
    vectorizer = build_vectorizer(params)

    X_docs = vectorizer.fit_transform(texts)
    X_query = vectorizer.transform([query_text])

    sims = cosine_similarity(X_query, X_docs)[0]
    ranked_indices = sims.argsort()[::-1]

    ranked_doc_ids = [doc_ids[i] for i in ranked_indices]
    ranked_scores = [sims[i] for i in ranked_indices]
    return ranked_doc_ids, ranked_scores


#run the above function and return all metrics as defined in the previous jupyter cell
def evaluate_single_query_with_params(params, query_text, relevant_docs, doc_ids, texts):
    if not relevant_docs:
        return None

    k = len(relevant_docs)

    ranked_doc_ids, _ = rank_documents_with_params(params, query_text, doc_ids, texts)

    ranked = ranked_doc_ids

    p_at_k = precision_at_k(ranked, relevant_docs, k)
    p = precision(ranked, relevant_docs)
    r = recall(ranked, relevant_docs)
    f1 = F1_SCORE(ranked, relevant_docs)
    ap = average_precision(ranked, relevant_docs)
    f1_k = f1_at_k(ranked, relevant_docs, k)

    return {
        "k": k,
        "precision_at_k": p_at_k,
        "precision": p,
        "recall": r,
        "f1": f1,
        "f1_at_k": f1_k,
        "average_precision": ap,
        "ranked": ranked,
    }


In [None]:
# Grid search across ALL queries: optimize avg F1@k
def grid_search_all_queries():
    doc_ids, texts = load_and_preprocess_documents()

    all_rels = load_all_relevant_lists()
    if not all_rels:
        print("No relevance data. Stopping.")
        return

    num_queries = min(len(QUERIES), len(all_rels))
    if len(QUERIES) != len(all_rels):
        print(
            f"Warning: {len(QUERIES)} queries but {len(all_rels)} relevance lines. "
            f"Using first {num_queries} pairs."
        )

    ngram_ranges = [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3)]
    sublinear_tfs = [False, True]
    min_dfs = [1, 2, 5]
    max_dfs = [0.7, 0.85, 1.0]
    norms = ["l2", "l1", None]

    param_grid = list(product(ngram_ranges, sublinear_tfs, min_dfs, max_dfs, norms))
    print("Σύνολο συνδυασμών:", len(param_grid))

    best_avg_F1_k = -1.0
    best_params = None

    for (ngram_range, sublinear_tf, min_df, max_df, norm) in param_grid:
        if isinstance(min_df, int) and min_df > len(texts):
            continue
        if isinstance(max_df, float) and isinstance(min_df, float) and max_df < min_df:
            continue

        params = {
            "ngram_range": ngram_range,
            "sublinear_tf": sublinear_tf,
            "min_df": min_df,
            "max_df": max_df,
            "norm": norm,
        }

        sum_f1_k = 0.0
        valid_count = 0

        for i in range(num_queries):
            query_text = QUERIES[i]
            relevant_docs = all_rels[i]

            try:
                metrics = evaluate_single_query_with_params(
                    params, query_text, relevant_docs, doc_ids, texts
                )
            except ValueError:
                metrics = None

            if metrics is None:
                continue

            sum_f1_k += metrics["f1_at_k"]
            valid_count += 1

        if valid_count == 0:
            continue

        avg_f1_k = sum_f1_k / valid_count

        print(f"params={params} -> avg F1@k over {valid_count} queries = {avg_f1_k:.4f}")

        if avg_f1_k > best_avg_F1_k:
            best_avg_F1_k = avg_f1_k
            best_params = params

    if best_params is None:
        print("No valid parameter combination found.")
        return

    print("\n==============================")
    print("BEST PARAMETERS (by avg F1@k):")
    print(best_params)
    print(f"Best average F1@k: {best_avg_F1_k:.4f}")

    print("\nPer-query metrics for best params:")
    doc_ids, texts = load_and_preprocess_documents()
    all_rels = load_all_relevant_lists()
    num_queries = min(len(QUERIES), len(all_rels))

    for i in range(num_queries):
        query_text = QUERIES[i]
        relevant_docs = all_rels[i]
        metrics = evaluate_single_query_with_params(
            best_params, query_text, relevant_docs, doc_ids, texts
        )

        if metrics is None:
            print(f"\nQuery #{i+1}: no relevant docs.")
            continue

        k = metrics["k"]
        print(f"\nQuery #{i+1}")
        print("Text:", query_text)
        print("k =", k)
        print(f"Precision@{k} =", metrics["precision_at_k"])
        print("Precision      =", metrics["precision"])
        print("Recall         =", metrics["recall"])
        print("F1 (global)    =", metrics["f1"])
        print(f"F1@{k}         =", metrics["f1_at_k"])
        print("AveragePrec    =", metrics["average_precision"])

In [9]:
#Grid Search on all 270 options
if __name__ == "__main__":
    grid_search_all_queries()


Διαβάστηκαν/προεπεξεργάστηκαν 1239 έγγραφα.
Δημιουργήθηκαν 1239 κείμενα για TF-IDF.
Loaded 20 relevance sets from Relevant.txt
Σύνολο συνδυασμών: 270
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.7, 'norm': 'l2'} -> avg F1@k over 20 queries = 0.2919
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.7, 'norm': 'l1'} -> avg F1@k over 20 queries = 0.2919
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.7, 'norm': None} -> avg F1@k over 20 queries = 0.2919
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.85, 'norm': 'l2'} -> avg F1@k over 20 queries = 0.2919
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.85, 'norm': 'l1'} -> avg F1@k over 20 queries = 0.2919
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.85, 'norm': None} -> avg F1@k over 20 queries = 0.2919
params={'ngram_range': (1, 1), 'sublinear_tf': Fa

In [None]:
# Grid search only in best params
def grid_search_all_queries():
    doc_ids, texts = load_and_preprocess_documents()

    all_rels = load_all_relevant_lists()
    if not all_rels:
        print("No relevance data. Stopping.")
        return

    num_queries = min(len(QUERIES), len(all_rels))
    if len(QUERIES) != len(all_rels):
        print(
            f"Warning: {len(QUERIES)} queries but {len(all_rels)} relevance lines. "
            f"Using first {num_queries} pairs."
        )

    ngram_ranges = [(1, 1)]
    sublinear_tfs = [True]
    min_dfs = [1]
    max_dfs = [0.7]
    norms = ["l2"]

    param_grid = list(product(ngram_ranges, sublinear_tfs, min_dfs, max_dfs, norms))
    print("Σύνολο συνδυασμών:", len(param_grid))

    best_avg_F1_k = -1.0
    best_params = None

    # 4. Iterate parameter combinations
    for (ngram_range, sublinear_tf, min_df, max_df, norm) in param_grid:
        if isinstance(min_df, int) and min_df > len(texts):
            continue
        if isinstance(max_df, float) and isinstance(min_df, float) and max_df < min_df:
            continue

        params = {
            "ngram_range": ngram_range,
            "sublinear_tf": sublinear_tf,
            "min_df": min_df,
            "max_df": max_df,
            "norm": norm,
        }

        sum_f1_k = 0.0
        valid_count = 0

        # Evaluate across all queries
        for i in range(num_queries):
            query_text = QUERIES[i]
            relevant_docs = all_rels[i]

            try:
                metrics = evaluate_single_query_with_params(
                    params, query_text, relevant_docs, doc_ids, texts
                )
            except ValueError:
                # e.g. empty vocabulary
                metrics = None

            if metrics is None:
                continue

            sum_f1_k += metrics["f1_at_k"]
            valid_count += 1

        if valid_count == 0:
            continue

        avg_f1_k = sum_f1_k / valid_count

        print(f"params={params} -> avg F1@k over {valid_count} queries = {avg_f1_k:.4f}")

        if avg_f1_k > best_avg_F1_k:
            best_avg_F1_k = avg_f1_k
            best_params = params

    if best_params is None:
        print("No valid parameter combination found.")
        return

    print("\n==============================")
    print("BEST PARAMETERS (by avg F1@k):")
    print(best_params)
    print(f"Best average F1@k: {best_avg_F1_k:.4f}")

    print("\nPer-query metrics for best params:")
    doc_ids, texts = load_and_preprocess_documents()
    all_rels = load_all_relevant_lists()
    num_queries = min(len(QUERIES), len(all_rels))

    for i in range(num_queries):
        query_text = QUERIES[i]
        relevant_docs = all_rels[i]
        metrics = evaluate_single_query_with_params(
            best_params, query_text, relevant_docs, doc_ids, texts
        )

        if metrics is None:
            print(f"\nQuery #{i+1}: no relevant docs.")
            continue

        k = metrics["k"]
        print(f"\nQuery #{i+1}")
        print("Text:", query_text)
        print("k =", k)
        print(f"Precision@{k} =", metrics["precision_at_k"])
        print("Precision      =", metrics["precision"])
        print("Recall         =", metrics["recall"])
        print("F1 (global)    =", metrics["f1"])
        print(f"F1@{k}         =", metrics["f1_at_k"])
        print("AveragePrec    =", metrics["average_precision"])

In [11]:
if __name__ == "__main__":
    grid_search_all_queries()

Διαβάστηκαν/προεπεξεργάστηκαν 1239 έγγραφα.
Δημιουργήθηκαν 1239 κείμενα για TF-IDF.
Loaded 20 relevance sets from Relevant.txt
Σύνολο συνδυασμών: 1
params={'ngram_range': (1, 1), 'sublinear_tf': True, 'min_df': 1, 'max_df': 0.7, 'norm': 'l2'} -> avg F1@k over 20 queries = 0.3006

BEST PARAMETERS (by avg F1@k):
{'ngram_range': (1, 1), 'sublinear_tf': True, 'min_df': 1, 'max_df': 0.7, 'norm': 'l2'}
Best average F1@k: 0.3006

Per-query metrics for best params:
Διαβάστηκαν/προεπεξεργάστηκαν 1239 έγγραφα.
Δημιουργήθηκαν 1239 κείμενα για TF-IDF.
Loaded 20 relevance sets from Relevant.txt

Query #1
Text: How effective are inhalations of mucolytic agents in the treatment of CF patients
k = 26
Precision@26 = 0.2692307692307692
Precision      = 0.020984665052461663
Recall         = 1.0
F1 (global)    = 0.04110671936758893
F1@26         = 0.2692307692307692
AveragePrec    = 0.22746615788058552

Query #2
Text: What is the role of aerosols in the treatment of lung disease in CF patients
k = 60
Prec

In [21]:
# Grid search across ALL queries: optimize avg Recall
def grid_search_all_queries():
    doc_ids, texts = load_and_preprocess_documents()

    all_rels = load_all_relevant_lists()
    if not all_rels:
        print("No relevance data. Stopping.")
        return

    num_queries = min(len(QUERIES), len(all_rels))
    if len(QUERIES) != len(all_rels):
        print(
            f"Warning: {len(QUERIES)} queries but {len(all_rels)} relevance lines. "
            f"Using first {num_queries} pairs."
        )

    ngram_ranges = [(1, 1), (1, 2), (2, 2), (1, 3), (2, 3)]
    sublinear_tfs = [False, True]
    min_dfs = [1, 2, 5]
    max_dfs = [0.7, 0.85, 1.0]
    norms = ["l2", "l1", None]

    param_grid = list(product(ngram_ranges, sublinear_tfs, min_dfs, max_dfs, norms))
    print("Σύνολο συνδυασμών:", len(param_grid))

    best_avg_precision = -1.0
    best_params = None

    for (ngram_range, sublinear_tf, min_df, max_df, norm) in param_grid:
        if isinstance(min_df, int) and min_df > len(texts):
            continue
        if isinstance(max_df, float) and isinstance(min_df, float) and max_df < min_df:
            continue

        params = {
            "ngram_range": ngram_range,
            "sublinear_tf": sublinear_tf,
            "min_df": min_df,
            "max_df": max_df,
            "norm": norm,
        }

        sum_precision = 0.0
        valid_count = 0

        # Evaluate across all queries
        for i in range(num_queries):
            query_text = QUERIES[i]
            relevant_docs = all_rels[i]

            try:
                metrics = evaluate_single_query_with_params(
                    params, query_text, relevant_docs, doc_ids, texts
                )
            except ValueError:
                metrics = None

            if metrics is None:
                continue

            sum_precision += metrics["average_precision"]
            valid_count += 1

        if valid_count == 0:
            continue

        avg_precision = sum_precision / valid_count

        print(f"params={params} -> avg precision over {valid_count} queries = {avg_precision:.4f}")

        if avg_precision > best_avg_precision:
            best_avg_precision = avg_precision
            best_params = params

    if best_params is None:
        print("No valid parameter combination found.")
        return

    print("\n==============================")
    print("BEST PARAMETERS (by avg precision:")
    print(best_params)
    print(f"Best average recall: {best_avg_precision:.4f}")

    print("\nPer-query metrics for best params:")
    doc_ids, texts = load_and_preprocess_documents()
    all_rels = load_all_relevant_lists()
    num_queries = min(len(QUERIES), len(all_rels))

    for i in range(num_queries):
        query_text = QUERIES[i]
        relevant_docs = all_rels[i]
        metrics = evaluate_single_query_with_params(
            best_params, query_text, relevant_docs, doc_ids, texts
        )

        if metrics is None:
            print(f"\nQuery #{i+1}: no relevant docs.")
            continue

        k = metrics["k"]
        print(f"\nQuery #{i+1}")
        print("Text:", query_text)
        print("k =", k)
        print(f"Precision@{k} =", metrics["precision_at_k"])
        print("Precision      =", metrics["precision"])
        print("Recall         =", metrics["recall"])
        print("F1 (global)    =", metrics["f1"])
        print(f"F1@{k}         =", metrics["f1_at_k"])
        print("AveragePrec    =", metrics["average_precision"])

In [22]:
if __name__ == "__main__":
    grid_search_all_queries()

Διαβάστηκαν/προεπεξεργάστηκαν 1239 έγγραφα.
Δημιουργήθηκαν 1239 κείμενα για TF-IDF.
Loaded 20 relevance sets from Relevant.txt
Σύνολο συνδυασμών: 270
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.7, 'norm': 'l2'} -> avg precision over 20 queries = 0.2656
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.7, 'norm': 'l1'} -> avg precision over 20 queries = 0.2656
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.7, 'norm': None} -> avg precision over 20 queries = 0.2656
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.85, 'norm': 'l2'} -> avg precision over 20 queries = 0.2656
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.85, 'norm': 'l1'} -> avg precision over 20 queries = 0.2656
params={'ngram_range': (1, 1), 'sublinear_tf': False, 'min_df': 1, 'max_df': 0.85, 'norm': None} -> avg precision over 20 queries = 0.2656
params={'ngram_rang