In [1]:
# IR Comparison: TF-IDF + Cosine Similarity vs BM25
# Author: Damascus University IR Practical Lab

# --------------------------
# 1. Setup
# --------------------------

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from rank_bm25 import BM25Okapi
from IPython.display import display, Markdown, HTML

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)
pd.set_option('display.max_columns', None)

# --------------------------
# 2. Documents and Queries
# --------------------------

documents = [
    "Cloud storage is useful for syncing files across devices.",
    "Cloud storage cloud storage cloud storage. Many services like Dropbox, Google Drive, and OneDrive provide cloud storage features.",
    "Files can be kept online and accessed remotely from any location.",
    "Voice assistants like Siri are powered by AI algorithms.",
    "AI is changing the way we interact with technology.",
    "AI is AI is AI is AI. Everything is AI nowadays.",
    "Developers use GitHub to manage cloud-hosted projects.",
    "Cloud APIs let apps sync across devices.",
    "Smart assistants combine voice input and AI processing.",
    "Store files in the cloud and use them on mobile apps."
]

queries = ["cloud storage", "ai assistant"]

# --------------------------
# 3. TF-IDF + Cosine Similarity
# --------------------------

def tfidf_cosine_search(query, docs):
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(docs)
    query_vec = vectorizer.transform([query])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()
    ranked_indices = cosine_sim.argsort()[::-1]
    return [(i, cosine_sim[i]) for i in ranked_indices]

# --------------------------
# 4. BM25 with adjustable parameters
# --------------------------

def bm25_search(query, docs, k1=1.5, b=0.75):
    tokenized_docs = [doc.lower().split() for doc in docs]
    bm25 = BM25Okapi(tokenized_docs, k1=k1, b=b)
    tokenized_query = query.lower().split()
    scores = bm25.get_scores(tokenized_query)
    ranked_indices = np.argsort(scores)[::-1]
    return [(i, scores[i]) for i in ranked_indices]

# --------------------------
# 5. Run and Compare
# --------------------------

def compare_models(query):
    print(f"\n### Query: '{query}'\n")

    tfidf_results = tfidf_cosine_search(query, documents)
    bm25_results = bm25_search(query, documents, k1=1.5, b=0.75)

    df = pd.DataFrame({
        "Rank": list(range(1, len(documents)+1)),
        "TF-IDF Doc": [documents[i] for i, _ in tfidf_results],
        "TF-IDF Score": [round(score, 3) for _, score in tfidf_results],
        "BM25 Doc": [documents[i] for i, _ in bm25_results],
        "BM25 Score": [round(score, 3) for _, score in bm25_results]
    })

    display(HTML(df.to_html(notebook=True)))


In [2]:

# --------------------------
# 6. Try with Example Queries
# --------------------------

for q in queries:
    compare_models(q)

    



### Query: 'cloud storage'



Unnamed: 0,Rank,TF-IDF Doc,TF-IDF Score,BM25 Doc,BM25 Score
0,1,"Cloud storage cloud storage cloud storage. Many services like Dropbox, Google Drive, and OneDrive provide cloud storage features.",0.808,"Cloud storage cloud storage cloud storage. Many services like Dropbox, Google Drive, and OneDrive provide cloud storage features.",2.274
1,2,Cloud storage is useful for syncing files across devices.,0.403,Cloud storage is useful for syncing files across devices.,1.666
2,3,Cloud APIs let apps sync across devices.,0.145,Cloud APIs let apps sync across devices.,0.425
3,4,Developers use GitHub to manage cloud-hosted projects.,0.128,Store files in the cloud and use them on mobile apps.,0.352
4,5,Store files in the cloud and use them on mobile apps.,0.117,Smart assistants combine voice input and AI processing.,0.0
5,6,Smart assistants combine voice input and AI processing.,0.0,Developers use GitHub to manage cloud-hosted projects.,0.0
6,7,AI is AI is AI is AI. Everything is AI nowadays.,0.0,AI is AI is AI is AI. Everything is AI nowadays.,0.0
7,8,AI is changing the way we interact with technology.,0.0,AI is changing the way we interact with technology.,0.0
8,9,Voice assistants like Siri are powered by AI algorithms.,0.0,Voice assistants like Siri are powered by AI algorithms.,0.0
9,10,Files can be kept online and accessed remotely from any location.,0.0,Files can be kept online and accessed remotely from any location.,0.0



### Query: 'ai assistant'



Unnamed: 0,Rank,TF-IDF Doc,TF-IDF Score,BM25 Doc,BM25 Score
0,1,AI is AI is AI is AI. Everything is AI nowadays.,0.708,AI is AI is AI is AI. Everything is AI nowadays.,0.655
1,2,Smart assistants combine voice input and AI processing.,0.263,Smart assistants combine voice input and AI processing.,0.404
2,3,Voice assistants like Siri are powered by AI algorithms.,0.24,AI is changing the way we interact with technology.,0.385
3,4,AI is changing the way we interact with technology.,0.238,Voice assistants like Siri are powered by AI algorithms.,0.385
4,5,Store files in the cloud and use them on mobile apps.,0.0,Store files in the cloud and use them on mobile apps.,0.0
5,6,Cloud APIs let apps sync across devices.,0.0,Cloud APIs let apps sync across devices.,0.0
6,7,Developers use GitHub to manage cloud-hosted projects.,0.0,Developers use GitHub to manage cloud-hosted projects.,0.0
7,8,Files can be kept online and accessed remotely from any location.,0.0,Files can be kept online and accessed remotely from any location.,0.0
8,9,"Cloud storage cloud storage cloud storage. Many services like Dropbox, Google Drive, and OneDrive provide cloud storage features.",0.0,"Cloud storage cloud storage cloud storage. Many services like Dropbox, Google Drive, and OneDrive provide cloud storage features.",0.0
9,10,Cloud storage is useful for syncing files across devices.,0.0,Cloud storage is useful for syncing files across devices.,0.0


In [3]:

# --------------------------
# 7. Show Multi-word Query Edge Case
# --------------------------

multiword_docs = [
    "AI is transforming everything.",
    "Voice assistants respond using artificial intelligence.",
    "Assistants can help with reminders.",
    "Artificial Intelligence powers assistant features.",
    "This doc says nothing about ai or assistants."
]

multiword_query = "ai assistant"

def multiword_comparison():
    print("\n### Multi-word Query: 'ai assistant'\n")
    tfidf_res = tfidf_cosine_search(multiword_query, multiword_docs)
    bm25_res = bm25_search(multiword_query, multiword_docs)

    df = pd.DataFrame({
        "Rank": list(range(1, len(multiword_docs)+1)),
        "TF-IDF Doc": [multiword_docs[i] for i, _ in tfidf_res],
        "TF-IDF Score": [round(score, 3) for _, score in tfidf_res],
        "BM25 Doc": [multiword_docs[i] for i, _ in bm25_res],
        "BM25 Score": [round(score, 3) for _, score in bm25_res]
    })

    display(HTML(df.to_html(notebook=True)))

multiword_comparison()



### Multi-word Query: 'ai assistant'



Unnamed: 0,Rank,TF-IDF Doc,TF-IDF Score,BM25 Doc,BM25 Score
0,1,Artificial Intelligence powers assistant features.,0.375,Artificial Intelligence powers assistant features.,1.154
1,2,AI is transforming everything.,0.265,AI is transforming everything.,0.386
2,3,This doc says nothing about ai or assistants.,0.19,This doc says nothing about ai or assistants.,0.282
3,4,Assistants can help with reminders.,0.0,Assistants can help with reminders.,0.0
4,5,Voice assistants respond using artificial intelligence.,0.0,Voice assistants respond using artificial intelligence.,0.0


In [4]:

# --------------------------
# 8. BM25 Parameter Variants
# --------------------------

print("\n\n### BM25 with Different Parameters for Query: 'cloud storage'\n")
for k1_val in [0.2, 0.5, 1.5, 2.0]:
    for b_val in [0.0, 0.25, 0.75, 1.0]:
        print(f"\n-- BM25 k1={k1_val}, b={b_val} --")
        bm25_results = bm25_search("cloud storage", documents, k1=k1_val, b=b_val)
        for rank, (i, score) in enumerate(bm25_results[:3], 1):
            print(f"{rank}. Doc {i} | Score: {round(score, 3)} | {documents[i][:80]}...")




### BM25 with Different Parameters for Query: 'cloud storage'


-- BM25 k1=0.2, b=0.0 --
1. Doc 1 | Score: 1.797 | Cloud storage cloud storage cloud storage. Many services like Dropbox, Google Dr...
2. Doc 0 | Score: 1.592 | Cloud storage is useful for syncing files across devices....
3. Doc 9 | Score: 0.368 | Store files in the cloud and use them on mobile apps....

-- BM25 k1=0.2, b=0.25 --
1. Doc 1 | Score: 1.776 | Cloud storage cloud storage cloud storage. Many services like Dropbox, Google Dr...
2. Doc 0 | Score: 1.598 | Cloud storage is useful for syncing files across devices....
3. Doc 7 | Score: 0.372 | Cloud APIs let apps sync across devices....

-- BM25 k1=0.2, b=0.75 --
1. Doc 1 | Score: 1.736 | Cloud storage cloud storage cloud storage. Many services like Dropbox, Google Dr...
2. Doc 0 | Score: 1.612 | Cloud storage is useful for syncing files across devices....
3. Doc 7 | Score: 0.382 | Cloud APIs let apps sync across devices....

-- BM25 k1=0.2, b=1.0 --
1. Doc 1 | Scor