In [3]:
import json
import os

# Create a sample dataset in JSON format
data_dir = "datasets"
os.makedirs(data_dir, exist_ok=True)

# Sample dataset to simulate the Q&A setup
sample_data = {
    "corpus": {
        "doc1": "What is artificial intelligence?",
        "doc2": "Explain machine learning and its applications.",
        "doc3": "How does deep learning differ from traditional machine learning?"
    },
    "queries": [
        "What is AI?",
        "What are machine learning applications?",
        "Difference between deep learning and machine learning"
    ]
}

# Save sample data
dataset_path = os.path.join(data_dir, "fiqa.json")
with open(dataset_path, "w") as f:
    json.dump(sample_data, f)

# Function to load dataset
def load_dataset(data_dir, dataset):
    with open(os.path.join(data_dir, dataset), "r") as f:
        data = json.load(f)
    corpus = data["corpus"]
    queries = data["queries"]
    return corpus, queries

corpus, queries = load_dataset(data_dir, "fiqa.json")


In [5]:
!pip install sentence-transformers


Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
     ------------------------------------ 245.3/245.3 kB 791.4 kB/s eta 0:00:00
Collecting huggingface-hub>=0.19.3
  Downloading huggingface_hub-0.25.1-py3-none-any.whl (436 kB)
     -------------------------------------- 436.4/436.4 kB 2.3 MB/s eta 0:00:00
Collecting transformers<5.0.0,>=4.38.0
  Downloading transformers-4.45.1-py3-none-any.whl (9.9 MB)
     ---------------------------------------- 9.9/9.9 MB 1.8 MB/s eta 0:00:00
Collecting fsspec>=2023.5.0
  Downloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
     -------------------------------------- 179.3/179.3 kB 1.4 MB/s eta 0:00:00
Collecting safetensors>=0.4.1
  Downloading safetensors-0.4.5-cp310-none-win_amd64.whl (285 kB)
     -------------------------------------- 285.9/285.9 kB 2.2 MB/s eta 0:00:00
Collecting tokenizers<0.21,>=0.20
  Downloading tokenizers-0.20.0-cp310-none-win_amd64.whl (2.3 MB)
     -------------------

In [7]:
from sentence_transformers import SentenceTransformer

# Load embedding models
embedding_model_small = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embedding_model_large = SentenceTransformer('sentence-transformers/all-distilroberta-v1')

# Example corpus
corpus = {
    1: "The quick brown fox jumps over the lazy dog.",
    2: "I love machine learning.",
    3: "Transformers are very powerful for NLP tasks."
}

# Encode corpus using embedding models
corpus_embeddings_small = embedding_model_small.encode(list(corpus.values()), convert_to_tensor=True)
corpus_embeddings_large = embedding_model_large.encode(list(corpus.values()), convert_to_tensor=True)

print("Embeddings using the small model:")
print(corpus_embeddings_small)

print("\nEmbeddings using the large model:")
print(corpus_embeddings_large)



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embeddings using the small model:
tensor([[ 0.0439,  0.0589,  0.0482,  ...,  0.0522,  0.0561,  0.1021],
        [-0.0169, -0.0707,  0.0855,  ...,  0.0989,  0.0128, -0.0849],
        [-0.0727, -0.0169,  0.0228,  ...,  0.0767,  0.0406,  0.0370]])

Embeddings using the large model:
tensor([[-0.0321,  0.0386, -0.0312,  ...,  0.0085, -0.0138,  0.0431],
        [-0.0031, -0.0556, -0.0267,  ...,  0.0325, -0.0649, -0.0093],
        [ 0.0010, -0.0345, -0.0188,  ..., -0.0308, -0.0089,  0.0098]])


In [10]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load cross-encoder reranking models
reranker_small = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-MiniLM-L-12-v2')
reranker_large = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-TinyBERT-L-2-v2')

tokenizer_small = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-MiniLM-L-12-v2')
tokenizer_large = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-TinyBERT-L-2-v2')

# Sample corpus and queries
corpus = {
    'doc1': "This is the first document.",
    'doc2': "This document is the second document.",
    'doc3': "And this is the third one.",
    'doc4': "Is this the first document?"
}
queries = ["first document", "second document"]

# Simulated retrieval function (replace this with your actual embedding retrieval)
def simulate_retrieval(corpus, queries):
    # Simulating retrieval by returning document IDs for each query
    # Each query gets the top 2 documents (for demonstration)
    return [
        [{'corpus_id': 0}, {'corpus_id': 3}],  # For "first document"
        [{'corpus_id': 1}, {'corpus_id': 2}]   # For "second document"
    ]

# Step 1: Candidate Retrieval
retrieved_small = simulate_retrieval(corpus, queries)
retrieved_large = simulate_retrieval(corpus, queries)

def rerank(corpus, queries, retrieved, reranker, tokenizer):
    reranked_results = []
    
    for query, hits in zip(queries, retrieved):
        reranked_scores = []
        for hit in hits:  # Each hit has 'corpus_id'
            passage = corpus[f'doc{hit["corpus_id"] + 1}']  # Corpus starts with doc1
            inputs = tokenizer(query, passage, return_tensors="pt", truncation=True)
            outputs = reranker(**inputs)
            score = outputs.logits.item()
            reranked_scores.append((hit['corpus_id'], score))
        
        # Sort based on score
        reranked_results.append(sorted(reranked_scores, key=lambda x: x[1], reverse=True))
    
    return reranked_results

# Step 2: Rerank using small and large models
reranked_small = rerank(corpus, queries, retrieved_small, reranker_small, tokenizer_small)
reranked_large = rerank(corpus, queries, retrieved_large, reranker_large, tokenizer_large)

print("Reranked results using the small model:", reranked_small)
print("Reranked results using the large model:", reranked_large)

Reranked results using the small model: [[(0, 8.202035903930664), (3, 6.538022041320801)], [(1, 8.006841659545898), (2, -6.739014625549316)]]
Reranked results using the large model: [[(0, 9.328752517700195), (3, 8.882637977600098)], [(1, 9.346213340759277), (2, -10.456615447998047)]]


In [11]:
import numpy as np

# Calculate NDCG@10
def calculate_ndcg(reranked_results, true_relevant_docs, k=10):
    def dcg(relevances):
        return sum([rel / np.log2(idx + 2) for idx, rel in enumerate(relevances)])
    
    def ideal_dcg(relevant_docs):
        return dcg([1] * min(k, len(relevant_docs)))
    
    ndcg_scores = []
    for i, reranked in enumerate(reranked_results):
        relevances = [1 if doc_id in true_relevant_docs[i] else 0 for doc_id, _ in reranked[:k]]
        ndcg_scores.append(dcg(relevances) / ideal_dcg(true_relevant_docs[i]))
    
    return np.mean(ndcg_scores)

# Assuming true_relevant_docs is the ground truth mapping
true_relevant_docs = [
    [0],  # For "What is AI?" -> doc1 is relevant
    [1],  # For "What are ML applications?" -> doc2 is relevant
    [2]   # For "Difference between DL and ML?" -> doc3 is relevant
]

ndcg_small = calculate_ndcg(reranked_small, true_relevant_docs, k=10)
ndcg_large = calculate_ndcg(reranked_large, true_relevant_docs, k=10)

print(f"NDCG@10 (Small Model): {ndcg_small}")
print(f"NDCG@10 (Large Model): {ndcg_large}")


NDCG@10 (Small Model): 1.0
NDCG@10 (Large Model): 1.0


In [None]:
#text_retrieval_assignment