In [14]:
# Import necessary libraries
import pandas as pd
import numpy as np
import pandas as pd
import pickle
import json
from InstructorEmbedding import INSTRUCTOR
import gzip
import os
import torch

In [2]:
# Read the data from a CSV file and select relevant columns
df_queries = pd.read_csv('antique_queries.csv')
df_queries = df_queries[['query_id','text']]

df_docs = pd.read_csv('antique_data.csv')
df_docs = df_docs[['doc_id','text']]

df_qrel = pd.read_csv('antique_qrels.csv')
df_qrel = df_qrel[['query_id','doc_id','relevence']]

In [3]:
# Merge the relevance data with the document data based on doc_id
merged_df = df_qrel.merge(df_docs, on='doc_id', how='left')

In [5]:
def map_values(value):
    if value in [3, 4]:
        return 1
    else:
        return 0
# Apply the mapping function to create binary labels and add to the relevance data
df_qrel['label'] = df_qrel['relevence'].apply(map_values)

# Drop duplicate entries based on query_id and doc_id in the relevance data
df_qrel1 = df_qrel.drop_duplicates(subset=['query_id','doc_id'])
df_qrel1

Unnamed: 0,query_id,doc_id,relevence,label
0,1964316,1964316_5,4,1
1,1964316,1674088_11,1,0
2,1964316,1218838_13,2,0
3,1964316,1519022_15,2,0
4,1964316,3059341_5,2,0
...,...,...,...,...
6584,1262692,247023_6,3,1
6585,1262692,1499030_5,3,1
6586,1262692,2916758_0,3,1
6587,1262692,1105845_15,3,1


In [6]:
df_qrel1 = df_qrel1.merge(df_queries, on='query_id', how='left')


In [7]:
df_qrel1.head()

Unnamed: 0,query_id,doc_id,relevence,label,text
0,1964316,1964316_5,4,1,"What do you mean by ""weed""?"
1,1964316,1674088_11,1,0,"What do you mean by ""weed""?"
2,1964316,1218838_13,2,0,"What do you mean by ""weed""?"
3,1964316,1519022_15,2,0,"What do you mean by ""weed""?"
4,1964316,3059341_5,2,0,"What do you mean by ""weed""?"


In [8]:
# Display information about the merged DataFrame
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6589 entries, 0 to 6588
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   query_id   6589 non-null   int64 
 1   doc_id     6589 non-null   object
 2   relevence  6589 non-null   int64 
 3   text       6589 non-null   object
dtypes: int64(2), object(2)
memory usage: 257.4+ KB


In [9]:
# Extract the 'doc_id' and 'text' columns from the merged DataFrame
df_text = merged_df[['doc_id','text']]

# Initialize an empty list to store passages
passages = []

# Iterate through each row in the 'df_text' DataFrame and append text to the 'passages' list
for index, row in df_text.iterrows():
    passages.append(str(row['text']))

In [10]:
# Display the first 5 passages
passages[0:5]

['Weed could mean the bad thing that grow in ur graden or back and front yard. Or it could mean the drug.',
 'sell weed',
 'My weed!!',
 'because we dont know what the hell to make legal in the US anymore....i mean we still have Bush in office, you would think that legalizing weed would be less harsh of the 2',
 'Its a weed.']

In [None]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('intfloat/e5-large-v2')
embeddings = model.encode(passages,convert_to_tensor=True,show_progress_bar=True)

In [22]:
import pickle
with open('corpus_embeddings_text.pickle', 'wb') as pkl:
    pickle.dump(embeddings, pkl)

In [12]:
# Import necessary libraries
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity


In [15]:
with open('corpus_embeddings_text.pickle', 'rb') as pkl:
    doc_embedding = pickle.load(pkl)

In [16]:

def fetch_relevant_documents(query_embedding, document_embeddings, top_k=10):
    # Calculate cosine similarity between the query embedding and document embeddings
    similarities = cosine_similarity(query_embedding.reshape(1, -1), document_embeddings)[0]
    
    # Sort document indices based on similarity scores (higher is better)
    sorted_indices = np.argsort(similarities)[::-1]
    
    # Get the top-k most relevant document indices and their similarity scores
    top_indices = sorted_indices[:top_k]
    top_similarity_scores = similarities[top_indices]
    
    return top_indices, top_similarity_scores

# Load embeddings from pickle files

for i in range(5):
    query_embed = model.encode(df_qrel1['text'][i],convert_to_tensor=True)
    relevant_document_indices, similarity_scores = fetch_relevant_documents(query_embed, doc_embedding)

    print(f"Most relevant documents for query {i+1}:", relevant_document_indices)
    print("Corresponding similarity scores:", similarity_scores)


Most relevant documents for query 1: [21  0  9  4  2  8 26  6 29 18]
Corresponding similarity scores: [0.85521793 0.8483332  0.84732294 0.8440726  0.8430238  0.83978856
 0.83952945 0.8394203  0.8377141  0.83636487]
Most relevant documents for query 2: [21  0  9  4  2  8 26  6 29 18]
Corresponding similarity scores: [0.85521793 0.8483332  0.84732294 0.8440726  0.8430238  0.83978856
 0.83952945 0.8394203  0.8377141  0.83636487]
Most relevant documents for query 3: [21  0  9  4  2  8 26  6 29 18]
Corresponding similarity scores: [0.85521793 0.8483332  0.84732294 0.8440726  0.8430238  0.83978856
 0.83952945 0.8394203  0.8377141  0.83636487]
Most relevant documents for query 4: [21  0  9  4  2  8 26  6 29 18]
Corresponding similarity scores: [0.85521793 0.8483332  0.84732294 0.8440726  0.8430238  0.83978856
 0.83952945 0.8394203  0.8377141  0.83636487]
Most relevant documents for query 5: [21  0  9  4  2  8 26  6 29 18]
Corresponding similarity scores: [0.85521793 0.8483332  0.84732294 0.84

In [64]:
def fetch_documents(query_embedding, document_embeddings, top_k=170):
    # Calculate cosine similarity between the query embedding and document embeddings
    similarities = cosine_similarity(query_embedding.reshape(1, -1), document_embeddings)[0]
    
    # Sort document indices based on similarity scores (higher is better)
    sorted_indices = np.argsort(similarities)[::-1]
    
    # Get the top-k most relevant document indices and their similarity scores
    top_indices = sorted_indices[:top_k]
    
    return top_indices

In [18]:
doc_ids = df_qrel1['doc_id'].tolist()

In [65]:
# Calculate recall for each query
def calculate_recall(query_id, qrel_df):
    relevant_documents = df_qrel1[(df_qrel1['query_id'] == query_ids[i]) & (df_qrel1['label'] == 1)]['doc_id'].tolist()
    #print(relevant_documents)
    query = df_qrel1[df_qrel1['query_id'] == query_ids[i]]['text'].iloc[0]
    #print(query)
    query_embed = model.encode(query,convert_to_tensor=True)
    retrieved_index = fetch_documents(query_embed, doc_embedding) 
    retrieved_doc_ids = [doc_ids[i] for i in retrieved_index]
    #print(retrieved_doc_ids)
    retrieved_relevant_documents = list(set(relevant_documents) & set(retrieved_doc_ids))
    #print(retrieved_relevant_documents)
    try:
        recall = len(retrieved_relevant_documents) / len(relevant_documents)
    except ZeroDivisionError:
        recall = 0.0  
    return recall

# List of unique query IDs
query_ids = df_qrel1['query_id'].unique()

total_recall = 0

for i in range(len(query_ids)):
    recall = calculate_recall(query_ids[i], df_qrel1)
    total_recall += recall

average_recall = total_recall / len(query_ids)
print("Average Recall:", average_recall)


Average Recall: 0.9075540009933921
