# Validation of BM25 and BERT
We created this seperate notebook for the validation so that the main project does not get too cluttered, also some changes to read in the files had to be done, which would clutter the main file


In [1]:
#imports
import pandas as pd
#from gpt4all import GPT4All
#import gpt4all
#path = "C:\Users\Jakob\Downloads\gpt4all-falcon-q4_0.gguf"

from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *

import nltk
nltk.download('stopwords')
nltk.download('punkt')

import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

import os
import requests
from zipfile import ZipFile

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Jakob\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jakob\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


# Create Validation Data


In [2]:
def download_and_extract(url, destination_folder, filename):

    # create folder if not exists
    os.makedirs(destination_folder, exist_ok=True)

    filename = os.path.join(destination_folder, filename)

    # check if file already exists
    if not os.path.exists(filename):
        # download file
        response = requests.get(url, stream=True)
        with open(filename, 'wb') as file:
            for chunk in response.iter_content(chunk_size=128):
                file.write(chunk)

    # extract the zip
    with ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(destination_folder)

In [3]:
# data set: https://ir-datasets.com/wikir.html
zip_file_url = "https://zenodo.org/record/3565761/files/wikIR1k.zip"
destination_folder = "./validationDataset/"
filename = "wikIR1k.zip"
download_and_extract(zip_file_url, destination_folder, filename)

In [4]:
def create_subset_validation_with_topic(documents_filename, qrels_filename, queries_filename, subset_docs_filename, subset_queries_filename, lines_per_file, topic):
    documents_df = pd.read_csv(documents_filename)
    qrels_df = pd.read_csv(qrels_filename, sep='\t', header=None, names=['q_id', 'unused', 'doc_id', 'relevance'])
    queries_df = pd.read_csv(queries_filename)

    topic_docs_df = documents_df[documents_df['text_right'].str.contains(topic, case=False)]
    
    # get all q_ids where any document has the word topic in it
    q_ids_needed = []
    for _, row in topic_docs_df.iterrows():
        id_right = row['id_right']
        matching_q_ids = qrels_df[qrels_df['doc_id'] == id_right]['q_id'].tolist()
        q_ids_needed.extend(matching_q_ids) # so no list of lists is created, multiple maches are just appended as elements
    q_ids_needed = list(set(q_ids_needed))

    # shorten the data by shortening the amount of queries
    q_ids_needed = q_ids_needed[:lines_per_file]

    # get all doc_ids out of q_rels that correspond to a q_id with topic in it
    # this should be larger than q_ids_needed because one query has multiple docs
    selected_doc_ids = []
    for q_id in q_ids_needed:
        if isinstance(q_id, list):
            selected_doc_ids.extend(qrels_df[qrels_df['q_id'].isin(q_id)]['doc_id'].tolist())
        else:
            selected_doc_ids.extend(qrels_df[qrels_df['q_id'].isin([q_id])]['doc_id'].tolist())

    subset_queries_df = queries_df[queries_df['id_left'].isin(q_ids_needed)]
    subset_queries_df.to_csv(subset_queries_filename, index=False)

    subset_docs_df = documents_df[documents_df['id_right'].isin(selected_doc_ids)]
    subset_docs_df.to_csv(subset_docs_filename, index=False)


In [5]:
create_subset_validation_with_topic('./validationDataset/wikIR1k/documents.csv', './validationDataset/wikIR1k/training/qrels', './validationDataset/wikIR1k/training/queries.csv', './validationDataset/validationDatset.csv', './validationDataset/validationQueries.csv', 50, ' sport ')

# BM25
Same as in main project but with some changes to data input because of a different format

In [6]:
def file_iterator(filename):
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            split = line.split(",")
            id = split[0]
            #join in the rare case this sequenze occurs more than once
            text = ''.join(map(str, split[1:]))
            yield (id, text)

def preprocess_Data(filename):
    stemmer = PorterStemmer()
    stopwords_english = stopwords.words('english')
    
    df = {}

    counter = 1
    for (id, text) in file_iterator(filename):
        
        #Tokenize
        text = word_tokenize(text)

        #convert words to lowercase
        text = [t.lower() for t in text]

        #remove punctuation
        text = [t for t in text if t.isalnum() or t.isspace()]

        #remove stopwords
        text = [t for t in text if t not in stopwords_english]

        #stemm
        text = [stemmer.stem(t) for t in text]
        
        df[id] = text

        counter += 1

    return df


In [7]:
class BM25:
    def __init__(self, data):
        self.data = data
        self.k1 = 1.5
        self.b = 0.75
        self.inverted_index = self.calc_inverted_index()
        self.bm25 = self.calc_bm25_matrix()


    def calc_inverted_index(self) -> dict[str, list[str]]:

        inverted_index = {}
        
        for doc_id, words in self.data.items():
            for word in words:
                if word not in inverted_index.keys():
                    inverted_index[word] = []
                    inverted_index[word].append(doc_id)
                    
                else:
                    if doc_id not in inverted_index[word]:
                        inverted_index[word].append(doc_id)

        return inverted_index


    def calc_bm25_matrix(self):
            
        bm25 = {}     

        #construct vector for every word
        average_document_length = self.average_document_length()
        for doc_id, words in self.data.items():
            bm25[doc_id] = {}
            document_unique = np.unique(words)
            document = words
            documnet_id = doc_id
            for word in document_unique:
                idf = self.calculate_idf(len(self.inverted_index[word]))
                self.calc_bm25(word, bm25, idf, average_document_length, document, documnet_id)

        return bm25

    def calc_bm25(self, word, bm25, idf, average_document_length, document, documnet_id):
        
        term_frequency = document.count(word)
        
        numerator = idf * term_frequency * (self.k1 + 1)
        denominator = term_frequency + self.k1 * (1 - self.b + ((self.b * len(document)) / average_document_length))
        bm25[documnet_id][word] = numerator / denominator

    def average_document_length(self):
        element_counter = 0
        word_counter = 0
        for _, words in self.data.items():
            element_counter += 1
            word_counter += len(words)

        return word_counter / element_counter
    
    def calculate_idf(self, amount_documents_including_word):
        return np.log(len(self.data) / amount_documents_including_word)
    
    def preprocess_query(self, query: str):
        stemmer = PorterStemmer()
        stopwords_english = stopwords.words('english')

        #Tokenize
        query = word_tokenize(query)

        #convert words to lowercase
        query = [t.lower() for t in query]

        #remove punctuation
        query = [t for t in query if t.isalnum or t.isspace()]

        #remove stopwords
        query = [t for t in query if t not in stopwords_english]

        #stemm
        query = [stemmer.stem(t) for t in query]

        return query


    def retrieve_relevance(self, query: str, k: int) -> dict[str, float]:
        ### ADD YOUR CODE (BEGIN) ###

        query = self.preprocess_query(query)

        #filter out word not contained in any document for efficiency
        query = [word for word in query if word in self.inverted_index.keys()]

        bm25_current_query = {}

        for doc_id, words in self.data.items():
            bm25_current_query[doc_id] = []
            for word in query:
                if word in self.bm25[doc_id]:
                    bm25_current_query[doc_id].append(self.bm25[doc_id][word])
                else: 
                    bm25_current_query[doc_id].append(0)

        for key, value in bm25_current_query.items():
            bm25_current_query[key] = sum(value)

        sorted_bm25 = sorted(bm25_current_query.items(), key=lambda x:x[1], reverse=True)

        sorted_bm25 = sorted_bm25[:k]
        bm25_current_query = dict(sorted_bm25)

        return bm25_current_query


# MonoBert

In [8]:
# download model

model_name = "castorini/monobert-large-msmarco"
model_path = "./model"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.save_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(model_path)

('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\vocab.txt',
 './model\\added_tokens.json',
 './model\\tokenizer.json')

In [9]:
model_path = "./model"

local_model = AutoModelForSequenceClassification.from_pretrained(model_path)
local_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [10]:
def get_relevance_bert(query, documents):
    relevances = []
    for document in documents:
        
        inputs = tokenizer(query, document[1], return_tensors="pt", padding=True, truncation=True)

        # predictions    
        with torch.no_grad():
            outputs = model(**inputs)

        logits = outputs.logits

        # e.g.:
        # outputs: SequenceClassifierOutput(loss=None, logits=tensor([[-1.8822,  2.8848]]), hidden_states=None, attentions=None)
        # logits: tensor([[-1.8822,  2.8848]])

        # re-scale to [0,1] and so that sum == 1
        relevance = torch.nn.functional.softmax(logits, dim=1)
        relevances.append([document[0], relevance[0].tolist()[1], document[1]])

    return relevances

In [11]:
def order_relevance(relevances):
    relevances = sorted(relevances, reverse=True, key=lambda x: x[1])
    return relevances

# Pipeline

In [12]:
class Pipeline:
    def __init__(self, amount_bm25_fetched_documents, amount_of_queries):
        data=self.getData()
        self.bm25 = BM25(data)
        self.documents = self.getCompleteFiles()
        self.amount_bm25_fetched_documents = amount_bm25_fetched_documents
        self.queries = self.get_queries_from_doc()
        self.amount_of_queries = amount_of_queries

    def getData(self):
        data = preprocess_Data('./validationDataset/validationDatset.csv')
        return data
    
    def getCompleteFiles(self):
        documents = {}

        #read LLM generated Documents
        with open('./validationDataset/validationDatset.csv', 'r', encoding='utf-8') as file:
            for line in file:
                split = line.split(",")
                id = split[0]
                #join in the rare case this sequenze occurs more than once
                text = ''.join(map(str, split[1:]))
                documents[id] = text
        
        return documents
    
    def retrieve_relevant_documents_with_BERT(self, query, documents):
        relevances = get_relevance_bert(query, documents)
        relevances = order_relevance(relevances)   
        return relevances 
    
    def retrieve_relevant_documents(self, query):

        #retrieve amount_bm25_fetched_documents documents using fast bm25
        bm25_docs = self.bm25.retrieve_relevance(query, self.amount_bm25_fetched_documents)

        #fetch the non preprocessed documents retrieved by bm25
        retrieved_documents = []
        for doc_id in bm25_docs:
            retrieved_documents.append([doc_id, self.documents[doc_id]])

        #rerank these documents using BERT
        ranked_document_list = self.retrieve_relevant_documents_with_BERT(query, retrieved_documents)

        return ranked_document_list

    def get_queries_from_doc(self):
        queries_list = []
        with open('./validationDataset/validationQueries.csv', 'r', encoding='utf-8') as file:
            for line in file:
                split = line.split(",")
                text = split[1]
                text = text.strip()
                queries_list.append(text)
            queries_list = queries_list[1:]    
        return queries_list
    
    def run_queries(self):
        ret = []
        for query in self.queries[:self.amount_of_queries]:
            ret.append((query, self.retrieve_relevant_documents(query)))

        return ret


# Validation
Validate BM25, Bert, and the pipeline

In [21]:
class Validation:
    def __init__(self, valDocs,valQueries,qrels):
        self.queries = self.createQueriesDF(valQueries)
        self.topicQrels = self.createQrels(valQueries,qrels)
        self.bm = BM25(preprocess_Data('./validationDataset/validationDatset.csv'))
        self.pip = Pipeline(5,len(self.queries))

    def createQueriesDF(self,valQueries):
        queries = {}
        fp = open(valQueries, 'r')
        for line in fp:
            parts = line.strip().split(',')
            query_id, title = parts
            queries[query_id] = title
        fp.close()     
        del queries[list(queries)[0]]           
        return queries

    def createQrels(self,valQueries,qrels):
        qrelsDict = self.queries.copy()
        for key in qrelsDict:
            qrelsDict[key] = []
        fp = open(qrels, 'r')
        for line in fp:
            parts = line.strip().split()
            if len(parts) == 4:
                query_id, _, doc_id, _ = parts
                if query_id in qrelsDict:
                    qrelsDict[query_id].append(doc_id)
        return qrelsDict
    
    def generatePredsBm25(self,k: int = 5):
        predictions = {}
        for id, query in self.queries.items():
            preds = self.bm.retrieve_relevance(query,k)
            predictions[id] = preds
        return predictions
    
    def generatePredsPipe(self,k: int = 5):
        preds = {}
        predictions = self.pip.run_queries()
        for i in predictions:
            id = 0
            # print(i[1])
            for key,val in self.queries.items():
                if val == i[0]:
                    id = key
                    preds[id] = {}
                    break
            for j in i[1]:
                preds[id][j[0]] = j[1]      

        return preds

    def f1atK(self, predictions: dict[str, list[str]], k: int = 5):
        counter = 0
        recall = 0
        precision = 0
        
        for key, value in self.topicQrels.items():
            if key not in predictions:
                continue
            counter += 1
            true_positives = 0
            false_negatives = 0
            false_positives = 0
            
            for prediction in predictions[key]:
                if prediction in value:
                    true_positives += 1
                else:
                    false_positives += 1

            false_negatives = (k - true_positives) if len(value) > k else (len(value) - true_positives)

            precision += true_positives / (true_positives + false_positives)
            recall += true_positives / (true_positives + false_negatives)

        recall /= counter
        precision /= counter

        #F1@k value
        return (2 * precision * recall) / (precision + recall)
    
        
    
    def ncdgk(self, predictions: dict[str, list[str]], k: int = 5):
        counter = 0
        ndcg = 0
        
        for key, value in self.topicQrels.items():
            if key not in predictions:
                continue

            counter += 1
            
            rel_i = []
            for prediction in predictions[key]:
                if prediction in value:
                    rel_i.append(1)
                else:
                    rel_i.append(0)

            dcg = 0
            for index, rel in enumerate(rel_i):
                dcg += rel / np.log(index+2)

            rel_i.sort(reverse=True)

            idcg = 0
            for index, rel in enumerate(rel_i):
                idcg += rel / np.log(index+2)

            if idcg != 0:
                ndcg += dcg / idcg

        return ndcg / counter


In [22]:
val = Validation('./validationDataset/validationDatset.csv','./validationDataset/validationQueries.csv','./validationDataset/wikIR1k/training/qrels')
#print(val.queries.iloc[0])
#print(val.queries['qID'][0])
#print(val.queries)
#print(val.topicQrels)
preds = val.generatePredsBm25()
f1atk = val.f1atK(preds)
ncdgk = val.ncdgk(preds)
preds2 = val.generatePredsPipe()
f1atk2 = val.f1atK(preds2)
ncdgk2 = val.ncdgk(preds2)
#print(preds)
print("BM25")
print("F1@k score: " + str(f1atk))
print("nDCG@k score: "+ str(ncdgk))
print("------------------------")
#print(preds2)
print("BM25 + MonoBert")
print("F1@k score: " + str(f1atk2))
print("nDCG@k score: "+ str(ncdgk2))

BM25
F1@k score: 0.8240000000000001
nDCG@k score: 0.9815848099359128
------------------------
BM25 + MonoBert
F1@k score: 0.8240000000000001
nDCG@k score: 0.988581683665839
