In [None]:
#imports
import pandas as pd
from gpt4all import GPT4All
import gpt4all
#path = "C:\Users\Jakob\Downloads\gpt4all-falcon-q4_0.gguf"

from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *

import numpy as np

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

Preprocessing of our data: create a smaller subset of the full dataset.

In [None]:
def split_file_by_lines(filename, lines_per_file, create_file, step=1000):
    """Write an evenly spaced subsample of ``filename`` to ``create_file``.

    Every ``step``-th line of the input (starting with the first line) is
    copied until ``lines_per_file`` lines have been written or the input ends.

    Args:
        filename: Path of the UTF-8 text file to sample from.
        lines_per_file: Maximum number of lines written to the output.
        create_file: Path of the output file; an existing file is overwritten.
        step: Sampling stride. Defaults to 1000, the stride that used to be
            hard-coded.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be >= 1")

    # Both handles live in one `with`, so the output file is closed (and
    # flushed) even if reading or writing fails half-way through.
    with open(filename, 'r', encoding='utf-8') as source, \
            open(create_file, 'w', encoding='utf-8') as target:
        written = 0
        for position, line in enumerate(source):
            # `>=` (not `>`): stop after exactly `lines_per_file` lines; the
            # previous comparison let one extra line through.
            if written >= lines_per_file:
                break
            if position % step == 0:
                target.write(line)
                written += 1

In [None]:
def split_file_by_lines_topic(filename, lines_per_file, create_file, topic):
    """Copy the first lines of ``filename`` whose text mentions ``topic``.

    Each input line is expected to look like ``<title> ||| <text>``. A line is
    copied when ``topic`` occurs as a substring of the field that follows the
    first ``' ||| '`` separator.

    Args:
        filename: Path of the UTF-8 input file.
        lines_per_file: Maximum number of lines written to the output.
        create_file: Path of the output file; an existing file is overwritten.
        topic: Substring to look for (pass e.g. ``' sport '`` with surrounding
            spaces to match whole words only).
    """
    # A single `with` guarantees the output handle is closed on errors too.
    with open(filename, 'r', encoding='utf-8') as source, \
            open(create_file, 'w', encoding='utf-8') as target:
        written = 0
        for line in source:
            # `>=` (not `>`): write at most `lines_per_file` lines.
            if written >= lines_per_file:
                break
            fields = line.split(' ||| ')
            # Lines without a separator (blank or malformed) have no text
            # field; skip them instead of failing with an IndexError.
            if len(fields) > 1 and topic in fields[1]:
                target.write(line)
                written += 1

Get the complete (unprocessed) titles, which are used as queries, as a DataFrame.

In [None]:
def txt_to_df_title(filename):
    """Read the titles of a ``<title> ||| <text>`` file into a DataFrame.

    Args:
        filename: Path of the UTF-8 input file, one record per line.

    Returns:
        A DataFrame with a single column ``'Query'`` holding, per non-blank
        line, everything before the first ``' ||| '`` separator.
    """
    queries = []
    # `with` closes the file even if a line cannot be decoded.
    with open(filename, 'r', encoding='utf-8') as fp:
        for entry in fp:
            if not entry.strip():
                # Blank lines carry no title; the tuple unpack used previously
                # raised a ValueError on them.
                continue
            query, separator, _ = entry.partition(' ||| ')
            if not separator:
                # Title-only line: keep it, but drop the line terminator that
                # would otherwise end up inside the query.
                query = query.rstrip('\r\n')
            queries.append(query)
    return pd.DataFrame(queries, columns=['Query'])

In [None]:
def txt_to_df(filename):
    """Load a ``<title> ||| <text>`` file into a DataFrame.

    Args:
        filename: Path of the input file, one record per line, no header row.

    Returns:
        A DataFrame with the columns ``'Title'`` and ``'Text'``.
    """
    # Raw string: the separator is a regular expression in which every pipe
    # must be escaped. In a normal string literal `\|` is an invalid escape
    # sequence, which Python reports as a warning (and plans to reject).
    # A regex separator requires the slower 'python' parser engine.
    return pd.read_csv(
        filename,
        delimiter=r' \|\|\| ',
        engine='python',
        header=None,
        names=['Title', 'Text'],
    )

Init the LLM

In [None]:
# Notebook-wide LLM handle used by generateDocWithLLM below.
# NOTE(review): "./model" is also the directory the monoBERT cells save to —
# confirm that sharing one folder between both models is intended.
llm = GPT4All(model_name="gpt4all-falcon-q4_0.gguf",model_path="./model")

Generate a .txt file with the response of the llm as a document

In [None]:
def generateDocWithLLM(queries, output_path="llmDocs.txt", model=None):
    """Ask the LLM for a short summary of every title and append it to a file.

    One line ``<title> ||| <response>`` is appended to ``output_path`` per
    title, the same record layout the Wikipedia subset files use.

    Args:
        queries: DataFrame with a ``'Title'`` column.
        output_path: File the records are appended to. Defaults to
            ``"llmDocs.txt"``, the name that used to be hard-coded.
        model: Object offering ``chat_session()`` and ``generate(prompt,
            temp=...)``. Defaults to the notebook-level ``llm`` instance.
    """
    if model is None:
        model = llm

    # UTF-8 is set explicitly because every other file in this notebook is
    # read as UTF-8; the platform default may be a different codec. The `with`
    # block makes sure already generated records are flushed if generation
    # fails half-way through.
    with open(output_path, 'a', encoding='utf-8') as fp:
        for index, title in enumerate(queries['Title'], start=1):
            prompt = f'give me a short summary on {title}'
            # A new chat session is opened for every title (presumably so the
            # answers do not share conversation context — confirm against the
            # gpt4all documentation).
            with model.chat_session():
                response = model.generate(prompt, temp=0)
            # The file format is one record per line, so line breaks (and
            # other whitespace runs) inside the answer are collapsed to
            # single spaces.
            document = ' '.join(str(response).split())
            fp.write(f"{title} ||| {document}\n")
            # Lightweight progress report every 50 generated documents.
            if index % 50 == 0:
                print(index)
    print("finished")

In [None]:
# One-off data preparation steps. They are kept commented out, presumably
# because the subset files already exist on disk — confirm before re-enabling.
#split_file_by_lines('data/stemmed.txt', 1000,'data/wikipediaSubset.txt')
#split_file_by_lines('data/raw.txt', 1000,'data/textForQueries.txt')
#df = txt_to_df('data/wikipediaSubset.txt')
#dfQ = txt_to_df_title('data/textForQueries.txt')


# Topic-specific subset (' sport ' with surrounding spaces matches the whole word).
#split_file_by_lines_topic('data/raw.txt', 1000,'data/topic.txt',' sport ')
# Load the topic subset; its 'Title' column provides the prompts for the LLM.
topic_dataframe = txt_to_df('data/topic.txt')

#display(df)
#display(dfQ)
# Appends one generated document per title to llmDocs.txt (one LLM call per title).
generateDocWithLLM(topic_dataframe)

Preprocesses the data generated by the LLM in the same way as the original data (https://github.com/tscheepers/Wikipedia-Summary-Dataset/blob/master/src/process.py)

In [None]:
def file_iterator(filename='LLM-Output.txt'):
    """Yield ``(title, text)`` pairs from a ``<title>|||<text>`` file.

    Args:
        filename: Path of the UTF-8 file to read. Defaults to
            ``'LLM-Output.txt'``, the name that used to be hard-coded.
            NOTE(review): generateDocWithLLM writes to ``llmDocs.txt`` —
            confirm which file is meant to be read here.

    Yields:
        Tuples ``(title, text)``. Whitespace around the separator and the
        trailing newline are left untouched; the tokenizer applied afterwards
        ignores them.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Blank lines hold no record.
                continue
            title, *remainder = line.split("|||")
            # Join in the rare case the separator occurs more than once.
            yield (title, ''.join(remainder))

def _normalize_tokens(raw_text, stemmer, stopwords_english):
    """Tokenize ``raw_text`` and return its lower-cased, filtered, stemmed tokens."""
    punctiation_words = {'.', ',', ';', ':', '(', ')', '`', '\'', '\'\'', '-', '–', '—', '…', '[', ']', '...', '{', '}'}
    forbidden_words = {''}
    punctiation_chars = {'`'}

    tokens = []
    for token in word_tokenize(raw_text):
        token = token.lower()
        # Drop tokens that consist of punctuation only.
        if token in punctiation_words:
            continue
        # Strip stray punctuation characters from inside the token.
        token = ''.join(c for c in token if c not in punctiation_chars)
        # Stripping may leave an empty token behind; drop it and stopwords.
        if token in forbidden_words or token in stopwords_english:
            continue
        tokens.append(stemmer.stem(token))
    return tokens


def preprocess_LLM_output():
    """Preprocess the LLM-generated documents like the original dataset.

    Every record yielded by ``file_iterator()`` is tokenized, lower-cased,
    stripped of punctuation and English stopwords, and stemmed with the
    Porter stemmer.

    Returns:
        A DataFrame with the columns ``'id'`` (running number starting at 1),
        ``'title'`` and ``'text'`` (both lists of stemmed tokens). Previously
        the collected records were built but never returned.
    """
    stemmer = PorterStemmer()
    # A set makes the per-token stopword check O(1).
    stopwords_english = set(stopwords.words('english'))

    records = {'id': [], 'title': [], 'text': []}
    for counter, (title, text) in enumerate(file_iterator(), start=1):
        records['id'].append(counter)
        records['title'].append(_normalize_tokens(title, stemmer, stopwords_english))
        records['text'].append(_normalize_tokens(text, stemmer, stopwords_english))

    return pd.DataFrame(records)



Retrieving the first X documents using BM25. This is used to efficiently eliminate irrelevant documents. The resulting documents are later reranked using more computationally intensive but also more accurate methods.

In [None]:
class BM25:
    """BM25 first-stage retriever over a preprocessed document collection.

    The constructor precomputes an inverted index and the BM25 weight of
    every (document, word) pair, so answering a query only needs look-ups.

    Args:
        dataframe: DataFrame with the columns ``'id'``, ``'title'`` and
            ``'text'``; ``'title'`` and ``'text'`` hold lists of already
            preprocessed tokens. Document ids are assumed to be unique.
    """

    def __init__(self, dataframe):
        self.dataframe = dataframe
        # Term-frequency saturation (k1) and length normalisation (b).
        self.k1 = 1.5
        self.b = 0.75
        self.inverted_index = self.calc_inverted_index()
        self.bm25 = self.calc_bm25_matrix()

    def _documents(self):
        """Yield ``(doc_id, tokens)`` per row; tokens are text plus title tokens."""
        for _, row in self.dataframe.iterrows():
            yield row['id'], list(row['text']) + list(row['title'])

    def calc_inverted_index(self) -> dict[str, list]:
        """Map every word to the ids of the documents that contain it."""
        inverted_index = {}

        for doc_id, document in self._documents():
            # dict.fromkeys keeps each word once per document, so an id is
            # appended at most once per word without scanning the posting
            # list (the old membership test also read the non-existent
            # column 'doc_id').
            for word in dict.fromkeys(document):
                inverted_index.setdefault(word, []).append(doc_id)

        return inverted_index

    def calc_bm25_matrix(self):
        """Return ``{doc_id: {word: bm25_weight}}`` for all documents."""
        bm25 = {}

        average_document_length = self.average_document_length()
        for doc_id, document in self._documents():
            bm25[doc_id] = {}
            for word in dict.fromkeys(document):
                idf = self.calculate_idf(len(self.inverted_index[word]))
                self.calc_bm25(word, bm25, idf, average_document_length, document, doc_id)

        return bm25

    def calc_bm25(self, word, bm25, idf, average_document_length, document, documnet_id):
        """Store the BM25 weight of ``word`` in ``bm25[documnet_id][word]``."""
        term_frequency = document.count(word)

        numerator = idf * term_frequency * (self.k1 + 1)
        denominator = term_frequency + self.k1 * (1 - self.b + ((self.b * len(document)) / average_document_length))
        bm25[documnet_id][word] = numerator / denominator

    def average_document_length(self):
        """Return the mean number of tokens (text + title) per document."""
        lengths = [len(document) for _, document in self._documents()]
        if not lengths:
            # Empty collection: avoid dividing by zero.
            return 0.0
        return sum(lengths) / len(lengths)

    def calculate_idf(self, amount_documents_including_word):
        """Return ``log(N / n)`` for a word that occurs in ``n`` of ``N`` documents."""
        return np.log(len(self.dataframe) / amount_documents_including_word)

    def preprocess_query(self, query: str):
        """Apply the document preprocessing pipeline to a raw query string.

        Returns:
            The list of lower-cased, filtered and stemmed query tokens.
        """
        stemmer = PorterStemmer()
        stopwords_english = set(stopwords.words('english'))
        punctiation_words = {'.', ',', ';', ':', '(', ')', '`', '\'', '\'\'', '-', '–', '—', '…', '[', ']', '...', '{', '}'}
        forbidden_words = {''}
        punctiation_chars = {'`'}

        tokens = []
        for token in word_tokenize(query):
            token = token.lower()
            # Drop tokens that consist of punctuation only.
            if token in punctiation_words:
                continue
            # Strip stray punctuation characters from inside the token.
            token = ''.join(c for c in token if c not in punctiation_chars)
            # Stripping may leave an empty token behind; drop it and stopwords.
            if token in forbidden_words or token in stopwords_english:
                continue
            tokens.append(stemmer.stem(token))

        return tokens

    def retrieve_relevance(self, query: str, k: int) -> dict:
        """Return the ``k`` best-scoring documents for ``query``.

        Args:
            query: Raw (unprocessed) query string.
            k: Number of documents to return.

        Returns:
            Dict mapping document id to the summed BM25 weight of the query
            words, ordered from most to least relevant. Ties keep the order
            of the documents in the dataframe.
        """
        query = self.preprocess_query(query)

        # Words that occur in no document cannot contribute to any score.
        query = [word for word in query if word in self.inverted_index]

        scores = {
            doc_id: sum(weights.get(word, 0) for word in query)
            for doc_id, weights in self.bm25.items()
        }

        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked[:k])


---
# Merge the LLM output and the original documents

In [None]:
def merge_files(file1_path, file2_path, merged_file_path):
    """Concatenate two line-oriented text files into a third one.

    I/O and decoding problems are reported on stdout instead of being raised,
    so a failed merge does not abort the notebook run.

    Args:
        file1_path: Path of the file whose content comes first.
        file2_path: Path of the file whose content is appended.
        merged_file_path: Path of the output file; it is overwritten.
    """
    try:
        # UTF-8 is set explicitly: the other readers in this notebook use
        # UTF-8, while the platform default may be a different codec.
        with open(file1_path, 'r', encoding='utf-8') as file1, \
                open(file2_path, 'r', encoding='utf-8') as file2:
            content1 = file1.read()
            content2 = file2.read()

        # Both files hold one record per line. Without this guard the last
        # record of file1 and the first record of file2 would be fused into
        # a single line whenever file1 lacks a trailing newline.
        if content1 and not content1.endswith('\n'):
            content1 += '\n'

        with open(merged_file_path, 'w', encoding='utf-8') as merged_file:
            merged_file.write(content1 + content2)

    except (OSError, UnicodeError) as e:
        print(f'Error: {e}')

In [None]:
# Build the mixed collection: the LLM-generated documents followed by the
# original topic documents, written to mixedDocs.txt.
merge_files('llmDocs.txt', './data/topic.txt', 'mixedDocs.txt')

---
# Rerank using monoBERT

In [None]:
# Download the monoBERT model and tokenizer and store a copy under ./model,
# so that later runs can load them from disk (see the next cell).
# NOTE(review): ./model is also the model_path handed to GPT4All above —
# confirm that sharing the folder is intended.

model_name = "castorini/monobert-large-msmarco"
model_path = "./model"

model = AutoModelForSequenceClassification.from_pretrained(model_name)
model.save_pretrained(model_path)

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(model_path)

In [None]:
# Load the model and tokenizer from the local copy saved by the download cell.
# NOTE(review): get_relevance_bert below reads the globals `model` and
# `tokenizer`, not `local_model` / `local_tokenizer` — confirm which pair is
# meant to be used.
model_path = "./model"

local_model = AutoModelForSequenceClassification.from_pretrained(model_path)
local_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
def get_relevance_bert(query, documents, model=None, tokenizer=None):
    """Score every document against the query with a monoBERT cross-encoder.

    Args:
        query: Query string.
        documents: Iterable of document strings.
        model: Sequence-classification model that returns two logits per
            (query, document) pair. Defaults to the notebook-level
            ``local_model`` loaded from disk, so the function also works in
            sessions where the download cell was not executed.
        tokenizer: Tokenizer matching ``model``. Defaults to the
            notebook-level ``local_tokenizer``.

    Returns:
        A list of ``[document, relevance]`` pairs in input order, where
        ``relevance`` is the softmax probability of the logit at index 1.
    """
    if model is None:
        model = local_model
    if tokenizer is None:
        tokenizer = local_tokenizer

    relevances = []
    for document in documents:
        # Query and document are encoded together as one sentence pair;
        # over-long pairs are truncated by the tokenizer.
        inputs = tokenizer(query, document, return_tensors="pt", padding=True, truncation=True)

        # Inference only: no gradients needed.
        with torch.no_grad():
            logits = model(**inputs).logits

        # e.g. logits: tensor([[-1.8822,  2.8848]])
        # Softmax rescales both logits to [0, 1] so that they sum to 1; the
        # value at index 1 is used as the relevance score.
        probabilities = torch.nn.functional.softmax(logits, dim=1)
        relevances.append([document, probabilities[0, 1].item()])

    return relevances

In [None]:
def order_relevance(relevances):
    """Return the ``[document, score]`` pairs ordered from highest to lowest score.

    The input is left unmodified; pairs with equal scores keep their
    original relative order.
    """
    def _score(pair):
        return pair[1]

    ranked = list(relevances)
    ranked.sort(key=_score, reverse=True)
    return ranked

In [None]:
# Small smoke test for the reranker: one query and a handful of hand-written
# documents ranging from correct over factually wrong to off-topic.
query = "What is the capital of France?"
documents = ["Paris is the capital of France.",
             "birds like France.",
             "snakes eat gras",
             "Graz is the capital of France.",
             "France is capital paris",
             "paris is capital france",
             "France capital paris egg"]

# Score every document against the query, then sort by descending score.
relevances = get_relevance_bert(query, documents)
relevances = order_relevance(relevances)

# Each entry is a [document, score] pair.
for relevance in relevances:
    print(relevance)