In [1]:
import langchain

In [2]:
import os
os.environ['HF_HOME'] = '/mnt/scratch/tanishq'

In [3]:
def word_wrap(string, n_chars=72):
    # Wrap a string at the next space after n_chars
    if len(string) < n_chars:
        return string
    else:
        return string[:n_chars].rsplit(' ', 1)[0] + '\n' + word_wrap(string[len(string[:n_chars].rsplit(' ', 1)[0])+1:], n_chars)

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter

In [5]:
def prepare_data(data):
    """
    Extracting context, question, and answers from the dataset
    """
    articles = []
    
    for article in data["data"]:
        for paragraph in article["paragraphs"]:
            for qa in paragraph["qas"]:
                question = qa["question"]

                if not qa["is_impossible"]:
                    answer = qa["answers"][0]["text"]
                else:
#                     print(qa)
                    if not qa["plausible_answers"]:
                        answer = "no answer"
                    else:
                        answer = qa["plausible_answers"][0]['text']
                
                inputs = {"context": paragraph["context"], "question": question, "answer": answer}

            
                articles.append(inputs)

    return articles

In [6]:
import json
import pandas as pd
# Loading the data

with open('train-v2.0.json') as f:
    data_train = json.load(f)
    
with open('dev-v2.0.json') as f:
    data = json.load(f)

In [7]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

In [8]:
data = prepare_data(data)

# Create a Dataframe
data = pd.DataFrame(data)

In [9]:
data_train = prepare_data(data_train)

data_train = pd.DataFrame(data_train)

In [23]:
temp = []

# Iterate over each row of the DataFrame
for index, row in data.iterrows():
    temp.append(row['context'])

# for index, row in data_train.iterrows():
#     temp.append(row['context'])
    
temp_unique = list(set(temp))
len(temp_unique)

1204

In [24]:
token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, 
                                                       model_name= 'sentence-transformers/multi-qa-mpnet-base-dot-v1',
                                                       tokens_per_chunk=512)

token_split_texts = []
for text in temp_unique:
    token_split_texts += token_splitter.split_text(text)

print(word_wrap(token_split_texts[10]))
print(f"\nTotal chunks: {len(token_split_texts)}")

in 2010 a salary survey revealed the differences in remuneration
between different roles, sectors and locations in the construction and
built environment industry. the results showed that areas of
particularly strong growth in the construction industry, such as the
middle east, yield higher average salaries than in the uk for example.
the average earning for a professional in the construction industry in
the middle east, across all sectors, job types and levels of
experience, is £42, 090, compared to £26, 719 in the uk. this trend is
not necessarily due to the fact that more affluent roles are available,
however, as architects with 14 or more years experience working in the
middle east earn on average £43, 389 per annum, compared to £40, 000 in
the uk. some construction workers in the us / canada have made more
than $ 100, 000 annually, depending on their trade.

Total chunks: 1212


In [25]:
import chromadb
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

embedding_function = SentenceTransformerEmbeddingFunction(model_name="multi-qa-mpnet-base-dot-v1") # Best is all-distilroberta-v1
# print(embedding_function([token_split_texts[10]]))

In [26]:
chroma_client = chromadb.Client()
chroma_collection = chroma_client.create_collection("contexts", embedding_function=embedding_function)

ids = [str(i) for i in range(len(token_split_texts))]

chroma_collection.add(ids=ids, documents=token_split_texts)
chroma_collection.count()

Using embedded DuckDB without persistence: data will be transient


1212

# Evaluation

## With Ranking

In [22]:
query = "What century did the Normans first gain their separate identity?"
prompt = f'''<s>[INST] You are a helpful question answering assistant. Provide an example answer to the given question, that can be found on Wikipedia. 
So for instance if the question is -
"In what country is Normandy located?"
The example answer should be
[/INST]
Normandy is a region in northwestern France. It is not a country, but rather a historical and cultural region that has been part of France since the 13th century. Normandy is known for its beautiful coastline, rolling hills, and dairy farming, as well as its rich history and cultural heritage. Some famous products associated with Normandy include apples, cheese (such as Camembert and Pont-l'Évêque), cider, and Calvados
</s>
[INST]{query}[/INST]'''

In [23]:
prompt

'<s>[INST] You are a helpful question answering assistant. Provide an example answer to the given question, that can be found on Wikipedia. \nSo for instance if the question is -\n"In what country is Normandy located?"\nThe example answer should be\n[/INST]\nNormandy is a region in northwestern France. It is not a country, but rather a historical and cultural region that has been part of France since the 13th century. Normandy is known for its beautiful coastline, rolling hills, and dairy farming, as well as its rich history and cultural heritage. Some famous products associated with Normandy include apples, cheese (such as Camembert and Pont-l\'Évêque), cider, and Calvados\n</s>\n[INST]What century did the Normans first gain their separate identity?[/INST]'

In [14]:
import os
from transformers import pipeline
import transformers
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, pipeline

query_list = list(data['question'])
os.environ["HF_HOME"] = "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"

def augment_query_generated(query, generator):

    prompt = f'''[INST] You are a helpful question answering assistant. Provide an example answer to the given question, related to one of the following topics - Normans, Computational_complexity_theory, Southern_California, Sky_(United_Kingdom), Victoria_(Australia), Huguenot, Steam_engine, Oxygen, 1973_oil_crisis, European_Union_law, Amazon_rainforest, Ctenophora, Fresno,_California, Packet_switching, Black_Death, Geology, Pharmacy, Civil_disobedience, Construction, Private_school, Harvard_University, Jacksonville,_Florida, Economic_inequality, University_of_Chicago, Yuan_dynasty, Immune_system, Intergovernmental_Panel_on_Climate_Change, Prime_number, Rhine, Scottish_Parliament, Islamism, Imperialism, Warsaw, French_and_Indian_War, Force. 
    Question : {query} [/INST]'''
    augmented_query = generator(prompt, max_length=400, num_return_sequences=1, truncation=True)[0]['generated_text']
    
    return augmented_query

In [15]:
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", device="cuda:0")
tokenizer.pad_token = tokenizer.eos_token

generator = pipeline("text-generation", model="mistralai/Mistral-7B-Instruct-v0.2", tokenizer=tokenizer, device="cuda:0")

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
#### data["Generated_Passage1"] = ' '

# data["Found"] = " "

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

found = 0
not_found = 0
for i in range(len(query_list[:1000])):
    # query = "In what country is Normandy located?"
    query = query_list[i]
#     print(query)
    
    augmented_query = augment_query_generated(query, generator)
    print(i)
#     print("QUERY -", augmented_query)
    
    index = augmented_query.find('[/INST]')
    # Get the substring after '[/INST]'
    answer = augmented_query[index + len('[/INST]'):].strip()
    
    answer_input = query + answer
#     print("Input", answer_input)

    num_passages = 10

    results = chroma_collection.query(query_texts=[answer_input], n_results=num_passages)
    retrieved_documents = results['documents'][0]
    
#     print("Retrieved Documents")
#     for l in retrieved_documents:
#         print(l, '\n')

    pairs = [[answer_input, doc] for doc in retrieved_documents]
    scores = cross_encoder.predict(pairs)
#     print("SCORES -", scores)
    
    ordered_passage = []
    for o in np.argsort(scores)[::-1]:
        ordered_passage.append(retrieved_documents[o])

    # Gold passage
    gold_passage = data['context'][i]
#     print("GOLD PASSAGE -",gold_passage)
    
    top = 1
    
    generated_passages = ordered_passage[:top]
#     print(generated_passages)
    data['Generated_Passage1'][i] = generated_passages[0]

    # Calculate cosine similarity
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([gold_passage] + generated_passages)
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

    for j, sim in enumerate(cosine_similarities[0]):
      # print(f"Generated Passage {i+1}: {sim}")
        if sim > 0.95:
#             print("Found")
            found += 1
            # data['Found'][i] = "yes"
            # data["Generated_Passage"][i] = generated_passages[j]
            break
        if j == top-1:
#             print("Not Found")
            not_found += 1
            # data['Found'][i] = "no"
            # data["Generated_Passage"][i] = generated_passages[0]


In [28]:
print(not_found, found, len(query_list))

352 648 11873


In [29]:
found/1000

0.648

In [30]:
data

Unnamed: 0,context,question,answer,Generated_Passage1
0,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France,the normans ( norman : nourmands ; french : no...
1,The Normans (Norman: Nourmands; French: Norman...,When were the Normans in Normandy?,10th and 11th centuries,the normans ( norman : nourmands ; french : no...
2,The Normans (Norman: Nourmands; French: Norman...,From which countries did the Norse originate?,"Denmark, Iceland and Norway",the descendants of rollo's vikings and their f...
3,The Normans (Norman: Nourmands; French: Norman...,Who was the Norse leader?,Rollo,the descendants of rollo's vikings and their f...
4,The Normans (Norman: Nourmands; French: Norman...,What century did the Normans first gain their ...,10th century,the normans ( norman : nourmands ; french : no...
...,...,...,...,...
11868,"The pound-force has a metric counterpart, less...",What is the seldom used force unit equal to on...,sthène,
11869,"The pound-force has a metric counterpart, less...",What does not have a metric counterpart?,pound-force,
11870,"The pound-force has a metric counterpart, less...",What is the force exerted by standard gravity ...,kilogram-force,
11871,"The pound-force has a metric counterpart, less...",What force leads to a commonly used unit of mass?,kilogram-force,


In [31]:
data.to_csv('/mnt/scratch/tanishq/df_top1.csv')