In [2]:
import torch

if torch.cuda.is_available():
    print("CUDA is available! GPU is ready to use.")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available. Us")


CUDA is available! GPU is ready to use.
GPU Name: NVIDIA GeForce GTX 1070


In [3]:
import pandas as pd
import json
import os
import torch
import faiss
from transformers import DPRContextEncoderTokenizer, DPRQuestionEncoderTokenizer, DPRQuestionEncoder, DPRContextEncoder
import faiss
import numpy as np


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
with open(r"C:\Users\tom_r\Downloads\triviaqa-rc\qa\wikipedia-train.json", 'r') as f:
    data = json.load(f)

print(data.keys()) 
print(data['Data'][0]) # look at some examples
print(data['Data'][1]) 

dict_keys(['Data', 'Domain', 'Split', 'VerifiedEval', 'Version'])
{'Answer': {'Aliases': ['Park Grove (1895)', 'York UA', 'Yorkish', 'UN/LOCODE:GBYRK', 'York, UK', 'Eoforwic', 'Park Grove School', 'York Ham', 'The weather in York', 'City of York', 'York, England', 'York, Yorkshire', 'York ham', 'County Borough of York', 'YORK', 'Eoferwic', 'Park Grove Primary School', 'York, North Yorkshire', 'Yoisk', 'York', 'York (England)'], 'MatchedWikiEntityName': 'York', 'NormalizedAliases': ['york yorkshire', 'eoferwic', 'park grove primary school', 'park grove school', 'weather in york', 'park grove 1895', 'eoforwic', 'county borough of york', 'york uk', 'un locode gbyrk', 'city of york', 'york england', 'york ua', 'york ham', 'york', 'yorkish', 'yoisk', 'york north yorkshire'], 'NormalizedMatchedWikiEntityName': 'york', 'NormalizedValue': 'york', 'Type': 'WikipediaEntity', 'Value': 'York'}, 'EntityPages': [{'DocSource': 'TagMe', 'Filename': 'England.txt', 'Title': 'England'}, {'DocSource': 'Ta

In [5]:
print(len(data['Data']))

61888


In [6]:
# extracting relevant fields

def load_evidence(filename):
    with open(fr'C:\Users\tom_r\Downloads\triviaqa-rc\evidence\wikipedia\{filename}', 'r') as f:
        return f.read()

questions = []
answers = []
evidence = []

for item in data['Data']:

    try:
        
        evidence_docs = []
        for page in item['EntityPages']:
            evidence_docs.append(load_evidence(page['Filename']))
        evidence.append(evidence_docs)
        
        questions.append(item['Question'])
        answers.append(item['Answer'])

        
    except:
        pass

In [7]:
print(len(questions))
print(len(answers))
print(len(evidence))

print(questions[1])
print(answers[1])
print(evidence[1])

print(questions[5])
print(answers[5])
print(evidence[5])

print(questions[20000])
print(answers[20000])
print(evidence[20000])

61599
61599
61599
From which country did Angola achieve independence in 1975?
{'Aliases': ['Portoga≈Ço', 'Republic of Portugal', 'PORTUGAL', 'Portekiz', 'Portugallu', 'O Papagaio', 'ISO 3166-1:PT', 'Portunga', 'Phu-to-ga', 'Potigal', 'Port√ªnga', 'Portugul', 'An Phortaing√©il', 'PortugƒÅle', 'Portugale', 'Portingale', 'Potiti', 'Portugali', 'Portugall', 'Portek√Æz', 'Bo Dao Nha', 'Portuguese Republic', 'Portogallo', 'Portugaul', 'Portogalo', 'Portyngal', 'Yn Phortiugal', 'Portugalio', 'Portug√°l', 'Portugual', 'Portuga', 'Portgual', 'Portugalsko', 'Portugaleje', 'Ph√ª-t√¥-g√¢', 'Portugalujo', 'Portugalija', 'Pertual', 'P√≤tigal', 'Portugal', 'B·ªì ƒê√†o Nha', 'Portugalska', 'Rep√∫blica Portuguesa', 'Portiwgal', 'Portugalƒójƒó', 'Port√∫gal', 'Portegal', 'An Phortaingeil', 'Republica Portuguesa'], 'MatchedWikiEntityName': 'Portugal', 'NormalizedAliases': ['portugul', 'portugallu', 'portugalska', 'p√≤tigal', 'portugaul', 'portugalujo', 'portuguese republic', 'iso 3166 1 pt', 'republic of 

In [8]:
subset = {}
subset['Questions'] = questions[:2000]
subset['Answers'] = answers[:2000]
subset['Evidence'] = evidence[:2000]

Why split the evidence? 

- Efficient retrieval
- Improved relevance
- Better computation efficiency
- Better context for generator

In [9]:
# split evidence up into paragraphs to increase granulatty whilst trying to maintain some context. 
example_evidence = subset['Evidence'][10][0]
paras = example_evidence.split("\n\n")
paras

['Europe is a continent that comprises the westernmost part of Eurasia. Europe is bordered by the Arctic Ocean to the north, the Atlantic Ocean to the west, and the Mediterranean Sea to the south. To the east and southeast, Europe is generally considered as separated from Asia by the watershed divides of the Ural and Caucasus Mountains, the Ural River, the Caspian and Black Seas, and the waterways of the Turkish Straits.  Yet the non-oceanic borders of Europe‚Äîa concept dating back to classical antiquity‚Äîare arbitrary; the primarily physiographic term "continent" as applied to Europe also incorporates cultural and political elements whose discontinuities are not always reflected by the continent\'s current boundaries.',
 "Europe is the world's second-smallest continent by surface area, covering about  or 2% of the Earth's surface and about 6.8% of its land area. Of Europe's approximately 50 countries, Russia is the largest and most populous, spanning 39% of the continent and compris

In [10]:
# split the remaining dataset
subset['Questions'] = questions[:2000]
subset['Answers'] = answers[:2000]
subset['Evidence'] = [x.split("\n\n") for x in subset['Evidence'][0]]

# flatten the paragraphs
paragraphs = [para for doc in subset['Evidence'] for para in doc]

In [19]:
# Fix OpenMP conflict (temporary workaround)
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'


# flatten the paragraphs
paragraphs = [para for doc in subset['Evidence'] for para in doc]

# Load retriever models
context_tokenizer = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
question_tokenizer = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
context_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

def encode_paragraphs(paragraphs, batch_size=16):
    all_embeddings = []
    for i in range(0, len(paragraphs), batch_size):
        batch = paragraphs[i:i + batch_size]
        inputs = context_tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=512)
        with torch.no_grad():
            embeddings = context_encoder(**inputs).pooler_output
        all_embeddings.append(embeddings.numpy())
    return np.vstack(all_embeddings)

# encode the paragraphs and build FAISS
document_embeddings = encode_paragraphs(paragraphs)

# normalise embeddings before adding to FAISS
document_embeddings = document_embeddings / np.linalg.norm(document_embeddings, axis=1, keepdims=True)


index = faiss.IndexFlatIP(document_embeddings.shape[1])
index.add(document_embeddings)

print("Embeddings shape:", document_embeddings.shape) 
print("FAISS index size:", index.ntotal) 



The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizer'.
Some weights of the model checkpoint at facebook/dpr-question_encoder-single-nq-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequence

Embeddings shape: (228, 768)
FAISS index size: 228


In [22]:

# Encode the query
query = "From which country did Angola achieve independence in 1975?"
query_inputs = question_tokenizer(query, return_tensors="pt")

with torch.no_grad():
    query_embedding = question_encoder(**query_inputs).pooler_output.numpy()

# normalise the query
query_embedding = query_embedding / np.linalg.norm(query_embedding)

# Search the FAISS index
distances, indices = index.search(query_embedding, k=5)
retrieved_paragraphs = [paragraphs[i] for i in indices[0]]
print("Retrieved Paragraphs:", retrieved_paragraphs)


Retrieved Paragraphs: ["Competing with Spain, the first English colony in the Americas was founded in 1585 by explorer Walter Raleigh in Virginia and named Roanoke. The Roanoke colony failed and is known as the lost colony, after it was found abandoned on the return of the late-arriving supply ship.  With the East India Company, England also competed with the Dutch and French in the East. In 1588, during the Elizabethan period, an English fleet under Francis Drake defeated an invading Spanish Armada. The political structure of the island changed in 1603, when the King of Scots, James VI, a kingdom which was a longtime rival to English interests, inherited the throne of England as James I ‚Äî creating a personal union.   He styled himself King of Great Britain, although this had no basis in English law.  Under the auspices of King James VI and I the Authorised King James Version of the Holy Bible was published in 1611. It has not only been ranked with Shakespeare's works as the greatest

In [23]:
print("Top 5 Retrieved Paragraphs:")
for rank, para in enumerate(retrieved_paragraphs, 1):
    print(f"\nRank {rank}: {para[:300]}...")  # Print the first 300 characters 


Top 5 Retrieved Paragraphs:

Rank 1: Competing with Spain, the first English colony in the Americas was founded in 1585 by explorer Walter Raleigh in Virginia and named Roanoke. The Roanoke colony failed and is known as the lost colony, after it was found abandoned on the return of the late-arriving supply ship.  With the East India Co...

Rank 2: The Kingdom of England‚Äîwhich after 1535 included Wales‚Äîceased being a separate sovereign state on 1 May 1707, when the Acts of Union put into effect the terms agreed in the Treaty of Union the previous year, resulting in a political union with the Kingdom of Scotland to create the Kingdom of Great B...

Rank 3: Subsequently the House of Plantagenet from Anjou inherited the English throne under Henry II, adding England to the budding Angevin Empire of fiefs the family had inherited in France including Aquitaine.  They reigned for three centuries, some noted monarchs being Richard I, Edward I, Edward III and...

Rank 4: History...

Rank 5: 