In [1]:
import gdown

output = "fandom_wiki_articles.zip"
file_id = '1DzYpGhXKLy9mq5gaupnYtIrnIC3uqMep'

gdown.download('https://drive.google.com/uc?id='+file_id, quiet=False)

Downloading...
From: https://drive.google.com/uc?id=1DzYpGhXKLy9mq5gaupnYtIrnIC3uqMep
To: /content/articles.zip
100%|██████████| 10.8M/10.8M [00:01<00:00, 8.09MB/s]


'articles.zip'

In [2]:
!jar xvf ./articles.zip

  created: articles/
 inflated: articles/Ivo_Kristof.txt
 inflated: articles/Shagga.txt
 inflated: articles/Greyguard.txt
 inflated: articles/Whore.txt
 inflated: articles/Helen_Sadler.txt
 inflated: articles/Stark_guard_1_(A_Man_Without_Honor).txt
 inflated: articles/Robert_Hladik.txt
 inflated: articles/Eddard_and_Catelyn_Stark.txt
 inflated: articles/Forrester-Whitehill_conflict.txt
 inflated: articles/House_Greyjoy_(Histories_%26_Lore).txt
 inflated: articles/Ruth_Jenkins.txt
 inflated: articles/Meraxes.txt
 inflated: articles/Symon_Hollard.txt
 inflated: articles/Carice_van_Houten.txt
 inflated: articles/House_Tarbeck.txt
 inflated: articles/Jack_Olohan.txt
 inflated: articles/Liang_Yang.txt
 inflated: articles/Knight_of_the_Gate.txt
 inflated: articles/The_Stormlands.txt
 inflated: articles/The_Kingsmoot.txt
 inflated: articles/Derwa_Frey.txt
 inflated: articles/Nick_Boulton.txt
 inflated: articles/Sheila_Atim.txt
 inflated: articles/Northern_Allegiances_to_House_Stark.txt
 infla

In [1]:
import os
import urllib

titles = []
articles = []

i = 0

for filename in os.listdir('./articles'):
    if not filename[-3:] == 'txt':
        continue
    with open('articles/' + filename, 'rb') as f:
        title = urllib.parse.unquote(filename[:-4])
        title = title.replace('_', ' ')

        if (len(title) == 0) or (len(title.strip()) == 0):
            print("Empty title for", filename)
            continue

        titles.append(title)
        articles.append(f.read().decode('utf-8'))

    i += 1
    if(i % 500) == 0:
        print(' Processed {:,}'.format(i))

print('   DONE. \n')
print('There are {:,} articles.'.format(len(articles)))


Empty title for .txt
 Processed 500
 Processed 1,000
 Processed 1,500
 Processed 2,000
 Processed 2,500
 Processed 3,000
 Processed 3,500
 Processed 4,000
 Processed 4,500
   DONE. 

There are 4,882 articles.


In [2]:
passage_titles = []
passages = []

print('Splitting...')

for i in range(len(titles)):
    title = titles[i]
    article = articles[i]

    if len(article) == 0:
        print('Skipping empty article:', title)
        continue
    words = article.split()

    for i in range(0, len(words), 100):
        chunk_words = words[i:i+100]
        chunk = ' '.join(chunk_words)
        chunk = chunk.strip()

        if len(chunk) == 0:
            continue
        passage_titles.append(title)
        passages.append(chunk)

chunked_corpus = {'title': passage_titles, 'text': passages}


Splitting...


In [3]:
from transformers import DPRContextEncoderTokenizerFast

ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokenizerFast'.


In [4]:
num_passags = len(chunked_corpus['title'])
print('Tokenizing {:,} passages for DPR...'.format(num_passags))

outputs = ctx_tokenizer(
    chunked_corpus['title'],
    chunked_corpus['text'],
    truncation=True,
    padding="longest",
    return_tensors='pt'
)

print('  DONE.')

input_ids = outputs['input_ids']

Tokenizing 43,151 passages for DPR...
  DONE.


In [5]:
import torch

if torch.cuda.is_available():
    print("GPU available")
    device = torch.device("cuda")
else:
    print("No GPU available")
    device = torch.device("cpu")

GPU available


In [6]:
from transformers import DPRContextEncoder

ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-multiset-base")

ctx_encoder = ctx_encoder.to(device=device)

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-multiset-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
import time
import datetime

def format_time(elapsed):
    elapsed_rounded = int(round(elapsed))
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [8]:
import time
import math

torch.set_grad_enabled(False)

t0 = time.time()
step = 0
batch_size = 16

num_passages = input_ids.size()[0]
num_batches = math.ceil(num_passags/batch_size)

embeds_batches = []

print("Generating embeddings for {:,} passages...".format(num_passages))

for i in range(0, num_passages, batch_size):
    if step % 100 == 0 and not step == 0:
        elapsed = format_time(time.time() - t0)
        print("  Batch {:>5,} of {:>5,}.  Elapsed: {:}.".format(step, num_batches, elapsed))

    batch_ids = input_ids[i:i+16,:]
    batch_ids = batch_ids.to(device)

    outputs = ctx_encoder(
        batch_ids,
        return_dict=True
    )
    embeddings = outputs["pooler_output"]
    embeddings = embeddings.detach().cpu().numpy()

    embeds_batches.append(embeddings)
    step += 1

print("  Done.")

Generating embeddings for 43,151 passages...
  Batch   100 of 2,697.  Elapsed: 0:00:31.
  Batch   200 of 2,697.  Elapsed: 0:01:00.
  Batch   300 of 2,697.  Elapsed: 0:01:31.
  Batch   400 of 2,697.  Elapsed: 0:02:01.
  Batch   500 of 2,697.  Elapsed: 0:02:33.
  Batch   600 of 2,697.  Elapsed: 0:03:05.
  Batch   700 of 2,697.  Elapsed: 0:03:37.
  Batch   800 of 2,697.  Elapsed: 0:04:09.
  Batch   900 of 2,697.  Elapsed: 0:04:42.
  Batch 1,000 of 2,697.  Elapsed: 0:05:14.
  Batch 1,100 of 2,697.  Elapsed: 0:05:47.
  Batch 1,200 of 2,697.  Elapsed: 0:06:20.
  Batch 1,300 of 2,697.  Elapsed: 0:06:53.
  Batch 1,400 of 2,697.  Elapsed: 0:07:27.
  Batch 1,500 of 2,697.  Elapsed: 0:08:00.
  Batch 1,600 of 2,697.  Elapsed: 0:08:33.
  Batch 1,700 of 2,697.  Elapsed: 0:09:07.
  Batch 1,800 of 2,697.  Elapsed: 0:09:40.
  Batch 1,900 of 2,697.  Elapsed: 0:10:13.
  Batch 2,000 of 2,697.  Elapsed: 0:10:47.
  Batch 2,100 of 2,697.  Elapsed: 0:11:20.
  Batch 2,200 of 2,697.  Elapsed: 0:11:54.
  Batch 2

In [9]:
import numpy as np

embeddings = np.concatenate(embeds_batches, axis=0)
print("Size of dataset embeddings:", embeddings.shape)

Size of dataset embeddings: (43151, 768)


In [10]:
import faiss

dim = 768

m = 128

index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)

In [11]:
print("Building the FAISS Index")

t0 = time.time()

index.train(embeddings)
index.add(embeddings)

print(" DONE.")

Building the FAISS Index
 DONE.


In [12]:
from transformers import DPRQuestionEncoder

q_encoder = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-multiset-base")

q_encoder = q_encoder.to(device=device)

Some weights of the model checkpoint at facebook/dpr-question_encoder-multiset-base were not used when initializing DPRQuestionEncoder: ['question_encoder.bert_model.pooler.dense.bias', 'question_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRQuestionEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRQuestionEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
from transformers import DPRQuestionEncoderTokenizerFast

q_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained("facebook/dpr-question_encoder-multiset-base")

In [14]:
input_ids = q_tokenizer.encode("Who is Arya's father?", return_tensors="pt")

input_ids = input_ids.to(device)

outputs = q_encoder(input_ids)

q_embed = outputs["pooler_output"]
q_embed = q_embed.cpu().numpy()

print("Query embedding:", q_embed.shape)

Query embedding: (1, 768)


In [15]:
D, I = index.search(q_embed, k=3)

print("Closest matching indeces:", I)
print("Inner Products:", D)

Closest matching indeces: [[14093 24289 24286]]
Inner Products: [[81.73514  81.42131  80.646416]]


In [16]:
import textwrap

wrapper = textwrap.TextWrapper(width=80)

for i in I[0]:
  title = chunked_corpus['title'][i]
  passage = chunked_corpus['text'][i]

  print('Article Title:   ', title, '\n')
  print('Passage:')
  print(wrapper.fill(passage))

  print('')

Article Title:    Arya Stark and Gendry Baratheon 

Passage:
later in life. However, Arya, given her tomboyish personality, disliked the
expectations and restrictions of her gender, which caused her to frequently
clash with her family, primarily with her mother and her sister Sansa. Gendry is
an unacknowledged bastard son of King Robert Baratheon and a tavern wench. He
lived in Flea Bottom. His mother died when he was young, and he grew up with no
knowledge of his lineage or noble relations. He was not raised under the
expectations of nobility, including arranged marriage. Gendry lived modestly,
focusing mainly on his apprenticeship as a blacksmith under the mentorship of

Article Title:    Arya Stark 

Passage:
world end. == Contents == == Biography == === Background === Arya Stark is the
youngest daughter and third child of Lady Catelyn and Lord Ned Stark. Arya was
born and raised at Winterfell. She has an older sister, Sansa, an older brother
Robb, two younger brothers Bran and Rick

In [17]:
 from datasets import Dataset
 import pandas as pd

 df = pd.DataFrame(chunked_corpus)
 dataset = Dataset.from_pandas(df)

 print(dataset)

Dataset({
    features: ['title', 'text'],
    num_rows: 43151
})


In [18]:
embs = []

for i in range(embeddings.shape[0]):
  embs.append(embeddings[i, :])

In [19]:
dataset = dataset.add_column("embeddings", embs)

dataset

Dataset({
    features: ['title', 'text', 'embeddings'],
    num_rows: 43151
})

In [20]:
index = faiss.IndexHNSWFlat(dim, m, faiss.METRIC_INNER_PRODUCT)
dataset.add_faiss_index(column="embeddings", index_name="embeddings", custom_index=index, faiss_verbose=True)

  0%|          | 0/44 [00:00<?, ?it/s]

Dataset({
    features: ['title', 'text', 'embeddings'],
    num_rows: 43151
})

In [21]:
from transformers import RagRetriever

In [24]:
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq",
    use_dummy_dataset=False,
    indexed_dataset=dataset,
    index_name="embeddings"
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [25]:
print("hello")

hello


In [26]:
from transformers import RagTokenizer

tokenizer = RagTokenizer.from_pretrained(
    "facebook/rag-sequence-nq"
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [29]:
from transformers import RagSequenceForGeneration

model = RagSequenceForGeneration.from_pretrained(
    "facebook/rag-sequence-nq",
    retriever=retriever
)

Downloading pytorch_model.bin:   0%|          | 0.00/2.06G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/rag-sequence-nq were not used when initializing RagSequenceForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.weight', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing RagSequenceForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagSequenceForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [31]:
import time

t0 = time.time()

question = "Who is Arya's Father?"

input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]

# Give the question to RAG and have it generate an answer
generated = model.generate(input_ids)

generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

print("Q: " + question)
print("A: " + generated_string)

print('\nResponse took %.2f seconds' % (time.time()-t0))



Q: Who is Arya's Father?
A:  lord eddard stark

Response took 233.50 seconds


In [32]:
def ask_question(question):
  t0 = time.time()

  input_ids = tokenizer.question_encoder(question, return_tensors="pt")["input_ids"]
  generated = model.generate(input_ids)
  generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]

  print("Q:" + question)
  print("A: '{:}".format(generated_string))

  print('\nResponse took %.2f seconds' % (time.time()-t0))
