In [1]:
import re
import json
import glob
import numpy as np
import pandas as pd
from tqdm import tqdm
from functools import partial
from typing import List, Optional

import torch
import torch.nn as nn 

import faiss

from datasets import (
    Dataset, 
    Features,
    Sequence,
    Value, 
    load_dataset
)
from transformers import (
    DPRContextEncoder,
    DPRContextEncoderTokenizerFast,
    DPRQuestionEncoderTokenizerFast,
    RagRetriever,
    RagTokenForGeneration,
    RagSequenceForGeneration,
    RagTokenizer,
)

device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
questions = []
with open("data/questions.txt", "r") as f:
    questions = f.readlines()

questions = [q.strip() for q in questions]

In [3]:
questions

['When did the GARDASIL 9 recommendations change?',
 'What were the past 3 recommendation changes for GARDASIL 9?',
 'Is GARDASIL 9 recommended for Adults?',
 'Does the ACIP recommend one dose GARDASIL 9?']

In [4]:
from PyPDF2 import PdfReader

title = []
data = []
for paper in glob.glob("data/papers_pdf/*.pdf"):
    title.append(paper.split("/")[-1][:-4])
    reader = PdfReader(paper)

    text = []
    for page_idx in range(len(reader.pages)):
        text.append(reader.pages[page_idx].extract_text())
    text = " ".join(text)
    text = re.sub("\n", "", text)
    
    data.append(text)

for idx in range(len(data)):
    with open(f"data/papers_txt/{title[idx]}.txt", "w") as f:
        f.write(data[idx])

In [5]:
title = "" # []
data = "" # []
for path in glob.glob("data/papers_txt/*.txt"):
    # title.append(path.split("/")[-1][:-4])
    title += " " + path.split("/")[-1][:-4]
    with open(f"{path}", "r") as f:
        # data.append(" ".join(f.readlines()))
        data += " ".join(f.readlines())

title = [title]
data = [data]

In [6]:
len(data), len(title)

(1, 1)

In [7]:
def split_text(text: str, n=100, character=" ") -> List[str]:
    """Split the text every ``n``-th occurrence of ``character``"""
    text = text.split(character)
    return [character.join(text[i : i + n]).strip() for i in range(0, len(text), n)]


def split_documents(documents: dict) -> dict:
    """Split documents into passages"""
    titles, texts = [], []
    for title, text in zip(documents["title"], documents["text"]):
        if text is not None:
            for passage in split_text(text):
                titles.append(title if title is not None else "")
                texts.append(passage)
    return {"title": titles, "text": texts}

In [8]:
dataset = Dataset.from_dict({"title": title, "text": data})
dataset

Dataset({
    features: ['title', 'text'],
    num_rows: 1
})

In [9]:
# split the documents into passages of 100 words
dataset = dataset.map(split_documents, batched=True)
dataset

100%|██████████| 1/1 [00:00<00:00, 128.85ba/s]


Dataset({
    features: ['title', 'text'],
    num_rows: 478
})

In [10]:
def embed(documents: dict, ctx_encoder: DPRContextEncoder, ctx_tokenizer: DPRContextEncoderTokenizerFast) -> dict:
    """Compute the DPR embeddings of document passages"""
    input_ids = ctx_tokenizer(documents["title"], documents["text"], truncation=True, padding="longest", return_tensors="pt", is_split_into_words=True, max_length=128)["input_ids"]
    embeddings = ctx_encoder(input_ids.to(device=device), return_dict=True).pooler_output

    return {"embeddings": embeddings.detach().cpu().numpy()}

In [11]:
ctx_encoder = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base").to(device=device)
ctx_tokenizer = DPRContextEncoderTokenizerFast.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")

Some weights of the model checkpoint at facebook/dpr-ctx_encoder-single-nq-base were not used when initializing DPRContextEncoder: ['ctx_encoder.bert_model.pooler.dense.bias', 'ctx_encoder.bert_model.pooler.dense.weight']
- This IS expected if you are initializing DPRContextEncoder from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DPRContextEncoder from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DPRQuestionEncoderTokenizer'. 
The class this function is called from is 'DPRContextEncoderTokeniz

In [12]:
new_features = Features(
    {"text": Value("string"), "title": Value("string"), "embeddings": Sequence(Value("float32"))}
)

# compute the embeddings
dataset = dataset.map(
    partial(embed, ctx_encoder=ctx_encoder, ctx_tokenizer=ctx_tokenizer),
    batched=True,
    batch_size=1,
    features=new_features,
)

100%|██████████| 478/478 [00:05<00:00, 94.36ba/s]


In [13]:
dataset

Dataset({
    features: ['text', 'title', 'embeddings'],
    num_rows: 478
})

In [14]:
len(dataset["embeddings"][0])

768

In [15]:
index = faiss.IndexHNSWFlat(768, 128, faiss.METRIC_INNER_PRODUCT)
dataset.add_faiss_index("embeddings", custom_index=index)

100%|██████████| 1/1 [00:00<00:00, 102.95it/s]


Dataset({
    features: ['text', 'title', 'embeddings'],
    num_rows: 478
})

In [16]:
dataset.get_index("embeddings")

<datasets.search.FaissIndex at 0x7f0b53b53da0>

In [17]:
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="custom", indexed_dataset=dataset)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever).to(device)

  f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. "
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The t

In [18]:
# question_tokenizer = DPRQuestionEncoderTokenizerFast.from_pretrained("facebook/dpr-question_encoder-single-nq-base")

In [19]:
for question in questions:
    print(f"question: {question}")
    generated = model.generate(tokenizer.question_encoder(question, return_tensors="pt")["input_ids"].to(device))
    generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)[0]
    print(f"answer: {generated_string}") 
    print()

question: When did the GARDASIL 9 recommendations change?
answer:  1992

question: What were the past 3 recommendation changes for GARDASIL 9?
answer:  2003

question: Is GARDASIL 9 recommended for Adults?
answer: 

question: Does the ACIP recommend one dose GARDASIL 9?
answer:  2 - dose schedule



In [27]:
model = AutoModelForCausalLM.from_pretrained("tiiuae/falcon-40b", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-40b")

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Downloading: 100%|██████████| 38.3k/38.3k [00:00<00:00, 154kB/s] 
Downloading: 100%|██████████| 8.85G/8.85G [14:52<00:00, 10.7MB/s] 
Downloading: 100%|██████████| 8.86G/8.86G [14:35<00:00, 10.9MB/s]
Downloading: 100%|██████████| 8.86G/8.86G [13:20<00:00, 11.9MB/s]
Downloading: 100%|██████████| 8.86G/8.86G [13:21<00:00, 11.9MB/s]
Downloading: 100%|██████████| 8.86G/8.86G [13:22<00:00, 11.9MB/s]
Downloading: 100%|██████████| 8.86G/8.86G [13:21<00:00, 11.9MB/s]
Downloading: 100%|██████████| 8.86G/8.86G [13:19<00:00, 11.9MB/s]
Downloading: 100%|██████████| 8.86G/8.86G [13:20<00:00, 11.9MB/s]
Downloading: 100%|██████████| 7.05G/7.05G [10:45<00:00, 11.7MB/s]


: 

: 

In [23]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import transformers
import torch

model = "tiiuae/falcon-7b-instruct"

tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map="auto",
)
sequences = pipeline(
   "Girafatron is obsessed with giraffes, the most glorious animal on the face of this Earth. Giraftron believes all other animals are irrelevant when compared to the glorious majesty of the giraffe.\nDaniel: Hello, Girafatron!\nGirafatron:",
    max_length=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
    trust_remote_code=True,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")


ValueError: Loading tiiuae/falcon-7b-instruct requires you to execute the configuration file in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.