# RAG

Build a simple Retrieval-Augmented Generation pipeline to demonstrate its working.

Steps:
1. Use Sentence BERT to create embeddings
2. Document Store: Use in-memory key-value store.
3. Retrieval: Use embeddings from GPT-2
4. Generation: Use GPT-2 for generating a response

In [None]:
# Store the embeddings for the dev set
import torch
from torch.utils.data.dataloader import DataLoader
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from transformers import BertTokenizer
from bert import BERT
from bert_config import BERTConfig
from rag.snliDataset import snliDataset, snliEmbeddings
from tqdm import tqdm

model = BERT.from_pretrained(config=BERTConfig())
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
device="cuda"
model.to(device)
model.eval()

def dynamic_padding(data,device="cuda"):
    s1 = [item["sentence1"] for item in data]
    s2 = [item["sentence2"] for item in data]
    labels = [item["label"] for item in data]
    encoded = tokenizer(s1,s2,padding=True,truncation=True,return_tensors="pt",max_length=512)
    encoded["attention_mask"] = encoded["attention_mask"].bool()
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}
    return encoded,labels

def prepare_data(split: str, output_filename:str):
    """
    Store the training data in the following as a json file. Format:
    {
        'input_ids': [tokenized input ids for sentence 1, sentence2]
        'embedding': [output of bert for the sentence1, sentence2 input],
        'label': int (0,1,2)
    }
    """
    sd = snliDataset(split)
    batch_size = 64
    dl = DataLoader(sd,batch_size=batch_size,collate_fn=dynamic_padding)    

    data = []
    with torch.no_grad():
        for encoded,labels in tqdm(dl):
            
            output = model(**encoded)
            output = output.cpu()
            embedding = output
            seq_lens = torch.sum(encoded["attention_mask"],dim=1)
            for i in range(len(encoded)):
                
                data_item = {
                    "input_ids": encoded["input_ids"][i][:seq_lens[i]].cpu(),
                    "embedding": embedding[i],
                    "label": labels[i],
                }
                data.append(data_item)

    torch.save({"data": data}, output_filename)
    
    print(f"Wrote {split} data to {output_filename}")

# prepare_data("dev", "dev_data.pt")
# prepare_data("test", "test_data.pt")
prepare_data("train","train_data.pt")

In [None]:
# Build a MLP classifier
from torch import nn
import torch.nn.functional as F
class MLP(nn.Module):
    def __init__(self,input_size,hidden_size,output_size):
        super(MLP, self).__init__()
        self.hidden_layer = nn.Linear(input_size,hidden_size)
        self.output_layer = nn.Linear(hidden_size,output_size)
        self.dropout = nn.Dropout(0.1)
    
    def forward(self,x):
        x = self.dropout(F.relu(self.hidden_layer(x)))
        x = self.output_layer(x)
        return x


In [None]:
# Train loop
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from snliDataset import snliEmbeddings

split = "train"
se_train = snliEmbeddings(split=split)


In [None]:
device = "cuda"
mlp = MLP(768,256,3)
mlp.to(device)
criterion = nn.CrossEntropyLoss()
learning_rate = 1e-3
optimizer = torch.optim.Adam(mlp.parameters(),lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=100,gamma=0.5)
n_epochs = 500
train_loader = DataLoader(se_train,batch_size=64,shuffle=True)

for epoch in range(n_epochs):
    mlp.train()
    running_loss = 0.0

    for i,batch in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = mlp(batch["embedding"].to(device))
        loss = criterion(outputs,torch.tensor(batch["label"]).to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    scheduler.step()        
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}")



In [None]:
# Evaluation
mlp.eval()
correct, total = 0, 0
se_test = snliEmbeddings(split="test")
test_loader = DataLoader(se_test,batch_size=32)
with torch.no_grad():
    for batch in test_loader:
        outputs = mlp(batch["embedding"].to(device))
        _, predicted = torch.max(outputs, 1)
        # print(predicted, batch["label"])
        total += batch["label"].size(0)
        correct += (predicted == batch["label"].to(device)).sum().item()
    
print(f"Test Accuracy: {100 * correct / total:.2f}%")

Sentence BERT Training

In [None]:
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from rag.train import Trainer
from rag.snliDataset import sentenceBERTDataset
from bert_config import BERTConfig, BERTTrainConfig

train_set = sentenceBERTDataset("train")
val_set = sentenceBERTDataset("dev")
train_config = BERTTrainConfig()
model_config = BERTConfig()

trainer = Trainer(train_set, val_set,model_config, train_config)

In [None]:
trainer.train()

In [None]:
# Evaluation
import torch
from rag.snliDataset import sentenceBERTDataset
from bert_config import BERTConfig, BERTTrainConfig

from sentenceBERT import sentenceBERT
from torch.utils.data.dataloader import DataLoader
from bert_utils import dynamic_padding

device = "cuda"
test_set = sentenceBERTDataset("test")
ckpt_path = "out/bert_ckpt_train.pt"
ckpt = torch.load(ckpt_path)
model = sentenceBERT(BERTConfig())
model.load_state_dict(ckpt["model"])
model.to(device)

correct, total = 0, 0
test_loader = DataLoader(test_set,batch_size=8,collate_fn=dynamic_padding)
with torch.no_grad():
    for batch in test_loader:
        outputs = model(batch["sentence1"],batch["sentence2"])
        _, predicted = torch.max(outputs, 1)
        total += batch["label"].size(0)
        correct += (predicted == batch["label"].to(device)).sum().item()
    
print(f"Test Accuracy: {100 * correct / total:.2f}%")

## Creating a Knowledge Base

To build out a simple RAG, use a few text documents as the knowledge base.
Run sentence BERT on these documents, on overlapping blocks of text and store the vectors.
Accept a query from the user
Run sentence BERT on the query, retrieve the 3 most relevant embeddings from the knowledge base using cosine distance
Provide the user query, retrieved embeddings to GPT-2 and generate a response.


For the knowledge base, I copied the text about Tour de France from Wikipedia and saved it in a text file.

In [1]:
# Run SBERT on the text document and save the embeddings
from transformers import BertTokenizer
import torch
fpath = "tdf.txt"
text = open(fpath).read()
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
embedding_size = 256
encoded = tokenizer(text,return_tensors="pt")
enc_size = len(encoded.input_ids[0])
input_ids_1 = torch.zeros(enc_size // embedding_size + 1,embedding_size)


zeros = torch.zeros((enc_size//embedding_size+1)*embedding_size-enc_size)
# print(len(zeros))
# print(enc_size)

input_ids_1 = torch.reshape(torch.cat((zeros,encoded.input_ids[0])), (enc_size//embedding_size+1, embedding_size))
attention_1 = torch.reshape(torch.cat((zeros,encoded.attention_mask[0])), (enc_size//embedding_size+1, embedding_size))
token_1 = torch.reshape(torch.cat((zeros,encoded.token_type_ids[0])), (enc_size//embedding_size+1, embedding_size))

input_ids_2 = torch.reshape(torch.cat((encoded.input_ids[0],zeros)), (enc_size//embedding_size+1, embedding_size))
attention_2 = torch.reshape(torch.cat((encoded.attention_mask[0],zeros)), (enc_size//embedding_size+1, embedding_size))
token_2 = torch.reshape(torch.cat((encoded.token_type_ids[0],zeros)), (enc_size//embedding_size+1, embedding_size))

input_ids = torch.hstack((input_ids_1,input_ids_2))
attention_mask = torch.hstack((attention_1,attention_2))
token_type_ids = torch.hstack((token_1,token_2))




Token indices sequence length is longer than the specified maximum sequence length for this model (12775 > 512). Running this sequence through the model will result in indexing errors


In [1]:
import torch
import numpy as np
from sentenceBERT import sentenceBERT
from bert_config import BERTConfig

device = "cuda"
ckpt_path = "out/bert_ckpt_train.pt"
ckpt = torch.load(ckpt_path)
model = sentenceBERT(BERTConfig())
model.load_state_dict(ckpt["model"])
model.to(device)

fpath = "/home/varun/projects/experiments-with-gpt2/rag/tdf.txt"
model.encode(fpath)


  ckpt = torch.load(ckpt_path)


Loading pre-trained weights for bert


Token indices sequence length is longer than the specified maximum sequence length for this model (12775 > 512). Running this sequence through the model will result in indexing errors


In [2]:
import faiss

encode_output = np.load("doc_embeddings.npz")
embeddings = encode_output["embeddings"]
input_ids = encode_output["input_ids"]
attention_mask = encode_output["attention_mask"]

index = faiss.IndexFlatL2(embeddings.shape[1])

# faiss.normalize_L2(embeddings)
index.add(embeddings)


In [2]:

from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
query = "Name the three grand tours?"
input = tokenizer(query,return_tensors="pt")
input["attention_mask"] = input["attention_mask"].bool()
input= {key: tensor.to(device) for key, tensor in input.items()}
print(input["input_ids"].size())
with torch.no_grad():
    q_embedding = model.bert(**input).cpu().numpy()
dist, ann = index.search(q_embedding,2)

print(dist, ann)
for n in ann[0]:
    print(tokenizer.decode(input_ids[n,:]))


NameError: name 'device' is not defined

In [1]:
from rag import RAG
import torch
from sentenceBERT import sentenceBERT
from bert_config import BERTConfig
from gpt import GPT
from gpt_config import GPTConfig

device = "cuda"
ckpt_path = "out/bert_ckpt_train.pt"
ckpt = torch.load(ckpt_path)
embedding_model_config = BERTConfig()
embedding_model = sentenceBERT(embedding_model_config)
embedding_model.to(device)

generate_model = GPT.from_pretrained(GPTConfig(block_size=1024))
generate_model.to(device)
embedding_model_size = embedding_model_config.embedding_size
sentence_size = 3
overlap_size = 1
k = 5

rag = RAG(embedding_model,embedding_model_size,generate_model,sentence_size,overlap_size,k)
rag.add_to_knowledge_base(["/home/varun/projects/experiments-with-gpt2/rag/tdf.txt"])


  ckpt = torch.load(ckpt_path)


Loading pre-trained weights for bert
Loading pre-trained weights for gpt2
Number of parameters: 123.65M


In [2]:
query = "Who won the tour in 2024?"
response = rag.get_response(query)
print(response)

Query: Who won the tour in 2024?, Response:  20:59: Kim Racis has won the


In [4]:
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")
from gpt import GPT
from gpt_config import GPTConfig
import torch
torch.manual_seed(23)
device="cuda"
model = GPT.from_pretrained(GPTConfig(block_size=1024))
model.to(device)

print(model.generate("A long time ago", max_new_tokens=15))

Loading pre-trained weights for gpt2
Number of parameters: 123.65M
A long time ago a lot of things, people are going to "Well. I want to
