# RAG

Build a simple Retrieval-Augmented Generation (RAG) pipeline to demonstrate how it works.

Steps:
1. Document Store: Use an in-memory key-value store.
2. Retrieval: Use embeddings from GPT-2.
3. Generation: Use GPT-2 to generate a response.

In [None]:
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from transformers import BertTokenizer,BertModel
from bert import BERT
from bert_config import BERTConfig
# Load the Hugging Face tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# --- Disabled scratch code below ---
# It tokenizes a sentence pair and prints outputs of `model_hf` and of the
# project's `BERT` model.
# NOTE(review): `model_hf` is not defined in this cell; presumably it was a
# Hugging Face BertModel instance — confirm before re-enabling.
# text_1 = "Hello,world"
# text_2 = "Second sentence"
# encoded_input = tokenizer(text_1,text_2,return_tensors="pt")
# print(encoded_input)
# output_hf = model_hf(**encoded_input)
# embedding_output = model_hf.embeddings(encoded_input["input_ids"])
# encoder_output = model_hf.encoder(embedding_output)
# # print(encoder_output)
# print(output_hf)
# print(output_hf.last_hidden_state[:,0,:])

# model = BERT.from_pretrained(config=BERTConfig())
# output = model(**encoded_input)
# print(output)

In [6]:
# Store the embeddings for the dev set
import torch
from torch.utils.data.dataloader import DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from transformers import BertTokenizer,BertModel
from transformers import DataCollatorWithPadding
from bert import BERT
from bert_config import BERTConfig
from rag.snliDataset import snliDataset, snliEmbeddings
from tqdm import tqdm

# Module-level setup shared by the functions below: the project's BERT model
# (built via its `from_pretrained` constructor) and the Hugging Face tokenizer
# for the "bert-base-uncased" checkpoint.
model = BERT.from_pretrained(config=BERTConfig())
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Hard-coded to the GPU; this cell fails on a machine without CUDA.
device="cuda"
model.to(device)
# The model is only used for inference here (under torch.no_grad() below).
# NOTE(review): presumably BERT is an nn.Module, so eval() disables dropout — confirm.
model.eval()
# Disabled alternative: the Hugging Face reference implementation.
# model_hf = BertModel.from_pretrained("bert-base-uncased")
# model_hf.to(device)
# model_hf.eval()
def dynamic_padding(data,device="cuda"):
    """
    Collate a list of SNLI items into one tokenized, padded batch.

    Each item is read through its "sentence1", "sentence2" and "label" keys.
    The sentence pairs are tokenized together with padding enabled and
    truncation at 512 tokens, the attention mask is cast to bool, and every
    tensor is moved to `device`.

    Args:
        data: list of dataset items (mappings with the three keys above).
        device: target device for the tokenizer tensors.

    Returns:
        (encoded, labels): `encoded` is a plain dict holding the tokenizer's
        tensors on `device` plus the raw text lists under "sentence1" and
        "sentence2"; `labels` is the list of per-item labels, in batch order.
    """
    s1 = [item["sentence1"] for item in data]
    s2 = [item["sentence2"] for item in data]
    labels = [item["label"] for item in data]
    encoded = tokenizer(s1,s2,padding=True,truncation=True,return_tensors="pt",max_length=512)
    # NOTE(review): presumably the project's BERT expects a boolean mask — confirm.
    encoded["attention_mask"] = encoded["attention_mask"].bool()
    # Move only the tensors. The raw strings are attached afterwards because
    # Python lists have no `.to()` method.
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}
    encoded["sentence1"] = s1
    encoded["sentence2"] = s2
    return encoded,labels


def prepare_data(output_filename:str):
    """
    Embed the SNLI "dev" split with the module-level BERT model and save it.

    Two artifacts are written:
      * f"embeddings_{split}.pt": a single tensor with one row per example,
        written with torch.save.
      * `output_filename`: a JSON list with one record per example:
        {
            'sentence1': raw text of sentence1,
            'sentence2': raw text of sentence2,
            'embedding': [model output averaged over dim 1, as floats],
            'label': int
        }

    Args:
        output_filename: path of the JSON file to write.
    """
    import json  # local import: only this function needs it

    split = "dev"
    sd = snliDataset(split)
    batch_size = 64
    dl = DataLoader(sd,batch_size=batch_size,collate_fn=dynamic_padding)

    # Entries that dynamic_padding adds for bookkeeping; they are not model inputs.
    text_keys = ("sentence1", "sentence2")

    data = []
    embeddings = []
    with torch.no_grad():
        for encoded,labels in tqdm(dl):
            model_inputs = {key: value for key, value in encoded.items() if key not in text_keys}
            output = model(**model_inputs).cpu()
            # Assumes output is (batch, seq, hidden) — TODO confirm.
            # NOTE(review): a plain mean over dim=1 also averages padded
            # positions; consider weighting by the attention mask.
            embedding = torch.mean(output,dim=1)
            embeddings.append(embedding)
            # zip() stops at the real batch length, so the final (smaller)
            # batch is handled without indexing past its end.
            for text1, text2, vector, label in zip(encoded["sentence1"], encoded["sentence2"], embedding, labels):
                data.append({
                    "sentence1": text1,
                    "sentence2": text2,
                    "embedding": vector.tolist(),
                    "label": int(label),
                })
    embeddings = torch.cat(embeddings)
    torch.save(embeddings, f"embeddings_{split}.pt")
    with open(output_filename, "w", encoding="utf-8") as f:
        json.dump(data, f)





Loading pre-trained weights for BERT


100%|██████████| 42259/42259 [22:46<00:00, 30.92it/s]


In [5]:
# Load the saved train-split embeddings and print their shape as a sanity check.
# weights_only=True restricts unpickling to tensors and plain containers, which
# is all this file holds, and avoids running arbitrary pickled code.
t = torch.load("embeddings_train.pt", weights_only=True)
print(t.size())

  t = torch.load("embeddings_train.pt")


torch.Size([549367, 768])


In [7]:
# Build a MLP classifier
from torch import nn
import torch.nn.functional as F
class MLP(nn.Module):
    """
    Two-layer perceptron: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_size: number of features in each input vector.
        hidden_size: width of the hidden layer.
        output_size: number of output values produced per input.
        dropout: dropout probability applied after the hidden activation.
            Defaults to 0.0 (no dropout), matching the previous behaviour.
    """
    def __init__(self,input_size,hidden_size,output_size,dropout=0.0):
        super().__init__()
        self.hidden_layer = nn.Linear(input_size,hidden_size)
        self.output_layer = nn.Linear(hidden_size,output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self,x):
        """Return the raw (un-normalized) outputs of the final linear layer for `x`."""
        x = self.dropout(F.relu(self.hidden_layer(x)))
        x = self.output_layer(x)
        return x


In [9]:
# Train loop
import torch
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from snliDataset import snliDataset, snliEmbeddings
from torch.utils.data.dataloader import DataLoader

# Build the dataset used by the training loop in the next cell.
# NOTE(review): presumably snliEmbeddings pairs the saved embeddings for the
# given split with their labels (batches expose "embedding" and "label") —
# confirm against snliDataset.py.
split = "train"
se_train = snliEmbeddings(split=split)


In [10]:
# Train the MLP classifier on the pre-computed embeddings.
device = "cuda"
# 768 input features, 100 hidden units, 3 outputs.
mlp = MLP(768,100,3)
mlp.to(device)
criterion = nn.CrossEntropyLoss()
learning_rate = 1e-4
optimizer = torch.optim.Adam(mlp.parameters(),lr=learning_rate)
n_epochs = 50
# NOTE(review): the loader does not shuffle, so every epoch sees the examples
# in the same order; consider shuffle=True for training.
train_loader = DataLoader(se_train,batch_size=13)

for epoch in range(n_epochs):
    mlp.train()
    running_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()
        outputs = mlp(batch["embedding"].to(device))
        # The collated labels are already a tensor, so move them directly
        # instead of re-wrapping with torch.tensor(), which copies the data
        # and emits a UserWarning.
        loss = criterion(outputs,batch["label"].to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Report the mean per-batch loss for the epoch.
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}")



  loss = criterion(outputs,torch.tensor(batch["label"]).to(device))


Epoch [1/50], Loss: 0.8239
Epoch [2/50], Loss: 0.7628
Epoch [3/50], Loss: 0.7382
Epoch [4/50], Loss: 0.7217
Epoch [5/50], Loss: 0.7093
Epoch [6/50], Loss: 0.6993
Epoch [7/50], Loss: 0.6911
Epoch [8/50], Loss: 0.6841
Epoch [9/50], Loss: 0.6781
Epoch [10/50], Loss: 0.6728
Epoch [11/50], Loss: 0.6682
Epoch [12/50], Loss: 0.6641
Epoch [13/50], Loss: 0.6603
Epoch [14/50], Loss: 0.6569
Epoch [15/50], Loss: 0.6538
Epoch [16/50], Loss: 0.6508
Epoch [17/50], Loss: 0.6480
Epoch [18/50], Loss: 0.6455
Epoch [19/50], Loss: 0.6431
Epoch [20/50], Loss: 0.6408
Epoch [21/50], Loss: 0.6387
Epoch [22/50], Loss: 0.6367
Epoch [23/50], Loss: 0.6346
Epoch [24/50], Loss: 0.6327
Epoch [25/50], Loss: 0.6310
Epoch [26/50], Loss: 0.6292
Epoch [27/50], Loss: 0.6276
Epoch [28/50], Loss: 0.6261
Epoch [29/50], Loss: 0.6246
Epoch [30/50], Loss: 0.6232
Epoch [31/50], Loss: 0.6218
Epoch [32/50], Loss: 0.6205
Epoch [33/50], Loss: 0.6193
Epoch [34/50], Loss: 0.6181
Epoch [35/50], Loss: 0.6168
Epoch [36/50], Loss: 0.6157
E

In [15]:
# Evaluation: accuracy of the trained MLP on the test-split embeddings.
se_test = snliEmbeddings(split="test")
test_loader = DataLoader(se_test,batch_size=32)

mlp.eval()
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        labels = batch["label"]
        logits = mlp(batch["embedding"].to(device))
        # The predicted class is the index of the largest output in each row.
        predicted = logits.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels.to(device)).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 71.78%
