# RAG

Build a simple Retrieval-Augmented Generation (RAG) pipeline to demonstrate how it works.

Steps:
1. Document Store: Use an in-memory key-value store.
2. Retrieval: Use embeddings from GPT-2.
3. Generation: Use GPT-2 to generate a response.

In [7]:
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from transformers import BertTokenizer,BertModel
from bert import BERT
from bert_config import BERTConfig
# Sanity check: run the Hugging Face BERT model and the project's own BERT
# implementation on the same tokenized sentence pair and print both outputs.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_hf = BertModel.from_pretrained("bert-base-uncased")
text_1 = "Hello,world"
text_2 = "Second sentence"
# Passing two texts encodes them as a single sentence pair;
# return_tensors="pt" asks the tokenizer for PyTorch tensors.
encoded_input = tokenizer(text_1,text_2,return_tensors="pt")
print(encoded_input)
output_hf = model_hf(**encoded_input)
# Run the embedding and encoder sub-modules by hand for inspection.
# NOTE(review): only input_ids are passed here (no token_type_ids or
# attention mask), so this path may not match output_hf -- confirm intent.
embedding_output = model_hf.embeddings(encoded_input["input_ids"])
encoder_output = model_hf.encoder(embedding_output)
# print(encoder_output)
# print(output_hf)
# Hidden state at sequence position 0 for every item in the batch
# (presumably the [CLS] token, id 101 in the printed input_ids).
print(output_hf.last_hidden_state[:,0,:])

# Same input through the project's BERT class, presumably to compare its
# output against the Hugging Face values printed above.
model = BERT.from_pretrained(config=BERTConfig())
output = model(**encoded_input)
print(output)

{'input_ids': tensor([[ 101, 7592, 1010, 2088,  102, 2117, 6251,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[-4.1294e-01, -7.1324e-02,  1.3854e-01, -1.6372e-01,  1.9077e-02,
          4.8927e-02,  4.5613e-01,  3.8596e-01, -5.4269e-01, -5.7494e-01,
         -1.9105e-01,  1.0720e-01,  2.0602e-01,  5.7859e-01,  3.9898e-02,
         -3.0058e-02,  1.6359e-02,  4.8764e-01,  1.8919e-01, -4.8697e-01,
          3.2901e-01,  4.0074e-02, -1.2393e-03, -5.2827e-02,  6.9944e-02,
         -1.0699e-02, -4.7424e-01,  1.7889e-01, -3.4116e-01, -1.2349e-01,
         -1.7821e-01, -9.2627e-02,  1.0770e-01,  2.7475e-01,  2.9903e-01,
         -4.1094e-01, -2.5761e-01,  5.3917e-02, -1.6200e-01,  2.0917e-01,
          1.5174e-01, -1.5715e-01,  6.5028e-02, -1.6512e-01,  4.2797e-02,
         -2.3322e-03, -2.5327e+00, -2.0062e-01, -2.0991e-01,  3.8807e-02,
         -5.7647e-02, -2.8008e-01,  1.5989e-01,  6.6906e-02, -9.6201e-02,
    

In [1]:
# Compute and store the BERT embeddings for an SNLI split (the train split in this cell)
import torch
from torch.utils.data.dataloader import DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from transformers import BertTokenizer,BertModel
from bert import BERT
from bert_config import BERTConfig
from rag.snliDataset import snliDataset
from tqdm import tqdm

# Load the project's BERT encoder with its pre-trained weights.
model = BERT.from_pretrained(config=BERTConfig())
# Prefer the GPU, but fall back to the CPU so the cell still runs on
# machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
# The model is used purely for feature extraction below, so switch it to
# inference mode: any dropout layers would otherwise add noise to the
# stored embeddings.
# NOTE(review): assumes BERT is a torch.nn.Module (it already exposes .to()).
model.eval()
def dynamic_padding(data):
    """Collate a list of tokenized samples into one zero-padded batch.

    Each element of ``data`` is a mapping with the keys ``"input_ids"``,
    ``"attention_mask"`` and ``"token_type_ids"``; only index ``[0]`` of
    each value is used (presumably the values are ``(1, seq_len)`` tensors
    produced by a tokenizer with ``return_tensors="pt"`` -- TODO confirm).

    Sequences are right-padded with 0 to the length of the longest sample
    in the batch.

    Returns:
        dict with the keys ``"input_ids"``, ``"attention_masks"`` (note the
        plural, unlike the per-sample key) and ``"token_type_ids"``. The
        attention mask is returned as a ``torch.bool`` tensor.
    """
    input_ids = [item["input_ids"][0] for item in data]
    attention_masks = [item["attention_mask"][0] for item in data]
    token_type_ids = [item["token_type_ids"][0] for item in data]

    inputs_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    # Padding positions get mask value 0, i.e. False after the cast below.
    attention_masks_padded = pad_sequence(
        attention_masks, batch_first=True, padding_value=0
    )
    # Cast with .to() rather than torch.tensor(existing_tensor, ...): the
    # latter copy-constructs from a tensor and emits a UserWarning per batch.
    attention_masks_padded = attention_masks_padded.to(torch.bool)
    token_type_ids_padded = pad_sequence(
        token_type_ids, batch_first=True, padding_value=0
    )
    return {
        "input_ids": inputs_padded,
        "attention_masks": attention_masks_padded,
        "token_type_ids": token_type_ids_padded,
    }

# Encode every sample of one dataset split with the BERT model and save the
# stacked result to disk. The split name is kept in one place so the dataset
# and the output file name cannot drift apart.
split = "train"
sd = snliDataset(split)
batch_size = 8
dl = DataLoader(sd,batch_size=batch_size,collate_fn=dynamic_padding)
embeddings = []
with torch.no_grad():  # inference only: no autograd graph needed
    # Wrap the DataLoader itself (not enumerate(...)) so tqdm can read its
    # length and show a real progress bar with an ETA.
    for batch in tqdm(dl):
        output = model(batch["input_ids"].to(device),
                    batch["token_type_ids"].to(device),
                    batch["attention_masks"].to(device))
        # Move each batch off the device right away so results accumulate
        # in host memory instead of device memory.
        output = output.cpu()
        embeddings.append(output)

# Concatenate the per-batch outputs along the batch dimension.
embeddings = torch.cat(embeddings,dim=0)
torch.save(embeddings, f"embeddings_{split}.pt")



  from .autonotebook import tqdm as notebook_tqdm


Loading pre-trained weights for BERT


  attention_masks_padded = torch.tensor(attention_masks_padded,dtype=torch.bool)
68671it [21:56, 52.16it/s]


In [22]:
# Build an MLP classifier
from torch import nn
import torch.nn.functional as F
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_size: Number of input features.
        hidden_size: Width of the hidden layer.
        output_size: Number of output units (raw scores, no softmax).
        dropout_p: Dropout probability applied after the hidden activation.
            Defaults to 0.2, the value that was previously hard-coded.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_p=0.2):
        super().__init__()
        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return the unnormalized output scores for the input batch ``x``."""
        # Dropout is only active in train() mode; in eval() it is a no-op.
        x = self.dropout(F.relu(self.hidden_layer(x)))
        x = self.output_layer(x)
        return x


In [20]:
# Train loop
import torch
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from snliDataset import snliDataset
from torch.utils.data.dataloader import DataLoader


# Load the "train" split; the training cell builds its DataLoader from `sd`.
sd = snliDataset(split="train")


  embeddings_mat = torch.load(f"embeddings_{split}.pt")
  loss = criterion(outputs,torch.tensor(batch["label"]).to(device))


Epoch [1/10], Loss: 0.7873
Epoch [2/10], Loss: 0.7325
Epoch [3/10], Loss: 0.7154
Epoch [4/10], Loss: 0.7044
Epoch [5/10], Loss: 0.6944
Epoch [6/10], Loss: 0.6878
Epoch [7/10], Loss: 0.6814
Epoch [8/10], Loss: 0.6765
Epoch [9/10], Loss: 0.6715
Epoch [10/10], Loss: 0.6678


In [None]:
# Train the MLP classifier on the pre-computed embeddings held by `sd`.
device = "cuda"
# 768 input features (the width of the stored embeddings), 100 hidden
# units, 3 output classes.
mlp = MLP(768,100,3)
mlp.to(device)
criterion = nn.CrossEntropyLoss()
learning_rate = 1e-4
optimizer = torch.optim.Adam(mlp.parameters(),lr=learning_rate)
n_epochs = 20
train_loader = DataLoader(sd,batch_size=32)

for epoch in range(n_epochs):
    mlp.train()  # enable dropout for training
    running_loss = 0.0

    for batch in train_loader:
        outputs = mlp(batch["embedding"].to(device))
        # The collated labels are already a tensor; move them directly
        # instead of re-wrapping with torch.tensor(), which copy-constructs
        # and emits a UserWarning.
        loss = criterion(outputs,batch["label"].to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Mean per-batch loss over the epoch.
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}")



In [21]:
# Evaluation
# Measure the classification accuracy of the trained MLP on the test split.
mlp.eval()  # switch off dropout for prediction
correct, total = 0, 0
sd = snliDataset(split="test")
test_loader = DataLoader(sd,batch_size=32)
with torch.no_grad():
    for batch in test_loader:
        labels = batch["label"]
        outputs = mlp(batch["embedding"].to(device))
        # Predicted class = index of the largest score along dim 1.
        predicted = torch.max(outputs, 1).indices
        total += labels.size(0)
        matches = predicted == labels.to(device)
        correct += matches.sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")

Test Accuracy: 73.19%


In [18]:
# Inspect the stored dev-split embeddings: print the matrix size and the
# first row.
# weights_only=True limits unpickling to tensors and plain containers, which
# is all this file should contain, and avoids running arbitrary pickled code.
# NOTE(review): this likely also silences the warning torch.load emits on
# this line -- confirm against the installed torch version.
emb_mat = torch.load("embeddings_dev.pt", weights_only=True)
print(emb_mat.size())
print(emb_mat[0])

torch.Size([9842, 768])
tensor([-8.3870e-02, -2.0120e-01,  2.8359e-01,  8.3558e-02,  1.5864e-01,
        -3.3191e-01,  1.8555e-02,  3.4614e-01, -1.2018e-01, -5.8454e-02,
         1.5888e-01, -7.4928e-02, -1.1119e-01,  2.8032e-01, -4.3628e-02,
         1.0782e-01, -2.3436e-01,  2.5837e-01,  5.0681e-02,  2.0824e-01,
         2.3817e-01, -2.7823e-01, -1.1665e-01,  1.1745e-01,  1.8096e-01,
        -1.6913e-01, -1.4109e-01,  2.0160e-02,  7.0707e-02, -2.0918e-01,
         2.5349e-01,  1.3314e-01, -5.6560e-02, -1.0323e-01,  1.1722e-01,
        -5.7964e-03,  1.4605e-02, -5.4307e-02, -6.8620e-02,  1.0881e-01,
        -3.0531e-01, -3.0762e-01,  9.1356e-02,  1.3269e-01, -1.8626e-01,
        -4.8844e-01,  4.4068e-01,  1.9068e-02, -7.5594e-02,  1.2931e-01,
        -1.0715e-01,  1.6343e-01,  3.5730e-02,  2.6090e-03,  2.4221e-01,
         5.0100e-01, -3.6525e-01, -1.7202e-01, -1.9746e-01, -2.0829e-01,
         7.8090e-02, -3.6817e-02,  2.4409e-01, -3.0138e-01, -1.5168e-02,
         9.8411e-02, -1.299

  emb_mat = torch.load("embeddings_dev.pt")
