# RAG

Build a simple Retrieval-Augmented Generation (RAG) pipeline to demonstrate how it works.

Steps:
1. Document Store: Use an in-memory key-value store.
2. Retrieval: Use embeddings from GPT-2. (Note: the cells below currently compute the embeddings with BERT.)
3. Generation: Use GPT-2 to generate a response.

In [None]:
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from transformers import BertTokenizer,BertModel
from bert import BERT
from bert_config import BERTConfig
# Load the pretrained tokenizer for the "bert-base-uncased" checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# --- Disabled scratch code ---------------------------------------------------
# Encodes a sentence pair and inspects the Hugging Face model's outputs.
# NOTE(review): `model_hf` is not defined in this cell; it must be created
# before these lines can be re-enabled.
# text_1 = "Hello,world"
# text_2 = "Second sentence"
# encoded_input = tokenizer(text_1,text_2,return_tensors="pt")
# print(encoded_input)
# output_hf = model_hf(**encoded_input)
# embedding_output = model_hf.embeddings(encoded_input["input_ids"])
# encoder_output = model_hf.encoder(embedding_output)
# # print(encoder_output)
# print(output_hf)
# print(output_hf.last_hidden_state[:,0,:])

# Disabled: run the same encoded input through the local BERT class.
# model = BERT.from_pretrained(config=BERTConfig())
# output = model(**encoded_input)
# print(output)

In [3]:
# Store the embeddings for the dev set
import torch
from torch.utils.data.dataloader import DataLoader
from torch.nn.utils.rnn import pad_sequence
import numpy as np
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from transformers import BertTokenizer,BertModel
from transformers import DataCollatorWithPadding
from bert import BERT
from bert_config import BERTConfig
from rag.snliDataset import snliDataset, snliEmbeddings
from tqdm import tqdm

# Device shared by both encoders below.
device = "cuda"

# Tokenizer for the "bert-base-uncased" checkpoint (used by dynamic_padding).
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# The project's own BERT class and the Hugging Face reference model.
model = BERT.from_pretrained(config=BERTConfig())
model_hf = BertModel.from_pretrained("bert-base-uncased")

# Move each encoder to the device and switch it to evaluation mode.
for encoder in (model, model_hf):
    encoder.to(device)
    encoder.eval()
def dynamic_padding(data,device="cuda"):
    """Collate a list of dataset items into one tokenized, padded batch.

    Args:
        data: Iterable of items, each indexable by the keys "sentence1",
            "sentence2" and "label".
        device: Device the tokenizer outputs are moved to.

    Returns:
        A tuple ``(encoded, labels)``. ``encoded`` is a dict of the tokenizer's
        output tensors on ``device`` with "attention_mask" cast to bool;
        ``labels`` is a list of the items' labels in input order.
    """
    first_sentences, second_sentences, labels = [], [], []
    for item in data:
        first_sentences.append(item["sentence1"])
        second_sentences.append(item["sentence2"])
        labels.append(item["label"])

    # Pad to the longest pair in this batch; truncate pairs beyond 512 tokens.
    batch = tokenizer(
        first_sentences,
        second_sentences,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    batch["attention_mask"] = batch["attention_mask"].bool()

    on_device = {}
    for name, tensor in batch.items():
        on_device[name] = tensor.to(device)
    return on_device, labels


# Dataset split whose sentence pairs are encoded below.
split = "dev"
sd = snliDataset(split)

# Small batches: this cell only inspects the first batch.
batch_size = 2
dl = DataLoader(sd, collate_fn=dynamic_padding, batch_size=batch_size)

# Accumulator for the (currently disabled) embedding export at the bottom.
embeddings = []

with torch.no_grad():
    for encoded, labels in tqdm(dl):
        # Run the same inputs through the local BERT and the HF reference.
        output = model(**encoded)
        output_hf_batch = model_hf(**encoded)

        # Mean of the HF last hidden state over dim 1, compared against `output`.
        hf_mean = torch.mean(output_hf_batch.last_hidden_state, dim=1)

        print(output)
        print("-------------")
        print(hf_mean)
        # embedding_output = model_hf.embeddings(encoded["input_ids"])
        # print(f"HF embedding: {embedding_output[0,0,:]}")
        print(torch.isclose(output, hf_mean))

        # Stop after the first batch.
        break

        # output = output_hf_batch.last_hidden_state.cpu()
        # embeddings.append(torch.mean(output,dim=1))

# embeddings = torch.cat(embeddings)
# torch.save(embeddings, f"embeddings_{split}.pt")



Loading pre-trained weights for BERT


  0%|          | 0/4921 [00:00<?, ?it/s]

tensor([[False,  True,  True,  ...,  True,  True,  True],
        [False, False, False,  ..., False, False, False]], device='cuda:0')





In [None]:
# Build an MLP classifier
from torch import nn
import torch.nn.functional as F
class MLP(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_size: Size of the last dimension of the input.
        hidden_size: Number of units in the hidden layer.
        output_size: Number of output features (raw scores; no softmax is
            applied here).
        dropout_p: Dropout probability applied after the hidden activation.
            Defaults to 0.2, the value that was previously hard-coded.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int,
                 dropout_p: float = 0.2):
        super().__init__()
        # Attribute names are kept stable so existing state_dicts still load.
        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return the output-layer scores for ``x``.

        ``x`` must have ``input_size`` as its last dimension; the result has
        ``output_size`` as its last dimension. Dropout is only active in
        training mode.
        """
        hidden = self.dropout(F.relu(self.hidden_layer(x)))
        return self.output_layer(hidden)


In [None]:
# Train loop
import torch
import sys
sys.path.append("/home/varun/projects/experiments-with-gpt2/")

from snliDataset import snliDataset, snliEmbeddings
from torch.utils.data.dataloader import DataLoader

# Split passed to snliEmbeddings to build the classifier's training data.
# NOTE(review): this is "dev" even though the dataset is named `se_train` —
# confirm that training on the dev split is intended.
split = "dev"
se_train = snliEmbeddings(split=split)


In [None]:
# Train the MLP classifier on the embedding dataset.
device = "cuda"
mlp = MLP(768,100,3)  # 768 input features, 100 hidden units, 3 output classes
mlp.to(device)
criterion = nn.CrossEntropyLoss()
learning_rate = 1e-4
optimizer = torch.optim.Adam(mlp.parameters(),lr=learning_rate)
n_epochs = 50
train_loader = DataLoader(se_train,batch_size=64,shuffle=True)

for epoch in range(n_epochs):
    mlp.train()  # enable dropout for the training pass
    running_loss = 0.0

    for batch in train_loader:
        optimizer.zero_grad()
        outputs = mlp(batch["embedding"].to(device))
        # The collated labels are already a tensor (the evaluation cell calls
        # .size()/.to() on them), so move them directly. Wrapping them in
        # torch.tensor(...) made a needless copy and raised a UserWarning.
        loss = criterion(outputs,batch["label"].to(device))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        
    # Report the mean per-batch loss for this epoch.
    print(f"Epoch [{epoch+1}/{n_epochs}], Loss: {running_loss/len(train_loader):.4f}")



In [None]:
# Evaluation
# Evaluation: accuracy of the trained MLP on the "test" split embeddings.
se_test = snliEmbeddings(split="test")
test_loader = DataLoader(se_test, batch_size=1)
# test_loader = DataLoader(sd,batch_size=32)

mlp.eval()  # disable dropout while scoring
correct = 0
total = 0
with torch.no_grad():
    for batch in test_loader:
        targets = batch["label"].to(device)
        # Predicted class = index of the largest score along dim 1.
        predicted = mlp(batch["embedding"].to(device)).argmax(dim=1)
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

print(f"Test Accuracy: {100 * correct / total:.2f}%")