In [2]:
# from w2v import Word2Vec
# from data import SquadDataset
# from rnn import RNN
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset
from transformers import BertTokenizerFast, BertModel
import time

In [3]:
class SquadDataset(torch.utils.data.Dataset):
	'''
	- Creates batches dynamically by padding to the length of largest example
	  in a given batch.
	- Calulates character vectors for contexts and question.
	- Returns tensors for training.
	'''
	
	def __init__(self, data, batch_size, tokenizer, max_length ):
		
		self.batch_size = batch_size
		data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
		self.data = data
		# self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
		self.tokenizer = tokenizer
		self.max_length = max_length
		
	def __len__(self):
		return len(self.data)
	
	def __iter__(self):
		'''
		Creates batches of data and yields them.
		
		Each yield comprises of:
		:padded_context: padded tensor of contexts for each batch 
		:padded_question: padded tensor of questions for each batch 
		:label: 
		
		'''
		
		for batch in self.data:
			questions = self.tokenizer(batch['question'], max_length = self.max_length, padding='max_length', truncation=True, return_tensors='pt')
			contexts = self.tokenizer(batch['sentence'], max_length = self.max_length, padding='max_length', truncation=True, return_tensors='pt')
			labels = torch.IntTensor(batch['label']).to(torch.int8)
			# question, context include input_ids, attention_mask, token_type_ids
			yield questions['input_ids'], contexts['input_ids'], labels
			
		

In [4]:
class Word2Vec(nn.Module):
	def __init__(self, vocab_size, embed_size, BERT = False): 
		super(Word2Vec, self).__init__()
		if BERT:
			model = BertModel.from_pretrained('bert-base-cased')
			self.embeddings = model.embeddings.word_embeddings
			self.embeddings.requires_grad_(False)
		else:	
			self.embeddings = nn.Embedding(vocab_size, embed_size)
			torch.nn.init.xavier_uniform_(self.embeddings.weight)
	def forward(self, x):
		x = self.embeddings(x)
		return x

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNe(nn.Module):
	def __init__(self, embed_size, hidden_size, num_layers, c_len):
		super(RNNe, self).__init__()
		self.encq = nn.RNN(embed_size, hidden_size, num_layers,batch_first=True, dropout=0.2, bidirectional=True)
		self.encc = nn.RNN(embed_size, hidden_size, num_layers,batch_first=True, dropout=0.2, bidirectional=True) 
		self.Wmodel = nn.Linear(hidden_size*2, 1)
		torch.nn.init.xavier_uniform_(self.Wmodel.weight)
		self.Wout = nn.Linear(c_len, 2)
		torch.nn.init.xavier_uniform_(self.Wout.weight)
		self.num_layers = num_layers
		self.hidden_size = hidden_size
		# self.h0 = torch.zeros(2*num_layers, hidden_size*2)

	
	def forward(self, c, q):
		c_len = c.size(1)
		bs = c.size(0)
		h0 = torch.zeros(2*self.num_layers, bs, self.hidden_size, device='cuda')
		_ , hid_q = self.encq(q, h0) # hid = [2*num_layer, bs , hidden_size*2]

		encc, _ = self.encc(c, hid_q)  # [bs, c_len, hidden_size*2]

		out1 = self.Wmodel(encc) # [bs, c_len, 1] 
		out1 = out1.squeeze(-1) # [bs, c_len]
		out2 = self.Wout(F.relu(out1))
		return out2


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
dataset = load_dataset("nyu-mll/glue", "qnli")


In [7]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

In [8]:
random_train = dataset['train'].select(range(2269,12269))
random_val = dataset['validation'].select(range(2269,3269))
random_test = dataset['validation'].select(range(3269,4269))

In [None]:
max_length = 128

In [9]:
train_data = SquadDataset(random_train, 32, tokenizer, max_length)
validation_data = SquadDataset(random_val, 32, tokenizer, max_length)
test_data = SquadDataset(random_test, 32, tokenizer, max_length)

In [27]:
class RNNqnli(nn.Module):
	def __init__(self, config):
		super(RNNqnli, self).__init__()
		self.w2v = Word2Vec(config['vocab_size'], config['embed_size'],  config['BERT'])
		self.rnn = RNNe(config['embed_size'], config['hidden_size'], config['num_layers'], config['c_len'])

	
	def forward(self, c, q):
		c = self.w2v(c)
		q = self.w2v(q)
		out = self.rnn(c, q)
		return out

In [11]:
vocab_size = tokenizer.vocab_size

In [21]:
args = {
	'vocab_size': tokenizer.vocab_size,
	'embed_size': 128,
	'hidden_size': 256,
	'num_layers': 2,
	'c_len' : max_length,
	'BERT': True
}

In [28]:
model = RNNqnli(args).to(device)
# model = torch.compile(model)

In [29]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.008,betas=(0.8, 0.999), eps=1e-07, weight_decay=0.0001)


In [30]:
critereon = nn.CrossEntropyLoss().to(device)

In [31]:
def train(model, optimizer, critereon, epochs=10):
	t0 = time.time()
	for i in range(epochs):
		model.train()
		running_loss = 0
		for s, q, l in train_data:
			optimizer.zero_grad()
			s = s.to(device)
			q = q.to(device)
			l = l.long().to(device)
# 			with torch.autocast(device_type=device, dtype=torch.bfloat16):
			output = model(s, q)
			loss = critereon(output, l)
			torch.nn.utils.clip_grad_norm_(model.parameters(), 1)            
			loss.backward()
			optimizer.step()
			running_loss += loss.item()
		print(f'Epoch {i}, Loss: {running_loss/(i+1)}')
	t1 = time.time()
	print(f'Training time: {t1-t0}')

In [32]:
train( model, optimizer, critereon, 10)


Epoch 0, Loss: 0.7004600441133654
Epoch 1, Loss: 0.6927790110175674
Epoch 2, Loss: 0.6905552889849689
Epoch 3, Loss: 0.6842885726207012
Epoch 4, Loss: 0.6798794688405218
Epoch 5, Loss: 0.6697727264584722
Epoch 6, Loss: 0.6584697736276163
Epoch 7, Loss: 0.6478120249670904
Epoch 8, Loss: 0.6333841520386774
Epoch 9, Loss: 0.6183131111634744


In [None]:
def evaluation(model, validation_data, critereon):
	model.eval()
	running_loss = 0
	samples = 0
	acc = 0
	for s, q, l in validation_data:
		s = s.to(device)
		q = q.to(device)
		l = l.long().to(device)
		with torch.no_grad():
			output = model(s, q)
			loss = critereon(output, l)
			running_loss += loss.item()
			_, pred = torch.max(output, 1)
			samples += len(l)
			acc += torch.sum(pred == l).item()
	print(f'Validation Loss: {running_loss/len(validation_data)}')
	print(f'Accuracy: {acc/samples}')

In [None]:
evaluation(model, critereon)

In [1]:
import torch

In [3]:
torch.arange(0, 10)[1::2]

tensor([1, 3, 5, 7, 9])

In [None]:
def accuray(model):
	model.eval()
	running_loss = 0
	acc = 0
	for s, q, l in test_data:
		s = s.to(device)
		q = q.to(device)
		l = l.long().to(device)
		output = model(s, q)
		_, pred = torch.max(output, 1)
		acc += torch.sum(pred == l).item()
	print(f'Accuracy: {acc/len(test_data)}')

In [None]:
def predict(model, data, max_length, tokenizer):
	model.eval()
	s, q = data['sentence'], data['question']
	s = tokenizer(s, max_length = max_length, padding='max_length', truncation=True, return_tensors='pt')['input_ids'].to(device)
	q = tokenizer(q, max_length = max_length, padding='max_length', truncation=True, return_tensors='pt')['input_ids'].to(device)
	with torch.no_grad()
		output = model(s, q)
		_, pred = torch.max(output, 1)
	return pred.item()