In [3]:
# from w2v import Word2Vec
# from data import SquadDataset
# from rnn import RNN
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from datasets import load_dataset
from transformers import BertTokenizerFast, BertModel
import time

In [4]:
class SquadDataset(torch.utils.data.Dataset):
	'''
	- Creates batches dynamically by padding to the length of largest example
	  in a given batch.
	- Calulates character vectors for contexts and question.
	- Returns tensors for training.
	'''
	
	def __init__(self, data, batch_size, tokenizer, max_length ):
		
		self.batch_size = batch_size
		data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
		self.data = data
		# self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
		self.tokenizer = tokenizer
		self.max_length = max_length
		
	def __len__(self):
		return len(self.data)
	
	def __iter__(self):
		'''
		Creates batches of data and yields them.
		
		Each yield comprises of:
		:padded_context: padded tensor of contexts for each batch 
		:padded_question: padded tensor of questions for each batch 
		:label: 
		
		'''
		
		for batch in self.data:
			questions = self.tokenizer(batch['question'], max_length = self.max_length, padding='max_length', truncation=True, return_tensors='pt')
			contexts = self.tokenizer(batch['sentence'], max_length = self.max_length, padding='max_length', truncation=True, return_tensors='pt')
			labels = torch.IntTensor(batch['label']).to(torch.int8)
			# question, context include input_ids, attention_mask, token_type_ids
			yield questions['input_ids'], contexts['input_ids'], labels
			
		

In [5]:
class Word2Vec(nn.Module):
	def __init__(self, vocab_size, embed_size, BERT = False): 
		super(Word2Vec, self).__init__()
		if BERT:
			model = BertModel.from_pretrained('bert-base-cased')
			self.embeddings = model.embeddings.word_embeddings
			self.embeddings.requires_grad_(False)
		else:	
			self.embeddings = nn.Embedding(vocab_size, embed_size)
			torch.nn.init.xavier_uniform_(self.embeddings.weight)
	def forward(self, x):
		x = self.embeddings(x)
		return x

In [7]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [8]:
dataset = load_dataset("nyu-mll/glue", "qnli")


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/877k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/104743 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5463 [00:00<?, ? examples/s]

In [9]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [10]:
random_train = dataset['train'].select(range(2269,12269))
random_val = dataset['validation'].select(range(2269,3269))
random_test = dataset['validation'].select(range(3269,4269))

In [11]:
max_length = 128

In [73]:
train_data = SquadDataset(random_train, 64, tokenizer, max_length)
validation_data = SquadDataset(random_val, 32, tokenizer, max_length)
test_data = SquadDataset(random_test, 32, tokenizer, max_length)

In [24]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RNNe(nn.Module):
	def __init__(self, embed_size, hidden_size, num_layers, c_len):
		super(RNNe, self).__init__()
		self.encq = nn.RNN(embed_size, hidden_size, num_layers,batch_first=True, dropout=0.1, bidirectional=True)
		self.encc = nn.RNN(embed_size, hidden_size, num_layers,batch_first=True, dropout=0.1, bidirectional=True) 
		self.Wmodel = nn.Linear(hidden_size*2*c_len, hidden_size*2)
		torch.nn.init.xavier_uniform_(self.Wmodel.weight)
		self.Wout = nn.Linear(hidden_size*2, 2)
		torch.nn.init.xavier_uniform_(self.Wout.weight)
		self.num_layers = num_layers
		self.hidden_size = hidden_size
		# self.h0 = torch.zeros(2*num_layers, hidden_size*2)
		self.dropout = nn.Dropout(0.1)
	
	def forward(self, c, q):
		c_len = c.size(1)
		bs = c.size(0)
		h0 = torch.zeros(2*self.num_layers, bs, self.hidden_size, device='cuda')
		_ , hid_q = self.encq(q, h0) # hid = [2*num_layer, bs , hidden_size*2]

		encc, _ = self.encc(c, hid_q)  # [bs, c_len, hidden_size*2]
		encc = encc.contiguous().view(-1,c_len*self.hidden_size*2)
		out1 = self.Wmodel(encc) 
# 		out1 = out1.squeeze(-1) # [bs, c_len]
		out2 = self.Wout(self.dropout(out1))
		return out2



In [25]:
vocab_size = tokenizer.vocab_size

In [65]:
class RNNqnli(nn.Module):
	def __init__(self, config):
		super(RNNqnli, self).__init__()
		self.w2v = Word2Vec(config['vocab_size'], config['embed_size'],  config['BERT'])
		self.rnn = RNNe(config['embed_size'], config['hidden_size'], config['num_layers'], config['c_len'])

	
	def forward(self, c, q):
		c = self.w2v(c)
		q = self.w2v(q)
		out = self.rnn(c, q)
		return out

In [76]:
args = {
	'vocab_size': tokenizer.vocab_size,
	'embed_size': 768,
	'hidden_size': 512,
	'num_layers': 4,
	'c_len' : max_length,
	'BERT': True
}

In [77]:
model = RNNqnli(args).to(device)
# model = torch.compile(model)

In [78]:
epochs = 24

In [79]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.008,betas=(0.8, 0.999), eps=1e-07, weight_decay=0.0001)
# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_data)*epochs, eta_min=0.006*0.1, last_epoch=-1, verbose=False)

In [80]:
critereon = nn.CrossEntropyLoss().to(device)

In [81]:
def train(model, optimizer, train_data,  critereon, epochs=10):
	t0 = time.time()
	for i in range(epochs):
		model.train()
		running_loss = 0
		for s, q, l in train_data:
			optimizer.zero_grad()
			s = s.to(device)
			q = q.to(device)
			l = l.long().to(device)
# 			with torch.autocast(device_type=device, dtype=torch.bfloat16):
			output = model(s, q)
			loss = critereon(output, l)
			torch.nn.utils.clip_grad_norm_(model.parameters(), 1)            
			loss.backward()
			optimizer.step()
			running_loss += loss.item()
		print(f'Epoch {i}, Loss: {running_loss/(i+1)}')
	t1 = time.time()
	print(f'Training time: {t1-t0}')

In [82]:
train( model, optimizer,train_data,  critereon, epochs)


Epoch 0, Loss: 93.22959440946579
Epoch 1, Loss: 27.568972945213318
Epoch 2, Loss: 18.283279021581013
Epoch 3, Loss: 13.752624839544296
Epoch 4, Loss: 15.34749892950058
Epoch 5, Loss: 9.155971278746923
Epoch 6, Loss: 7.856041474001748
Epoch 7, Loss: 16.053224004805088
Epoch 8, Loss: 6.106465531720056
Epoch 9, Loss: 5.48189747929573
Epoch 10, Loss: 4.980025269768455
Epoch 11, Loss: 4.564316858847936
Epoch 12, Loss: 4.213229261911833
Epoch 13, Loss: 3.9234616415841237
Epoch 14, Loss: 3.6516493916511537
Epoch 15, Loss: 3.42336680367589
Epoch 16, Loss: 26.128818887121536
Epoch 17, Loss: 31.099978105889427
Epoch 18, Loss: 2.885434323235562
Epoch 19, Loss: 2.7385259926319123
Epoch 20, Loss: 2.6081016602970304
Epoch 21, Loss: 2.489572915163907
Epoch 22, Loss: 2.3813529403313347
Epoch 23, Loss: 2.2821514904499054
Epoch 24, Loss: 2.1908862614631652
Epoch 25, Loss: 2.106641608935136
Epoch 26, Loss: 2.028637301038813
Epoch 27, Loss: 3.531492531299591
Epoch 28, Loss: 2.0791060554570167
Epoch 29, Lo

In [83]:
def evaluation(model, validation_data, critereon):
	model.eval()
	running_loss = 0
	samples = 0
	acc = 0
	for s, q, l in validation_data:
		s = s.to(device)
		q = q.to(device)
		l = l.long().to(device)
		with torch.no_grad():
			output = model(s, q)
			loss = critereon(output, l)
			running_loss += loss.item()
			_, pred = torch.max(output, 1)
			samples += len(l)
			acc += torch.sum(pred == l).item()
	print(f'Validation Loss: {running_loss/len(validation_data)}')
	print(f'Accuracy: {acc/samples}')

In [84]:
evaluation(model,train_data, critereon)

Validation Loss: 0.6931393938728526
Accuracy: 0.5013


In [85]:
evaluation(model, validation_data, critereon)

Validation Loss: 0.6932024955749512
Accuracy: 0.492


In [96]:
def predict(model, data, max_length, tokenizer):
	model.eval()
	s, q = data['sentence'], data['question']
	s = tokenizer(s, max_length = max_length, padding='max_length', truncation=True, return_tensors='pt')['input_ids'].to(device)
	q = tokenizer(q, max_length = max_length, padding='max_length', truncation=True, return_tensors='pt')['input_ids'].to(device)
	with torch.no_grad():
		output = model(s, q)
		print(output)
		_, pred = torch.max(output, 1)
	return pred

In [91]:
predict(model, dataset['train'][3000:3064], 128, tokenizer)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')

In [92]:
dataset['train'][3000:3064]['label']

[0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1]

In [93]:
predict(model, dataset['train'][3100:3164], 128, tokenizer)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')

In [95]:
predict(model, dataset['train'][4100:4164], 128, tokenizer)

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], device='cuda:0')

In [100]:
predict(model, dataset['train'][4000], 128, tokenizer)

tensor([[0.0447, 0.0539]], device='cuda:0')


tensor([1], device='cuda:0')

In [101]:
torch.save(model.state_dict(), "/kaggle/working/rnnsqnli1.pth")