In [33]:
from datasets import load_dataset
from transformers import BertTokenizerFast, BertModel
# from embed_layer import Word2Vec, ContextualEmbedding
# from e2e import E2E 
# from data import SquadDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import time

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer

class SquadDataset(torch.utils.data.Dataset):
	'''
	- Creates batches dynamically by padding to the length of largest example
	  in a given batch.
	- Calulates character vectors for contexts and question.
	- Returns tensors for training.
	'''
	
	def __init__(self, data, batch_size, tokenizer, max_len):
		
		self.batch_size = batch_size
		data = [data[i:i+self.batch_size] for i in range(0, len(data), self.batch_size)]
		self.data = data
		# self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
		self.tokenizer = tokenizer
		self.max_len = max_len
		
	def __len__(self):
		return len(self.data)
	
	def __iter__(self):
		'''
		Creates batches of data and yields them.
		
		Each yield comprises of:
		:padded_context: padded tensor of contexts for each batch 
		:padded_question: padded tensor of questions for each batch 
		:label: 
		
		'''
		
		for batch in self.data:
			questions = self.tokenizer(batch['question'], max_length = self.max_len, padding='max_length', truncation=True, return_tensors='pt')
			contexts = self.tokenizer(batch['sentence'], max_length = self.max_len, padding='max_length', truncation=True, return_tensors='pt')
			labels = torch.IntTensor(batch['label']).to(torch.int8)
			# question, context include input_ids, attention_mask, token_type_ids
			yield questions['input_ids'], contexts['input_ids'], labels

In [34]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel

class Word2Vec(nn.Module):
	def __init__(self, vocab_size, embed_size, BERT = False): 
		super(Word2Vec, self).__init__()
		if BERT:
			model = BertModel.from_pretrained('bert-base-uncased')
			self.embeddings = model.embeddings.word_embeddings
			self.embeddings.requires_grad_(False)
			self.linear = nn.Linear(768, embed_size)
			torch.nn.init.normal_(self.linear.weight, mean=0, std=0.02)
		else:	
			self.embeddings = nn.Embedding(vocab_size, embed_size)
			torch.nn.init.normal_(self.embeddings.weight, mean=0, std=0.02)
			self.linear = nn.Linear(embed_size, embed_size)
			torch.nn.init.normal_(self.linear.weight, mean=0, std=0.02)
	def forward(self, x):
		x = self.embeddings(x)
		x = self.linear(x)
		return x

# class Highway(nn.Module):
# 	def __init__(self, args):
# 		super(Highway, self).__init__()
# 		self.W_proj = nn.Linear(args.hidden_size*2, args.hidden_size*2)
# 		self.W_gate = nn.Linear(args.hidden_size, args.hidden_size)
	
# 	def forward(self, x):
# 		x_proj = F.relu(self.W_proj(x))
# 		x_gate = F.sigmoid(self.W_gate(x))
# 		x_highway = x_gate * x_proj + (1 - x_gate) * x
# 		return x_highway

class ContextualEmbedding(nn.Module):
	def __init__(self, embed_size, hidden_size):
		super(ContextualEmbedding, self).__init__()
		self.RNN= nn.LSTM(input_size=embed_size,
							hidden_size=hidden_size,
							num_layers=1,
							bidirectional=True,
							batch_first=True,
							dropout=0.1)
	def forward(self, x):
		output, _ = self.RNN(x)
		return output
		

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class E2E(nn.Module):
    def __init__(self, hidden_size, c_len):
        super(E2E, self).__init__()
        '''
        input: [bs, qlen, hidden_size*2], [bs, clen, hidden_size*2]
        1. We need to calculate the similarity matrix between the context and the query 
        2. We need to calculate the attention weights for the context and the query [bs, clen, qlen], and then caculate c2q by multiply [bs, clen, qlen] with [bs, qlen, hidden_size*2] is called c2q
        3. We need to calculate the attention weights for the query and the context [bs, qlen, clen], and then caculate q2c by multiply [bs, qlen, clen] with [bs, clen, hidden_size*2] is called q2c
        4. Then we concat [context, q2c, context*q2c, ] 

        '''
        self.Ws = nn.Linear(hidden_size*6, 1, )
        torch.nn.init.normal_(self.Ws.weight, mean=0, std=0.02)

        self.rnn = nn.LSTM(input_size=hidden_size*8, hidden_size=hidden_size*2, num_layers=1, bidirectional=True, batch_first=True, dropout=0.1)
        self.dropout = nn.Dropout(0.1)

        self.last1 = nn.Linear(hidden_size*4*c_len, hidden_size*2 )
        self.last2 = nn.Linear(hidden_size*2, 2)
        
        torch.nn.init.normal_(self.last1.weight, mean=0, std=0.02)
        torch.nn.init.normal_(self.last2.weight, mean=0, std=0.02)
    def forward(self, c, q):
        '''
        c: [bs, clen, hidden_size*2]
        q: [bs, qlen, hidden_size*2]

        '''
        bs = c.size(0)
        c_len = c.size(1)
        q_len = q.size(1)
        hidden_size = c.size(2)

        _c = c.unsqueeze(2).expand(-1, -1, q_len, -1)
        _q = q.unsqueeze(1).expand(-1, c_len, -1, -1)
        cq = torch.mul(_c,_q)
        input_s = torch.cat([_c,_q,cq], dim=-1) # [bs, clen, qlen, hidden_size*6]

        s = self.Ws(input_s).squeeze(-1) #similarity matrix [bs, clen, qlen] 

        s1 = F.softmax(s, dim=-1)
        c2q = torch.bmm(s1, q) # [bs, clen, hidden_size*2], cco the hieu la ta bieu dien cac word trong context bang to hop attention_score*query
        c2q = self.dropout(c2q) 
        
        #q2c
        s2 = F.softmax(torch.max(s, dim=-1)[0], dim=-1) # [bs, clen]
        s2 = s2.unsqueeze(1).expand(-1, q_len, -1) # [bs, qlen, clen]
        q2c = torch.bmm(s2, c) # [bs, qlen, hidden_size*2]
        q2c = self.dropout(q2c) 

        #querry-aware representation 
        G = torch.cat([c, c2q, torch.mul(c, c2q), torch.mul(c, q2c)], dim=-1)    # [bs, clen, hidden_size*8]
        M, _ = self.rnn(G) # [bs, clen, hidden_size*8]
        M = self.dropout(M) 
        M = M.contiguous().view(bs, -1)
        out1 = self.last1(M) # [bs, hidden_size*4]

        out1 = self.dropout(out1)
        out2 = self.last2(F.gelu(out1)) # [bs, 2]
        return out2



In [51]:

dataset = load_dataset("nyu-mll/glue", "qnli")


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})

In [4]:
104743/32

3273.21875

In [4]:
"avc def".split(" ")

['avc', 'def']

In [13]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')



In [7]:
random_train = dataset['train'].select(range(2269,12269))
random_val = dataset['validation'].select(range(2269,3269))
random_test = dataset['validation'].select(range(3269,4269))

In [8]:
train_data = SquadDataset(random_train, 32, tokenizer, 128)
validation_data = SquadDataset(random_val, 32, tokenizer, 128)
test_data = SquadDataset(random_test, 16, tokenizer, 128)

In [10]:
tokenizer.vocab_size

28996

In [37]:
class BiDAF(nn.Module):
	def __init__(self, vocab_size, embed_size, hidden_size, c_len, BERT=False):
		super(BiDAF, self).__init__()
		self.w2v = Word2Vec(vocab_size, embed_size, BERT) # vocab_size, embed_size
		self.qcontext = ContextualEmbedding(embed_size, hidden_size) # embed_size, hidden_size
		self.ccontext = ContextualEmbedding(embed_size, hidden_size)
		self.e2e = E2E(hidden_size, c_len) # hidden_size, c_len
	
	def forward(self, q, c):
		q = self.w2v(q)
		c = self.w2v(c)
		q = self.qcontext(q)
		c = self.ccontext(c)
		return self.e2e(q, c)

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [38]:
BERT = True 

In [49]:
if BERT:
	model = BiDAF(vocab_size=tokenizer.vocab_size, embed_size=100, hidden_size=100, c_len=64, BERT=True)
	# model.to(device)
# 	model = torch.compile(model)
else:
	model = BiDAF(vocab_size=tokenizer.vocab_size, embed_size=128, hidden_size=256, c_len=128)
	model.to(device)
# 	model = torch.compile(model)

In [50]:
total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'Total number of trainable parameters: {total_trainable_params}')

Total number of trainable parameters: 7124503


In [21]:
optimizer = torch.optim.Adadelta(model.parameters(), lr = 0.5, weight_decay=0.0001)

In [None]:
scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=5e-5, steps_per_epoch=3274, epochs=2, anneal_strategy='cos')

In [None]:
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=3274, eta_min=1e-6)

In [22]:
critereon = nn.CrossEntropyLoss().to('cuda')

In [54]:
a = torch.tensor([1, 0, 1])
b = torch.tensor([1, 11, 1])
(a==b).sum().item()

2

In [24]:
import time

In [25]:
def train(model, train_data, optimizer, critereon, scheduler, epochs=1):
	start = time.time()
	for epoch in range(epochs):
		model.train()
		running_loss = 0.0
		for questions, contexts, labels in train_data:
			optimizer.zero_grad(set_to_none= True)
			questions = questions.to(device)            
			contexts = contexts.to(device)
			labels = labels.long().to(device)
			# with torch.autocast(device_type=device, dtype=torch.bfloat16):
			output = model(questions, contexts)
			loss = critereon(output, labels)
			loss.backward()
			torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
			optimizer.step()
			scheduler.step()
			running_loss += loss.item()
		print(f"Epoch: {epoch}, Loss: {running_loss/len(train_data)}")
	end = time.time()
	print(f"Training time: {end-start}")

In [26]:
train(model, train_data, optimizer, critereon, epochs=32)

Epoch: 0, Loss: 0.6946736006691052
Epoch: 1, Loss: 0.6942744904432815
Epoch: 2, Loss: 0.6941428245446933
Epoch: 3, Loss: 0.6936850903894954
Epoch: 4, Loss: 0.6936270804070055
Epoch: 5, Loss: 0.6939113915157014
Epoch: 6, Loss: 0.6936165619962893
Epoch: 7, Loss: 0.6929599953154786
Epoch: 8, Loss: 0.684400350902789
Epoch: 9, Loss: 0.6681230144378857
Epoch: 10, Loss: 0.6299331245330957
Epoch: 11, Loss: 0.5830079354702855
Epoch: 12, Loss: 0.4847485973193242
Epoch: 13, Loss: 0.40153783971604445
Epoch: 14, Loss: 0.3130649346858263
Epoch: 15, Loss: 0.2463168220478482
Epoch: 16, Loss: 0.2161488656621105
Epoch: 17, Loss: 0.2234264256435628
Epoch: 18, Loss: 0.2960725605678254
Epoch: 19, Loss: 0.35526410814005727
Epoch: 20, Loss: 0.3124506064041997
Epoch: 21, Loss: 0.2103419107163307
Epoch: 22, Loss: 0.19358109257901057
Epoch: 23, Loss: 0.13845096823176184
Epoch: 24, Loss: 0.15889592708115235
Epoch: 25, Loss: 0.1566587250832968
Epoch: 26, Loss: 0.10692773797190763
Epoch: 27, Loss: 0.10748318614610

In [27]:
torch.save(model.state_dict(), "/kaggle/working/bidaf32.pth")

In [None]:
torch.cuda.empty_cache()

In [30]:
def evaluation(model, val_data, critereon):
	model.eval()
	running_loss = 0.0
	total = 0
	correct = 0
	# with torch.no_grad():
	for questions, contexts, labels in val_data:
			questions = questions.to(device)
			contexts = contexts.to(device)
			labels = labels.long().to(device)
			with torch.no_grad():
				# with torch.autocast(device_type=device, dtype=torch.float16):
				output = model(questions, contexts)
				loss = critereon(output.view(-1, 2), labels.view(-1))
				running_loss += loss.item()
				_, predicted = torch.max(output, 1)
				total += labels.size(0)
				correct += (predicted == labels).sum().item()
	print(f"Validation Loss: {running_loss/len(val_data)}")
	print(f"Accuracy: {100*correct/total}")

In [31]:
evaluation(model, validation_data, critereon)

Validation Loss: 2.807593956589699
Accuracy: 50.5


In [32]:
evaluation(model, train_data, critereon)

Validation Loss: 1.6514337478163161
Accuracy: 66.71


In [41]:
torch.cuda.empty_cache()

In [40]:
evaluation(model, test_data, critereon)

OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacty of 15.89 GiB of which 510.12 MiB is free. Process 2115 has 15.39 GiB memory in use. Of the allocated memory 14.04 GiB is allocated by PyTorch, and 1.04 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [34]:
def accuracy(model, val_data):
	model.eval()
	correct = 0
	total = 0
	# with torch.no_grad():
	for questions, contexts, labels in val_data:
			questions = questions.to(device)
			contexts = contexts.to(device)
			labels = labels.long().to(device)
			with torch.no_grad():
				# with torch.autocast(device_type=device, dtype=torch.bfloat16):
				output = model(questions, contexts)
				_, predicted = torch.max(output, 1)
				total += labels.size(0)
				correct += (predicted == labels).sum().item()
	print(f"Accuracy: {100*correct/total}")

In [23]:
accuracy(model, train_data)

Accuracy: 51.65


In [24]:
accuracy(model, validation_data)

Accuracy: 47.9


In [43]:
accuracy(model, test_data)

OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB. GPU 0 has a total capacty of 15.89 GiB of which 510.12 MiB is free. Process 2115 has 15.39 GiB memory in use. Of the allocated memory 14.55 GiB is allocated by PyTorch, and 540.70 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [33]:
tokenizer(dataset['train'][2269:2300]['question'], max_length =32, padding='max_length', truncation=True, return_tensors='pt').to('cuda')

{'input_ids': tensor([[  101,  5979,  2360,  3648,  1174,  1113,   170,  1686,  1683,  1104,
          2454,   112,   188,  1109,  3237, 14303,  3414,  1212,   136,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101, 23963,  1233,   179, 11470, 23085,  1320,  1400,  2017,  1107,
          1402,  1390,  1170,  4510,  2133,  1390,   136,   102,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  1327, 10209,  1521,  1106,  1103,  9711,  1104, 10579,   112,
           188, 14781,   136,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0],
        [  101,  5979, 10939,  1107,  1103,  1646,  1105,  2855,  1253, 12912,
          1103,  8832,  1947,  1111,  2029, 15417,   136,   102,     0,     0,
             0,     0,     0,     0,     0,  

In [45]:
def predict(model, questions, contexts,labels, tokenizer, max_len):
    questions = tokenizer(questions, max_length = max_len, padding='max_length', truncation=True, return_tensors='pt')['input_ids'].to('cuda')
    contexts = tokenizer(contexts, max_length = max_len, padding='max_length', truncation=True, return_tensors='pt')['input_ids'].to('cuda')
    with torch.no_grad():
        output = model(questions, contexts)
        _, predicted = torch.max(output, 1)
    print("predicted label :", predicted)
    print("actual label :", labels)

In [47]:
predict(model, dataset['train'][2269:2300]['question'], dataset['train'][2269:2300]['sentence'],dataset['train'][2269:2300]['label'], tokenizer, 128)

predicted label : tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1,
        1, 1, 0, 0, 1, 1, 0], device='cuda:0')
actual label : [1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0]


In [44]:
dataset['train'][2269:2300]['label']

[1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0]

In [51]:
torch.cuda.empty_cache()

In [52]:
predict(model, dataset['train'][2600:(2600+50)]['question'], dataset['train'][2600:2650]['sentence'],dataset['train'][2600:2650]['label'], tokenizer, 128)

OutOfMemoryError: CUDA out of memory. Tried to allocate 4.69 GiB. GPU 0 has a total capacty of 15.89 GiB of which 598.12 MiB is free. Process 2110 has 15.31 GiB memory in use. Of the allocated memory 14.72 GiB is allocated by PyTorch, and 283.28 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF