In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# from data import SquadDataset
# from model import * 
# from w2v import Word2Vec
from transformers import BertTokenizerFast, BertModel
from datasets import load_dataset
import math
import torch.optim as optim
import time



In [1]:
import torch
# Report whether PyTorch can see a CUDA device from this kernel.
torch.cuda.is_available()

True

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer

class SquadDataset(torch.utils.data.Dataset):
	'''
	Pre-batched, tokenized view over a question / sentence / label dataset.

	- Splits ``data`` into consecutive slices of ``batch_size`` rows when the
	  object is built (the final slice may be shorter).
	- On iteration, tokenizes the ``question`` and ``sentence`` fields of each
	  slice, padding / truncating every sequence to ``max_length`` tokens.
	- Yields the complete tokenizer outputs, so both ``input_ids`` and
	  ``attention_mask`` are available to the training loop.

	:param data: sliceable dataset; ``data[i:j]`` must return a mapping with
	             ``'question'``, ``'sentence'`` and ``'label'`` entries.
	:param batch_size: number of rows per yielded batch.
	:param tokenizer: callable invoked as ``tokenizer(texts, max_length=...,
	                  padding='max_length', truncation=True, return_tensors='pt')``.
	:param max_length: fixed token length for questions and sentences
	                   (defaults to 96, the value that used to be hard-coded).
	'''

	def __init__(self, data, batch_size, tokenizer, max_length=96):
		self.batch_size = batch_size
		self.max_length = max_length
		self.tokenizer = tokenizer
		# Slice once up front; each element is one batch worth of raw rows.
		self.data = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]

	def __len__(self):
		'''Number of batches (not number of rows).'''
		return len(self.data)

	def _encode(self, texts):
		'''Tokenize a list of strings to fixed-length ``pt`` tensors.'''
		return self.tokenizer(
			texts,
			max_length=self.max_length,
			padding='max_length',
			truncation=True,
			return_tensors='pt',
		)

	def __iter__(self):
		'''
		Tokenizes one batch at a time and yields it.

		Each yield comprises of:
		:questions: tokenizer output for the batch's questions
		:contexts: tokenizer output for the batch's sentences
		:labels: int8 tensor with one label per row

		The tokenizer outputs are yielded whole (not just ``input_ids``)
		because the consumers also need ``attention_mask``.
		'''
		for batch in self.data:
			questions = self._encode(batch['question'])
			contexts = self._encode(batch['sentence'])
			labels = torch.IntTensor(batch['label']).to(torch.int8)
			yield questions, contexts, labels
			
			

In [3]:
class Word2Vec(nn.Module):
	'''
	Token-id -> vector lookup table.

	With ``BERT=True`` the word-embedding matrix of the pretrained
	``bert-base-uncased`` model is reused and frozen; ``vocab_size`` and
	``embed_size`` are not consulted in that case. Otherwise a fresh
	``nn.Embedding(vocab_size, embed_size)`` is created and Xavier-uniform
	initialised.
	'''
	def __init__(self, vocab_size, embed_size, BERT = False):
		super().__init__()
		if not BERT:
			table = nn.Embedding(vocab_size, embed_size)
			torch.nn.init.xavier_uniform_(table.weight)
			self.embeddings = table
			return
		# Only the word-embedding sub-module of the pretrained model is kept.
		pretrained = BertModel.from_pretrained('bert-base-uncased')
		self.embeddings = pretrained.embeddings.word_embeddings
		self.embeddings.requires_grad_(False)

	def forward(self, x):
		'''Look up the embedding of every id in ``x``.'''
		return self.embeddings(x)
	


class PositionalEmbedding(nn.Module):
	'''
	Fixed sinusoidal position table.

	Even columns hold ``sin(pos / 10000^(2i/embed_size))`` and odd columns
	the matching ``cos``. The table is computed once for ``max_len``
	positions and stored as a non-persistent buffer, so it follows
	``module.to(...)`` but does not add a key to ``state_dict()``.

	:param embed_size: width of each position vector (odd sizes are allowed).
	:param max_len: largest sequence length that can be served.
	:param device: device on which the table is first created.
	'''
	def __init__(self,embed_size, max_len, device):
		super(PositionalEmbedding, self).__init__()
		pos = torch.arange(0, max_len, device=device).float().unsqueeze(1)
		_2i = torch.arange(0, embed_size, 2, device=device).float()
		# angles has one column per even index: [max_len, ceil(embed_size / 2)]
		angles = pos / torch.pow(10000, _2i / embed_size)
		encoding = torch.zeros(max_len, embed_size, device=device)
		encoding[:, 0::2] = torch.sin(angles)
		# For an odd embed_size there is one fewer odd column than even columns.
		encoding[:, 1::2] = torch.cos(angles[:, : embed_size // 2])
		self.register_buffer('encoding', encoding, persistent=False)

	def forward(self, x):
		'''
		Return the position table broadcast to the shape of ``x``.

		:param x: tensor of shape [bs, seqlen, embed_dim]; only its shape is used.
		:raises ValueError: if ``seqlen`` exceeds the ``max_len`` the table was built for.
		'''
		bs, seqlen, embed_dim = x.size()
		if seqlen > self.encoding.size(0):
			raise ValueError(
				f"sequence length {seqlen} exceeds positional table size {self.encoding.size(0)}"
			)
		return self.encoding[:seqlen, :].expand(bs, seqlen, embed_dim)

class WordEmbedding(nn.Module):
	'''
	Token embedding plus positional embedding.

	Combines a ``Word2Vec`` lookup with a ``PositionalEmbedding`` table and
	returns their element-wise sum.
	'''
	def __init__(self, vocab_size, embed_size, max_len, device, BERT=False):
		super().__init__()
		self.word2vec = Word2Vec(vocab_size, embed_size, BERT)
		self.positional_embedding = PositionalEmbedding(embed_size, max_len, device)

	def forward(self, x):
		'''Embed the ids in ``x`` and add the position vectors for their shape.'''
		token_vectors = self.word2vec(x)
		position_vectors = self.positional_embedding(token_vectors)
		return token_vectors + position_vectors

In [4]:
class LayerNorm(nn.Module):
    '''
    Layer normalisation over the last dimension with a learnable scale and
    an optional learnable shift.

    :param ndim: size of the last dimension of the inputs.
    :param bias: when False no shift parameter is created (``beta`` is None).
    :param eps: constant added to the standard deviation for stability.

    Note: the statistics use ``Tensor.std`` with its default (Bessel-corrected)
    estimator and ``eps`` is added to the std, not to the variance, so outputs
    differ slightly from ``torch.nn.LayerNorm``.
    '''
    def __init__(self, ndim, bias: bool = True, eps: float = 1e-5):
        super(LayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(ndim))
        if bias:
            self.beta = nn.Parameter(torch.zeros(ndim))
        else:
            # Keep the attribute so forward() can test it, but register no parameter.
            self.register_parameter('beta', None)
        self.eps = eps

    def forward(self, x):
        '''Normalise ``x`` along its last dimension.'''
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        out = self.gamma * (x - mean) / (std + self.eps)
        if self.beta is not None:
            out = out + self.beta
        return out

class SelfAttention(nn.Module):
    '''
    Multi-head scaled dot-product self-attention with a key padding mask.

    A single linear layer produces queries, keys and values; the attended
    values of all heads are concatenated and passed through an output
    projection.
    '''
    def __init__(self, embed_size, nhead, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        # One fused projection for q, k and v.
        self.p_qkv = nn.Linear(embed_size, embed_size*3)
        torch.nn.init.xavier_uniform_(self.p_qkv.weight)
        # Output projection applied after the heads are merged.
        self.p_proj = nn.Linear(embed_size, embed_size)
        torch.nn.init.xavier_uniform_(self.p_proj.weight)
        self.nhead = nhead

    def _split_heads(self, t):
        '''[bs, seq_len, embed_size] -> [bs, nhead, seq_len, embed_size // nhead].'''
        bs, sqlen, embed_size = t.size()
        return t.view(bs, sqlen, self.nhead, embed_size // self.nhead).transpose(1, 2)

    def forward(self, x, attmask):
        '''
        :param x: input of shape [bs, seq_len, embed_size]
        :param attmask: [bs, seq_len]; positions equal to 0 are not attended to
        :return: tensor of shape [bs, seq_len, embed_size]
        '''
        q, k, v = (self._split_heads(part) for part in self.p_qkv(x).chunk(3, dim=-1))
        bs, _, sqlen, head_dim = q.size()

        # Broadcast the key mask over heads and query positions: [bs, 1, 1, seq_len]
        key_is_pad = attmask[:, None, None, :] == 0

        scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(head_dim))  # [bs, nhead, seq_len, seq_len]
        scores = scores.masked_fill(key_is_pad, -10000)
        weights = self.dropout(F.softmax(scores, dim=-1))

        # Merge the heads back into one embed_size-wide vector per position.
        merged = (weights @ v).transpose(1, 2).contiguous().view(bs, sqlen, self.nhead * head_dim)
        return self.p_proj(merged)
    
class FFN(nn.Module):
    '''
    Position-wise feed-forward block: Linear -> GELU -> Dropout -> Linear -> Dropout.

    The hidden width is four times ``embed_size``; both linear weights are
    Xavier-uniform initialised and the dropout probability is 0.1.
    '''
    def __init__(self, embed_size):
        super().__init__()
        hidden_size = embed_size*4

        self.linear1 = nn.Linear(embed_size, hidden_size)
        torch.nn.init.xavier_uniform_(self.linear1.weight)

        self.linear2 = nn.Linear(hidden_size, embed_size)
        torch.nn.init.xavier_uniform_(self.linear2.weight)

        self.gelu = nn.GELU()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        '''Apply the block to ``x``; the last dimension keeps size ``embed_size``.'''
        hidden = self.dropout(self.gelu(self.linear1(x)))
        return self.dropout(self.linear2(hidden))

class EncoderLayer(nn.Module):
    '''
    One post-norm encoder layer: self-attention and a feed-forward block,
    each wrapped in dropout + residual connection + layer normalisation.

    :param embed_size: model width.
    :param nhead: number of attention heads.
    :param dropout: dropout probability for the attention weights and for
                    both residual branches.
    :param bias: forwarded to ``LayerNorm``.
    :param eps: forwarded to ``LayerNorm``.

    Note: a single ``LayerNorm`` instance is shared by both sub-layers; this
    is kept as-is so existing checkpoints keep the same parameter set.
    '''
    def __init__(self, embed_size, nhead, dropout,  bias=True, eps=1e-06):
        super().__init__()
        self.selfattn = SelfAttention(embed_size, nhead, dropout)
        self.ffn = FFN(embed_size)
        # Honour the caller's dropout rate instead of a hard-coded 0.1.
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNorm(embed_size, bias, eps)

    def forward(self, x, mask):
        '''
        :param x: [bs, seqlen, embed_size]
        :param mask: attention mask handed to ``SelfAttention`` unchanged
        :return: ``(x, mask)`` so layers can be chained in a loop
        '''
        x = self.norm(x + self.dropout(self.selfattn(x, mask)))
        x = self.norm(x + self.dropout(self.ffn(x)))
        return x, mask



class TransformerEnc(nn.Module):
    '''
    Question / context encoder with a trilinear similarity attention and a
    two-way classification head.

    1. Encode the question and the context with separate encoder stacks.
    2. Build a context-question similarity matrix and derive the
       context-to-question (A) and query-to-context (B) attentions.
    3. Run the fused features through a further encoder stack.
    4. Project to one score per context position, then to 2 logits.

    :param embed_size: width of the incoming embeddings.
    :param nhead: attention heads per encoder layer.
    :param num_layers: layers in each of the three encoder stacks.
    :param c_len: context length; ``Whead2`` is ``Linear(c_len, 2)``, so the
                  context must be padded to exactly this many tokens.
    :param device: device each encoder layer is moved to on construction.
    '''
    def __init__(self, embed_size,nhead, num_layers =3, c_len = 96, device='cuda'):
        super(TransformerEnc, self).__init__()
        self.qencoder = nn.ModuleList([EncoderLayer(embed_size, nhead, 0.1).to(device) for _ in range(num_layers)])
        self.cencoder = nn.ModuleList([EncoderLayer(embed_size, nhead, 0.1).to(device) for _ in range(num_layers)])

        self.Wsim = nn.Linear(embed_size*3, 1)
        torch.nn.init.xavier_uniform_(self.Wsim.weight)

        # Currently unused in forward(); kept so the parameter set is unchanged.
        self.Wdistil = nn.Linear(embed_size*4, embed_size)
        torch.nn.init.xavier_uniform_(self.Wdistil.weight)

        # Must be a ModuleList: a plain Python list hides these layers from
        # .parameters(), .to(), .state_dict() and the optimizer.
        self.synin4 = nn.ModuleList([EncoderLayer(embed_size*4, nhead, 0.1 ).to(device) for _ in range(num_layers)])

        self.Whead1 = nn.Linear(embed_size*4, 1)
        torch.nn.init.xavier_uniform_(self.Whead1.weight)

        self.Whead2 = nn.Linear(c_len, 2)
        torch.nn.init.xavier_uniform_(self.Whead2.weight)

        self.dropout = nn.Dropout(0.1)

    def forward(self, c, q, q_mask, c_mask):
        '''
        :param c: context embeddings [bs, c_len, embed_size]
        :param q: question embeddings [bs, q_len, embed_size]
        :param q_mask: [bs, q_len], 0 marks padding
        :param c_mask: [bs, c_len], 0 marks padding
        :return: logits of shape [bs, 2]
        '''
        for layer in self.qencoder:
            q, q_mask = layer(q, q_mask)
        # The context has its own stack (it was previously run through qencoder).
        for layer in self.cencoder:
            c, c_mask = layer(c, c_mask)

        c_len = c.size(1)
        q_len = q.size(1)

        # Pair every context position with every question position.
        c_sim = c.unsqueeze(2).expand(-1, -1, q_len, -1) # [bs, c_len, q_len, embed_size]
        q_sim = q.unsqueeze(1).expand(-1, c_len, -1, -1) # [bs, c_len, q_len, embed_size]

        cq_sim = torch.mul(c_sim, q_sim) # [bs, c_len, q_len, embed_size]
        cqcq = torch.cat([c_sim, q_sim, cq_sim], dim=-1) # [bs, c_len, q_len, 3*embed_size]
        S = self.Wsim(cqcq).squeeze(-1) # similarity matrix [bs, c_len, q_len]

        # Out-of-place masking: each softmax must start from the unmasked S,
        # otherwise the question mask would leak into the column softmax.
        S_row = S.masked_fill(q_mask.unsqueeze(1) == 0, -10000)
        S_row = F.softmax(S_row, dim=-1) # normalised over question positions
        A = torch.bmm(S_row, q) # [bs, c_len, embed_size]

        S_col = S.masked_fill(c_mask.unsqueeze(2) == 0, -10000)
        S_col = F.softmax(S_col, dim = 1) # normalised over context positions

        # NOTE(review): the QANet paper defines B as S_row @ S_col^T @ C; this is
        # the transposed product. Left unchanged - confirm which is intended.
        B = torch.bmm(torch.bmm(S_col, S_row.transpose(1,2)), c) # [bs, c_len, embed_size]

        distil = torch.cat([c, A, torch.mul(c, A), torch.mul(c, B)], dim = -1) # [bs, c_len, 4*embed_size]
        for layer in self.synin4:
            distil, c_mask = layer(distil, c_mask)

        out1 = self.Whead1(distil).squeeze(-1) # [bs, c_len]
        out2 = self.Whead2(self.dropout(out1)) # [bs, 2]

        return out2




In [5]:
# Load the "qnli" configuration of the "nyu-mll/glue" dataset.
dataset = load_dataset("nyu-mll/glue", "qnli" )


In [7]:
# The tokenizer vocabulary must match the embedding table: Word2Vec takes its
# pretrained word embeddings from 'bert-base-uncased', so token ids produced by
# the cased vocabulary would index unrelated rows.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [8]:
# Display the splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 104743
    })
    validation: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
    test: Dataset({
        features: ['question', 'sentence', 'label', 'idx'],
        num_rows: 5463
    })
})

In [9]:
# Take fixed, contiguous row ranges from the splits. Despite the `random_`
# prefix nothing is shuffled or sampled here.
random_train = dataset['train'].select(range(2269,12269))
random_val = dataset['validation'].select(range(2269,3269))
# The "test" subset also comes from the validation split, using rows that do not
# overlap random_val. NOTE(review): presumably because the GLUE test split has
# no usable labels - confirm.
random_test = dataset['validation'].select(range(3269, 4269))

In [None]:
# Sequence length handed to SquadDataset when the splits are wrapped below.
max_length = 128

In [None]:
# Wrap each subset in a SquadDataset with a batch size of 32.
train_data = SquadDataset(random_train, 32, tokenizer, max_length)
validation_data = SquadDataset(random_val, 32, tokenizer, max_length)
test_data = SquadDataset(random_test, 32, tokenizer, max_length)

In [11]:
class NAQNLI(nn.Module):
	'''
	Full QNLI model: shared word + positional embedding followed by the
	``TransformerEnc`` encoder / classifier.

	:param config: dict with the keys ``vocab_size``, ``embed_size``,
	               ``c_len``, ``device``, ``BERT``, ``nhead`` and ``num_layers``.
	'''
	def __init__(self, config):
		super(NAQNLI, self).__init__()
		self.w2v = WordEmbedding(config['vocab_size'], config['embed_size'], config['c_len'], config['device'],  config['BERT'])
		self.enc = TransformerEnc(config['embed_size'], config['nhead'], config['num_layers'], config['c_len'], config['device'])
		self.dropout = nn.Dropout(0.1)

	def forward(self, c, q, q_mask, c_mask):
		'''
		:param c: context token ids
		:param q: question token ids
		:param q_mask: attention mask of the question
		:param c_mask: attention mask of the context
		:return: whatever ``self.enc`` returns for the embedded inputs
		'''
		# The dropout layer is registered as `self.dropout`; its result must be
		# assigned back for it to take effect.
		q = self.dropout(self.w2v(q))
		c = self.dropout(self.w2v(c))
		return self.enc(c, q, q_mask, c_mask)


In [12]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
device

device(type='cpu')

In [13]:
# Read by the config cell: selects the 768-wide embedding size and is passed on
# to the model as config['BERT'].
BERT = True

In [None]:
# Hyper-parameters consumed by NAQNLI.
config = {
    'vocab_size': tokenizer.vocab_size,
    'embed_size': 768 if BERT else 256,
    # embed_size must be divisible by nhead (768 / 12 = 64, 256 / 8 = 32);
    # 12 heads do not divide the 256-wide non-BERT embedding.
    'nhead': 12 if BERT else 8,
    'num_layers': 4,
    # Must equal the padded sequence length produced by the datasets.
    'c_len': max_length,
    'device': device,
    'BERT': BERT
}

In [14]:
# Build the model from the config, move it to the selected device and wrap it
# with torch.compile.
model = NAQNLI(config).to(device)
model = torch.compile(model)

embedding size must be 768


In [15]:
# Adadelta over all model parameters, with cross-entropy as the training loss.
optimizer = optim.Adadelta(model.parameters(), lr=0.5, weight_decay=0.0001)
critereon = nn.CrossEntropyLoss().to(device)

In [16]:
def train(model, train_data, optimizer, critereon, epochs):
	'''
	Run a plain supervised training loop and report the mean loss per epoch.

	:param model: module called as ``model(c_ids, q_ids, q_mask, c_mask)``.
	:param train_data: iterable yielding ``(q, c, labels)`` where ``q`` and ``c``
	                   are mappings with ``'input_ids'`` and ``'attention_mask'``
	                   tensors and ``labels`` is a tensor of class indices.
	:param optimizer: optimizer stepping the model's parameters.
	:param critereon: loss called as ``critereon(output, labels)``.
	:param epochs: number of passes over ``train_data``.
	:return: list with the mean batch loss of every epoch.
	'''
	# Use the device the model actually lives on rather than a notebook global.
	device = next(model.parameters()).device
	history = []
	t0 = time.time()
	for epoch in range(epochs):
		model.train()
		running_loss = 0.0
		n_batches = 0
		for q, c, labels in train_data:
			model.zero_grad()
			q_i = q['input_ids'].to(device)
			c_i = c['input_ids'].to(device)
			q_mask = q['attention_mask'].to(device)
			c_mask = c['attention_mask'].to(device)
			output = model(c_i, q_i, q_mask, c_mask)
			labels = labels.long().to(device)
			loss = critereon(output, labels)
			loss.backward()
			# Clip the global gradient norm to 1.0 before the update.
			torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
			optimizer.step()
			running_loss += loss.item()
			n_batches += 1
		# Report the accumulated mean, not the last batch's loss divided by the
		# number of batches.
		epoch_loss = running_loss / max(n_batches, 1)
		history.append(epoch_loss)
		print(f"Epoch {epoch} Loss: {epoch_loss}")
	t1 = time.time()
	print(f"Training time: {t1-t0}")
	return history
	

In [None]:
# Train for 5 epochs with the optimizer and loss defined above.
train(model, train_data, optimizer, critereon, 5)

In [None]:
# Persist the current weights.
# NOTE(review): absolute, Kaggle-style output path - adjust when running elsewhere.
torch.save(model.state_dict(), '/kaggle/working/model1.pth')

In [None]:
# Assuming `model` is an instance of the same architecture you trained earlier.
# Load from the path this notebook saved to, and map the tensors onto the
# current device so a GPU checkpoint can be restored on a CPU-only kernel.
model.load_state_dict(torch.load('/kaggle/working/model1.pth', map_location=device))

In [None]:
# Switch to Adam with a cosine learning-rate schedule. The optimizer must be
# bound to `optimizer` so that the scheduler (and any later train() call)
# drives this Adam instance instead of the earlier Adadelta one.
# NOTE(review): nothing in the visible training loop calls scheduler.step(),
# so the schedule currently has no effect.
optimizer = torch.optim.Adam(model.parameters(), lr=0.006, betas=(0.8, 0.999), eps=1e-07, weight_decay=0.0001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_data)*20, eta_min=0.006*0.1)

In [None]:
def evaluation(model, validation_data, critereon):
	'''
	Compute the mean loss and the accuracy of ``model`` over ``validation_data``.

	:param model: module called as ``model(c_ids, q_ids, q_mask, c_mask)`` and
	              returning 2-class logits.
	:param validation_data: iterable yielding ``(q, c, labels)`` where ``q`` and
	                        ``c`` are mappings with ``'input_ids'`` and
	                        ``'attention_mask'`` tensors.
	:param critereon: loss called as ``critereon(logits, labels)``.
	:return: ``(mean_loss, accuracy)``; both are 0.0 when there is no data.
	'''
	# Use the device the model actually lives on rather than a notebook global.
	device = next(model.parameters()).device
	model.eval()
	running_loss = 0.0
	correct = 0
	total = 0
	n_batches = 0
	with torch.no_grad():
		for q, c, labels in validation_data:
			q_i = q['input_ids'].to(device)
			c_i = c['input_ids'].to(device)
			q_mask = q['attention_mask'].to(device)
			c_mask = c['attention_mask'].to(device)
			output = model(c_i, q_i, q_mask, c_mask)
			labels = labels.long().to(device)
			loss = critereon(output.view(-1, 2), labels.view(-1))
			running_loss += loss.item()
			# 1-D predictions so the comparison with `labels` is element-wise
			# (a [bs, 1] tensor would broadcast to [bs, bs]).
			predicted = output.argmax(dim=-1)
			total += labels.size(0)
			correct += (predicted == labels).sum().item()
			n_batches += 1
	mean_loss = running_loss / max(n_batches, 1)
	accuracy = correct / max(total, 1)
	print(f"Validation Loss: {mean_loss}")
	print(f"Accuracy: {accuracy}")
	return mean_loss, accuracy

In [None]:
# Report loss and accuracy on the validation subset.
evaluation(model, validation_data, critereon)

In [None]:
# Ask PyTorch's CUDA caching allocator to release its unused cached memory.
torch.cuda.empty_cache()

In [None]:
def predict(model, questions, sentences, tokenizer, max_len):
	'''
	Predict a class index for each (question, sentence) pair.

	:param model: module called as ``model(c_ids, q_ids, q_mask, c_mask)``.
	:param questions: list of question strings.
	:param sentences: list of sentence strings, aligned with ``questions``.
	:param tokenizer: callable invoked as ``tokenizer(texts, max_length=...,
	                  padding='max_length', truncation=True, return_tensors='pt')``.
	:param max_len: token length every input is padded / truncated to.
	:return: integer tensor of shape [n_pairs, 1] with the arg-max class.
	'''
	# Use the device the model actually lives on rather than a notebook global.
	device = next(model.parameters()).device
	model.eval()
	q = tokenizer(questions, max_length = max_len, padding='max_length', truncation=True, return_tensors='pt')
	c = tokenizer(sentences, max_length = max_len, padding='max_length', truncation=True, return_tensors='pt')
	q_i = q['input_ids'].to(device)
	c_i = c['input_ids'].to(device)
	q_mask = q['attention_mask'].to(device)
	c_mask = c['attention_mask'].to(device)
	with torch.no_grad():
		output = model(c_i, q_i, q_mask, c_mask)
	# keepdim=True keeps the [n_pairs, 1] shape the original call asked for.
	predicted = output.argmax(dim=-1, keepdim=True)
	return predicted