In [1]:
import torch

In [2]:
# Scratch check: append a trailing axis to the 3x3 integer tensor and expand it to size 4,
# so every entry is repeated 4 times along the new last dimension.
a = torch.tensor([[1,1,0], [1,1,1], [1, 0, 0]])
a.unsqueeze(-1).expand(-1,-1, 4)

tensor([[[1, 1, 1, 1],
         [1, 1, 1, 1],
         [0, 0, 0, 0]],

        [[1, 1, 1, 1],
         [1, 1, 1, 1],
         [1, 1, 1, 1]],

        [[1, 1, 1, 1],
         [0, 0, 0, 0],
         [0, 0, 0, 0]]])

In [3]:
# Scratch check: dim() reports the number of dimensions (3 for a 3x4x5 tensor).
torch.randn(3,4,5).dim()

3

In [7]:
# All-ones tensor of shape (2, 1, 4, 4), fed to torch.tril in the next cell.
b = torch.ones(2,1,4,4 )

In [8]:
# tril keeps the main diagonal and everything below it in each trailing 4x4 matrix;
# the two leading dimensions are treated as batch dimensions.
torch.tril(b, diagonal=0)

tensor([[[[1., 0., 0., 0.],
          [1., 1., 0., 0.],
          [1., 1., 1., 0.],
          [1., 1., 1., 1.]]],


        [[[1., 0., 0., 0.],
          [1., 1., 0., 0.],
          [1., 1., 1., 0.],
          [1., 1., 1., 1.]]]])

In [1]:
from transformers import BertTokenizer, BertModel



In [2]:
# Load the pretrained 'bert-base-multilingual-cased' tokenizer by name.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')


In [3]:
# Tokenize three sample sentences, padding/truncating each one to exactly 10 tokens
# and returning PyTorch tensors. Note that this rebinds the scratch name `a`.
source_sentences = ["Hello world", "How are you", "I am fine"]
a = tokenizer(source_sentences, padding="max_length", truncation=True, return_tensors='pt', max_length=10)

In [5]:
# Token ids, one row of length 10 per sentence; the trailing 0s in the output fill each row up to max_length.
a['input_ids']

tensor([[  101, 31178, 11356,   102,     0,     0,     0,     0,     0,     0],
        [  101, 14962, 10301, 13028,   102,     0,     0,     0,     0,     0],
        [  101,   146, 10392, 13435,   102,     0,     0,     0,     0,     0]])

In [1]:
# Training corpora: one sentence per line. strip() drops leading and trailing
# whitespace, including the newline at the end of every line.
with open('/home/trnmah/final_projectDL/src/MT/data/train-en-vi/train.en', 'r', encoding='utf-8') as file:
	en = [raw_line.strip() for raw_line in file]

with open('/home/trnmah/final_projectDL/src/MT/data/train-en-vi/train.vi', 'r', encoding='utf-8') as file:
	vi = [raw_line.strip() for raw_line in file]

In [2]:
# Held-out corpora read from the dev-2012 directory: one sentence per line,
# with leading/trailing whitespace (including the newline) stripped.
with open('/home/trnmah/final_projectDL/src/MT/data/dev-2012-en-vi/tst2012.en', 'r', encoding='utf-8') as file:
	en_valid = [raw_line.strip() for raw_line in file]

with open('/home/trnmah/final_projectDL/src/MT/data/dev-2012-en-vi/tst2012.vi', 'r', encoding='utf-8') as file:
	vi_valid = [raw_line.strip() for raw_line in file]

In [3]:
# Preview the first five sentences of each corpus, in the order en, vi, en_valid, vi_valid.
for corpus in (en, vi, en_valid, vi_valid):
	print(corpus[:5])

['Rachel Pike : The science behind a climate headline', 'In 4 minutes , atmospheric chemist Rachel Pike provides a glimpse of the massive scientific effort behind the bold headlines on climate change , with her team -- one of thousands who contributed -- taking a risky flight over the rainforest in pursuit of data on a key molecule .', 'I &apos;d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper .', 'Headlines that look like this when they have to do with climate change , and headlines that look like this when they have to do with air quality or smog .', 'They are both two branches of the same field of atmospheric science .']
['Khoa học đằng sau một tiêu đề về khí hậu', 'Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- m

In [4]:
# Fixed, contiguous windows into the parallel corpora. The same window is applied to
# the source and target lists so sentence pairs stay aligned. Validation and test
# windows are both cut from the dev-2012 lists and do not overlap.
train_window = slice(2269, 2269 + 4096)
valid_window = slice(269, 269 + 512)
test_window = slice(4, 4 + 256)

train_data_src, train_data_trg = en[train_window], vi[train_window]
valid_data_src, valid_data_trg = en_valid[valid_window], vi_valid[valid_window]
test_data_src, test_data_trg = en_valid[test_window], vi_valid[test_window]



In [5]:
from data import SentenceDataset
from model import TransformerMT
from transformers import BertTokenizerFast
from w2v import WordEmbedding
import torch.optim as optim
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
import time



In [6]:
# Fast tokenizer for 'bert-base-multilingual-cased'; the same instance is passed to all three datasets below.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-multilingual-cased')

In [7]:
# Wrap each (source, target) split in a SentenceDataset that shares the tokenizer
# and the 128-token max_length. Construction order: train, valid, test.
split_pairs = (
	(train_data_src, train_data_trg),
	(valid_data_src, valid_data_trg),
	(test_data_src, test_data_trg),
)
train_data, valid_data, test_data = (
	SentenceDataset(src_split, trg_split, tokenizer, max_length=128)
	for src_split, trg_split in split_pairs
)

In [8]:
# Batches of 32 for every split; only the training loader reshuffles its data.
loader_specs = ((train_data, True), (valid_data, False), (test_data, False))
train_loader, valid_loader, test_loader = (
	DataLoader(split, batch_size=32, shuffle=reshuffle)
	for split, reshuffle in loader_specs
)

In [9]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_backend)

In [10]:
# Vocabulary size reported by the tokenizer; reused below as the model's 'vocab_size'.
tokenizer.vocab_size

119547

In [11]:
# Hyper-parameters consumed by MultiLayerTransformerMT; the whole mapping is also
# forwarded to every TransformerMT layer.
args = dict(
	embed_size=256,
	num_layers=4,
	max_len=128,
	nhead=4,
	dropout=0.1,
	vocab_size=tokenizer.vocab_size,
	BERT=False,
	device=device,
)

In [17]:
class MultiLayerTransformerMT(nn.Module):
	"""Shared token embedding, a stack of ``TransformerMT`` layers, and a linear vocabulary head.

	``args`` is a mapping read for the keys ``vocab_size``, ``embed_size``,
	``max_len``, ``device``, ``BERT`` and ``num_layers``; the whole mapping is
	also handed to every ``TransformerMT`` layer.
	"""

	def __init__(self, args):
		super().__init__()
		vocab_size = args['vocab_size']
		embed_size = args['embed_size']
		# A single embedding module serves both the source and the target side.
		self.embeddings = WordEmbedding(vocab_size, embed_size, args['max_len'], args['device'], args['BERT'])
		stacked_layers = []
		for _ in range(args['num_layers']):
			stacked_layers.append(TransformerMT(args))
		self.transformer = nn.ModuleList(stacked_layers)
		self.head = nn.Linear(embed_size, vocab_size)

	def forward(self, src, tgt, src_mask, tgt_mask):
		"""Embed both inputs, run them through every layer, and project the target stream.

		Each layer returns updated ``(src, tgt, src_mask, tgt_mask)`` values that
		feed the next layer. The head output is flattened so that every leading
		dimension is merged into one: the result is 2-D with the vocabulary
		dimension last.
		"""
		src_state = self.embeddings(src)
		tgt_state = self.embeddings(tgt)
		for block in self.transformer:
			src_state, tgt_state, src_mask, tgt_mask = block(src_state, tgt_state, src_mask, tgt_mask)
		logits = self.head(tgt_state)
		vocab_dim = logits.size(-1)
		return logits.reshape(-1, vocab_dim)



In [18]:
# Build the network on the selected device, wrap it with torch.compile, and only
# then hand its parameters to AdamW.
model = torch.compile(MultiLayerTransformerMT(args).to(device))
adamw_settings = {'lr': 0.001, 'weight_decay': 0.01}
optimizer = optim.AdamW(model.parameters(), **adamw_settings)
	

In [19]:
# Cross-entropy loss applied to the flattened vocabulary logits, moved to the selected device.
# NOTE(review): no ignore_index is set, so every target position contributes to the
# loss — confirm whether padding positions should be excluded.
critertion = nn.CrossEntropyLoss().to(device)

In [20]:
def train(model, data, optimizer, critertion, device, epochs=1, log_every=1000):
	"""Train ``model`` with teacher forcing under bfloat16 autocast.

	Args:
		model: callable as ``model(src, tgt, src_mask, tgt_mask)`` returning logits
			flattened to ``(N, vocab)``.
		data: iterable of batches; each batch is a mapping with the tensors
			``'src'``, ``'tgt'``, ``'src_mask'`` and ``'tgt_mask'``.
		optimizer: optimizer over the model's parameters.
		critertion: loss taking ``(logits, flat_targets)``.
		device: a ``torch.device`` or a device string such as ``'cuda'`` / ``'cpu'``.
		epochs: number of passes over ``data``.
		log_every: print the running mean loss every this many steps
			(``0`` / ``None`` disables the step log).

	Prints the running mean loss periodically, the mean loss of every epoch, and
	the total wall-clock time. Returns ``None``.
	"""
	model.train()
	# torch.autocast requires the device *type* as a string, while callers pass
	# either a string or a torch.device; normalise once up front.
	device_type = torch.device(device).type
	start = time.time()
	for epoch in range(epochs):
		# Reset per epoch so the reported mean reflects this epoch only.
		running_loss = 0.0
		num_steps = 0
		for step, batch in enumerate(data):
			src = batch['src'].to(device)
			tgt = batch['tgt'].to(device)
			src_mask = batch['src_mask'].to(device)
			tgt_mask = batch['tgt_mask'].to(device)
			optimizer.zero_grad()
			with torch.autocast(device_type=device_type, dtype=torch.bfloat16):
				# Teacher forcing: the decoder sees tokens [0, T-1) and must predict tokens [1, T).
				output = model(src, tgt[:, :-1], src_mask, tgt_mask[:, :-1])
				loss = critertion(output, tgt[:, 1:].reshape(-1))
			loss.backward()
			optimizer.step()
			if device_type == 'cuda':
				# Only meaningful (and only available) when running on a CUDA device.
				torch.cuda.synchronize()
			running_loss += loss.item()
			num_steps = step + 1
			if log_every and num_steps % log_every == 0:
				print(f'Epoch: {epoch}, step: {step}, Loss: {running_loss / num_steps}')
		if num_steps:
			print(f'Epoch: {epoch}, mean loss: {running_loss / num_steps}')
	end = time.time()
	print(f'Time taken: {end-start}')

In [21]:
# Train for 5 epochs on the training loader. The recorded run below was stopped by hand (KeyboardInterrupt).
train(model, train_loader, optimizer, critertion, device, epochs=5)

KeyboardInterrupt: 

In [None]:
import time
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [None]:
def evaluation(model, data, critertion, device):
	"""Report the mean loss and the mean sentence-level BLEU of ``model`` over ``data``.

	Args:
		model: callable as ``model(src, tgt, src_mask, tgt_mask)`` returning
			vocabulary logits with the vocabulary dimension last.
		data: iterable of batches; each batch is a mapping with the tensors
			``'src'``, ``'tgt'``, ``'src_mask'`` and ``'tgt_mask'``.
		critertion: loss taking ``(logits, flat_targets)``.
		device: device the batch tensors are moved to.

	Returns:
		``(mean_loss, mean_bleu)``: loss averaged over batches and BLEU averaged
		over sentences. Both are ``nan`` when ``data`` yields no batches.

	Predictions are sampled from the 3 most probable tokens at every position
	(teacher-forced inputs), so BLEU varies from run to run unless the global
	torch seed is fixed.
	"""
	model.eval()
	smoother = SmoothingFunction().method4
	total_loss = 0.0
	total_bleu = 0.0
	total_samples = 0
	num_batches = 0
	t0 = time.time()
	with torch.no_grad():
		for batch in data:
			src = batch['src'].to(device)
			tgt = batch['tgt'].to(device)
			src_mask = batch['src_mask'].to(device)
			tgt_mask = batch['tgt_mask'].to(device)

			output = model(src, tgt[:, :-1], src_mask, tgt_mask[:, :-1])
			vocab = output.size(-1)
			logits = output.reshape(-1, vocab)
			# The model is fed tokens [0, T-1), so its predictions line up with tokens [1, T).
			reference = tgt[:, 1:]
			loss = critertion(logits, reference.reshape(-1))

			# multinomial needs non-negative weights, so sample from softmax probabilities, not raw logits.
			probs = F.softmax(logits.float(), dim=-1)
			topk_prob, topk_ids = probs.topk(k=min(3, vocab), dim=-1)
			choice = torch.multinomial(topk_prob, num_samples=1)
			predicted = torch.gather(topk_ids, -1, choice).view(reference.size(0), -1)

			# BLEU is computed on token-id sequences.
			# NOTE(review): padding positions are not removed before scoring — confirm whether they should be.
			for ref_ids, pred_ids in zip(reference.tolist(), predicted.tolist()):
				total_bleu += sentence_bleu([ref_ids], pred_ids, smoothing_function=smoother)
			total_samples += src.size(0)
			total_loss += loss.item()
			num_batches += 1
	t1 = time.time()
	mean_loss = total_loss / num_batches if num_batches else float('nan')
	mean_bleu = total_bleu / total_samples if total_samples else float('nan')
	print(f"Evaluation time: {t1-t0}, Loss: {mean_loss}, BLEU: {mean_bleu}")
	return mean_loss, mean_bleu

In [None]:
def get_score_bleu():
	"""Placeholder for a BLEU-scoring helper.

	The original cell held only an unfinished signature (no colon, no body) and
	could not be executed. Until the helper is written it fails loudly.

	Raises:
		NotImplementedError: always.
	"""
	raise NotImplementedError("get_score_bleu has not been implemented yet")

In [17]:
# Id stored in the tokenizer's cls_token_id; generate_translation uses it as the first target token.
tokenizer.cls_token_id

101

In [None]:
def generate_translation(model, sentences, device, max_new_tokens=127, top_k=8, seed=123, text_tokenizer=None):
	"""Autoregressively sample target token ids for ``sentences`` with top-k sampling.

	Args:
		model: callable as ``model(src, tgt, src_mask, tgt_mask)`` returning
			vocabulary logits with the vocabulary dimension last, either flattened
			to ``(batch * tgt_len, vocab)`` or shaped ``(batch, tgt_len, vocab)``.
		sentences: a single string or a list of strings to translate.
		device: device used for the tensors and for the sampling generator.
		max_new_tokens: maximum number of tokens generated after the start token.
			The default keeps the target within the 128-token length used for
			the source.
		top_k: sample among this many most probable tokens at each step.
		seed: seed of the dedicated sampling generator, for reproducible output.
		text_tokenizer: tokenizer to use; defaults to the notebook-level ``tokenizer``.

	Returns:
		LongTensor of shape ``(batch, 1 + generated_len)``. Every row starts with
		``cls_token_id``. A row that has emitted ``sep_token_id`` is continued
		with the padding id, and generation stops once all rows have done so.
	"""
	if text_tokenizer is None:
		text_tokenizer = tokenizer
	if isinstance(sentences, str):
		sentences = [sentences]

	model.eval()
	encoded = text_tokenizer(sentences, padding="max_length", truncation=True, return_tensors='pt', max_length=128)
	src = encoded['input_ids'].to(device)
	src_mask = encoded['attention_mask'].to(device)
	batch_size = src.size(0)

	sample_rng = torch.Generator(device=device)
	sample_rng.manual_seed(seed)

	pad_id = getattr(text_tokenizer, 'pad_token_id', None)
	if pad_id is None:
		pad_id = 0
	sep_id = getattr(text_tokenizer, 'sep_token_id', None)

	tgt = torch.full((batch_size, 1), text_tokenizer.cls_token_id, dtype=torch.long, device=device)
	finished = torch.zeros(batch_size, dtype=torch.bool, device=device)
	with torch.no_grad():
		for _ in range(max_new_tokens):
			# Same (batch, tgt_len) layout as the mask passed during training; every generated position is valid.
			# NOTE(review): assumes the model applies causal masking itself — confirm against TransformerMT.
			tgt_mask = torch.ones(tgt.shape[0], tgt.shape[1], dtype=src_mask.dtype, device=device)
			output = model(src, tgt, src_mask, tgt_mask)
			# Restore the (batch, tgt_len, vocab) layout and keep only the newest position.
			next_token = output.reshape(batch_size, tgt.size(1), -1)[:, -1, :]
			next_token = F.softmax(next_token.float(), dim=-1)
			topk_prob, topk_idx = torch.topk(next_token, k=min(top_k, next_token.size(-1)), dim=-1)
			choice = torch.multinomial(topk_prob, num_samples=1, generator=sample_rng)
			actual_token = topk_idx.gather(dim=-1, index=choice)
			# Rows that already ended only receive padding from here on.
			actual_token = actual_token.masked_fill(finished.unsqueeze(-1), pad_id)
			tgt = torch.cat([tgt, actual_token], dim=1)
			if sep_id is not None:
				finished = finished | (actual_token.squeeze(-1) == sep_id)
				if bool(finished.all()):
					break
	return tgt

In [None]:
def accuracy(model, data, device, ignore_index=None):
	"""Token-level next-token accuracy of ``model`` over ``data`` (teacher-forced).

	Args:
		model: callable as ``model(src, tgt, src_mask, tgt_mask)`` returning
			vocabulary logits with the vocabulary dimension last.
		data: iterable of batches; each batch is a mapping with the tensors
			``'src'``, ``'tgt'``, ``'src_mask'`` and ``'tgt_mask'``.
		device: device the batch tensors are moved to.
		ignore_index: optional target id to leave out of the count (for example
			the padding id). ``None`` counts every position.

	Returns:
		Fraction of counted target positions whose arg-max prediction equals the
		target, as a float in ``[0, 1]``; ``0.0`` when nothing was counted.
	"""
	model.eval()
	correct = 0
	counted = 0
	with torch.no_grad():
		for batch in data:
			src = batch['src'].to(device)
			tgt = batch['tgt'].to(device)
			src_mask = batch['src_mask'].to(device)
			tgt_mask = batch['tgt_mask'].to(device)
			output = model(src, tgt[:, :-1], src_mask, tgt_mask[:, :-1])
			predicted = output.reshape(-1, output.size(-1)).argmax(dim=-1)
			# reshape (not view): the shifted slice of tgt is not contiguous.
			expected = tgt[:, 1:].reshape(-1)
			hits = predicted == expected
			if ignore_index is not None:
				keep = expected != ignore_index
				hits = hits & keep
				counted += int(keep.sum())
			else:
				counted += expected.numel()
			correct += int(hits.sum())
	return correct / counted if counted else 0.0


In [2]:
import torch

In [None]:
# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
_backend = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_backend)

In [3]:
# Scratch check: selecting a single column of a 3x4 tensor drops that dimension,
# giving shape (3,) rather than (3, 1).
a = torch.randn(3,4)
a[:, 1].shape

torch.Size([3])

In [4]:
# Display the random 3x4 tensor created in the previous cell.
a

tensor([[ 0.7775,  1.0295,  0.0726, -0.1331],
        [ 0.2277, -0.3164,  0.5758, -0.4550],
        [-1.9309,  1.7030, -0.3719, -0.1745]])

In [10]:
# Small embedding table (10 entries, 3 dimensions each) for the shape check below.
# NOTE(review): this reuses the name `model`, which earlier cells in this file bind to the translation model.
model = torch.nn.Embedding(10,3)

In [13]:
# topk with k=1 keeps the last dimension with size 1: values and indices have shape (3, 1).
# Embedding those indices gives (3, 1, 3); squeezing the trailing axis first gives (3, 3).
topk_prob, topk_idx = torch.topk(a, k=1, dim=-1)
print(topk_prob,"\n", topk_prob.shape,  "\n", model(topk_idx).shape, '\n', model(topk_idx.squeeze(-1)).shape)

tensor([[1.0295],
        [0.5758],
        [1.7030]]) 
 torch.Size([3, 1]) 
 torch.Size([3, 1, 3]) 
 torch.Size([3, 3])
