In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from data import SentenceDataset
import time
from transformers import AutoConfig, get_linear_schedule_with_warmup

In [5]:
# Check whether PyTorch can see a usable CUDA device; the result is shown as the cell output.
torch.cuda.is_available()


False

In [2]:
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
# from transformers import BertTokenizerFast

class SentenceDataset(Dataset):
	"""Parallel-sentence dataset that tokenizes one (source, target) pair per item.

	Args:
		src_sentence: Indexable collection of source-language sentences.
		tgt_sentence: Indexable collection of target-language sentences, aligned
			index-by-index with ``src_sentence``.
		tokenizer: Callable that accepts ``text_target``, ``padding``,
			``truncation``, ``return_tensors`` and ``max_length`` and returns a
			mapping with ``input_ids``, ``attention_mask`` and ``labels``.
		max_length: Length every encoded sequence is padded / truncated to.
		label_pad_token_id: Value written over padding positions in ``labels``.
			The default ``-100`` is the index that PyTorch's cross-entropy loss
			ignores, so padding does not contribute to the training loss. Pass
			``None`` to keep the tokenizer's own pad id in the labels.

	Raises:
		ValueError: If the source and target collections differ in length.
	"""

	def __init__(self, src_sentence, tgt_sentence, tokenizer, max_length, label_pad_token_id=-100):
		if len(src_sentence) != len(tgt_sentence):
			raise ValueError(
				"source and target must be aligned: got {} source and {} target sentences".format(
					len(src_sentence), len(tgt_sentence)
				)
			)
		self.src = src_sentence
		self.tgt = tgt_sentence
		self.tokenizer = tokenizer
		self.max_length = max_length
		self.label_pad_token_id = label_pad_token_id

	def __len__(self):
		"""Return the number of sentence pairs."""
		return len(self.src)

	def __getitem__(self, idx):
		"""Tokenize pair ``idx`` and return its tensors without the leading batch dimension."""
		encoded = self.tokenizer(
			self.src[idx],
			text_target=self.tgt[idx],
			padding='max_length',
			truncation=True,
			return_tensors="pt",
			max_length=self.max_length,
		)

		# squeeze(0) drops only the batch dimension added by return_tensors="pt";
		# a bare squeeze() would also collapse the sequence dimension when it has size 1.
		labels = encoded['labels'].squeeze(0)

		# Replace padding in the labels so the loss is not computed on pad positions.
		pad_id = getattr(self.tokenizer, 'pad_token_id', None)
		if self.label_pad_token_id is not None and pad_id is not None:
			labels = labels.masked_fill(labels == pad_id, self.label_pad_token_id)

		return {
			'input_ids': encoded['input_ids'].squeeze(0),
			'attention_mask': encoded['attention_mask'].squeeze(0),
			'labels': labels,
		}

In [2]:
# Root folder of the parallel corpus.
# NOTE(review): machine-specific absolute path; change it here when running elsewhere.
DATA_DIR = '/home/trnmah/final_projectDL/src/MT/data'


def read_lines(path):
	"""Return every line of the UTF-8 text file at ``path`` with surrounding whitespace stripped."""
	with open(path, 'r', encoding='utf-8') as file:
		return [line.strip() for line in file]


def load_parallel(en_path, vi_path):
	"""Read an English/Vietnamese file pair and check that both have the same number of lines.

	Raises:
		ValueError: If the two files differ in line count, which would silently
			misalign every sentence pair taken from them.
	"""
	en_lines = read_lines(en_path)
	vi_lines = read_lines(vi_path)
	if len(en_lines) != len(vi_lines):
		raise ValueError(
			"parallel files are misaligned: {} has {} lines, {} has {} lines".format(
				en_path, len(en_lines), vi_path, len(vi_lines)
			)
		)
	return en_lines, vi_lines


en, vi = load_parallel(
	f'{DATA_DIR}/train-en-vi/train.en',
	f'{DATA_DIR}/train-en-vi/train.vi',
)
en_valid, vi_valid = load_parallel(
	f'{DATA_DIR}/dev-2012-en-vi/tst2012.en',
	f'{DATA_DIR}/dev-2012-en-vi/tst2012.vi',
)

# Training subset: 4096 pairs starting at line 2269 of the training files.
train_data_src = en[2269:2269 + 4096]
train_data_trg = vi[2269:2269 + 4096]
# Validation and test subsets are both cut from the tst2012 files; the two
# index ranges (269..780 and 4..259) do not overlap.
valid_data_src = en_valid[269:269 + 512]
valid_data_trg = vi_valid[269:269 + 512]
test_data_src = en_valid[4:4 + 256]
test_data_trg = vi_valid[4:4 + 256]

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  return torch._C._cuda_getDeviceCount() > 0


In [4]:
# Load the pretrained "vinai/vinai-translate-en2vi-v2" checkpoint. The tokenizer is
# configured with en_XX as the source language and vi_VN as the target language.
tokenizer = AutoTokenizer.from_pretrained("vinai/vinai-translate-en2vi-v2", src_lang="en_XX", tgt_lang="vi_VN")
model = AutoModelForSeq2SeqLM.from_pretrained("vinai/vinai-translate-en2vi-v2")
# Move the model to the selected device; as the cell's last expression this also
# displays the module structure.
model.to(device)

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(66773, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(66773, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): 

In [49]:
# Sanity check: encode one English/Vietnamese pair, padded or truncated to 10 tokens,
# to inspect what the tokenizer produces for inputs and labels.
test = tokenizer("hello", text_target='Xin chào', padding='max_length', truncation=True, return_tensors="pt", max_length=10)

In [50]:
# Decode the ids back to text so the special tokens and padding added to the
# source and to the labels are visible.
print(tokenizer.decode(test['input_ids'].squeeze()), tokenizer.decode(test['labels'].squeeze()))

hello</s>en_XX<pad><pad><pad><pad><pad><pad> Xin chào</s>vi_VN<pad><pad><pad><pad><pad><pad>


In [51]:
# Show the raw source token ids of the sanity-check encoding.
test['input_ids']

tensor([[ 6268,    89,     2, 66750,     1,     1,     1,     1,     1,     1]])

In [77]:
# Wrap each split in a SentenceDataset. Training pairs use a maximum length of 32
# tokens, validation and test pairs a maximum length of 128.
train_dataset, valid_dataset, test_dataset = (
	SentenceDataset(sources, targets, tokenizer, token_limit)
	for sources, targets, token_limit in (
		(train_data_src, train_data_trg, 32),
		(valid_data_src, valid_data_trg, 128),
		(test_data_src, test_data_trg, 128),
	)
)

In [78]:
# One DataLoader per split: training is shuffled and fed one pair at a time,
# validation and test keep their order and use batches of 32.
train_loader, valid_loader, test_loader = (
	torch.utils.data.DataLoader(split, batch_size=pairs_per_batch, shuffle=reshuffle)
	for split, pairs_per_batch, reshuffle in (
		(train_dataset, 1, True),
		(valid_dataset, 32, False),
		(test_dataset, 32, False),
	)
)

In [54]:
# Length of the run: epochs, and the resulting number of optimizer steps
# (batches per epoch times epochs), which the LR schedule needs.
epochs = 1
total_steps = epochs * len(train_loader)

# AdamW over all model parameters.
optimizer = optim.AdamW(
	params=model.parameters(),
	lr=3e-4,
	betas=(0.9, 0.98),
	eps=1e-6,
	weight_decay=1e-5,
)

In [55]:
# Linear warmup over the first 1% of the steps, then linear decay until total_steps.
scheduler = get_linear_schedule_with_warmup(
	optimizer=optimizer,
	num_training_steps=total_steps,
	num_warmup_steps=int(total_steps * 0.01),
)

In [79]:
# Training loop
def train(model, loader, optimizer, scheduler, epochs, device, log_every=200):
	"""Train ``model`` on ``loader`` for ``epochs`` passes and return the per-epoch mean loss.

	Args:
		model: Module whose forward call accepts the batch dict as keyword
			arguments and returns an object exposing a ``.loss`` tensor.
		loader: Sized iterable of batches; each batch is a dict of tensors.
		optimizer: Optimizer stepped once per batch.
		scheduler: Learning-rate scheduler stepped once per batch.
		epochs: Number of full passes over ``loader``.
		device: Device every batch tensor is moved to before the forward pass.
		log_every: Print a progress line every this many batches (0 disables it).

	Returns:
		list[float]: Mean training loss of each epoch (``nan`` for an epoch in
		which ``loader`` produced no batches).

	Random seeds are not set here; seed in the caller if reproducibility is needed.
	"""
	loss_values = []
	num_batches = len(loader)

	for epoch_i in range(epochs):
		print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
		t0 = time.time()
		total_loss = 0.0
		model.train()

		for step, batch in enumerate(loader):
			if log_every and step % log_every == 0 and step != 0:
				# `step` batches have been accumulated so far (indices 0..step-1),
				# so the running mean divides by `step`, not `step + 1`.
				print('  Batch {:>1,}  of  {:>1,}.    Elapsed: {:.1f}s, Loss {:}'.format(
					step, num_batches, time.time() - t0, total_loss / step))

			# The model lives on `device`; its inputs must be there as well.
			batch = {k: v.to(device) for k, v in batch.items()}

			# Clear gradients left over from the previous step.
			model.zero_grad()

			# The batch carries `labels`, so the model output exposes the loss.
			outputs = model(**batch)
			loss = outputs.loss

			loss.backward()
			# Clip the global gradient norm to 1.0 before the parameter update.
			torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
			optimizer.step()
			scheduler.step()
			total_loss += loss.item()

		avg_train_loss = total_loss / num_batches if num_batches else float('nan')
		loss_values.append(avg_train_loss)
		print(f"Average training loss: {avg_train_loss:.2f}")

	return loss_values

In [80]:
# Fine-tune `model` on `train_loader` for `epochs` epochs using the optimizer and
# scheduler configured earlier in the notebook.
train(model, train_loader, optimizer, scheduler, epochs, device)



KeyboardInterrupt: 

In [None]:
# Interactive exploration only: prints the help text for the tokenizer object.
# NOTE(review): leftover debugging aid; consider removing it from the final notebook.
help(tokenizer)	

In [27]:
# Print the decoded `prefix_tokens` and `suffix_tokens` attributes of the tokenizer.
print(tokenizer.decode(tokenizer.prefix_tokens), tokenizer.decode(tokenizer.suffix_tokens))

 </s>en_XX


In [10]:
# Peek at the second line of the Vietnamese training file.
print(vi[1])

Trong 4 phút , chuyên gia hoá học khí quyển Rachel Pike giới thiệu sơ lược về những nỗ lực khoa học miệt mài đằng sau những tiêu đề táo bạo về biến đổi khí hậu , cùng với đoàn nghiên cứu của mình -- hàng ngàn người đã cống hiến cho dự án này -- một chuyến bay mạo hiểm qua rừng già để tìm kiếm thông tin về một phân tử then chốt .


In [5]:
# evaluate the model, get predictions and actuals
# BLEU score
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def compute_bleu(predictions, actuals):
	"""Return the corpus-level BLEU score (0-100) of ``predictions`` against ``actuals``.

	Args:
		predictions: One hypothesis per sentence, each either a string
			(tokenized on whitespace) or an already tokenized sequence.
		actuals: One entry per sentence, aligned with ``predictions``. An entry
			is either a single reference string, or a list of references for
			that sentence (each a string or an already tokenized sequence).

	Returns:
		float: Smoothed (method4) corpus BLEU multiplied by 100; ``0.0`` when
		there are no sentences.

	Raises:
		ValueError: If ``predictions`` and ``actuals`` differ in length.
	"""
	def _tokens(sentence):
		# BLEU counts n-grams over tokens, so raw strings are split on whitespace.
		return sentence.split() if isinstance(sentence, str) else list(sentence)

	if len(predictions) != len(actuals):
		raise ValueError(
			"predictions and actuals must be aligned: got {} predictions and {} references".format(
				len(predictions), len(actuals)
			)
		)
	if len(predictions) == 0:
		return 0.0

	hypotheses = [_tokens(prediction) for prediction in predictions]
	list_of_references = []
	for references in actuals:
		if isinstance(references, str):
			references = [references]
		list_of_references.append([_tokens(reference) for reference in references])

	# corpus_bleu expects one list of tokenized references per tokenized hypothesis.
	bleu_score = corpus_bleu(
		list_of_references=list_of_references,
		hypotheses=hypotheses,
		smoothing_function=SmoothingFunction().method4,
	) * 100
	return bleu_score


In [None]:
def generate_predictions(model, loader, device):
	"""Return the mean loss of ``model`` over every batch in ``loader``.

	Despite its name, this function does not decode any predictions; it only
	averages the loss reported by the model.

	Args:
		model: Module whose forward call accepts the batch dict as keyword
			arguments and returns an object exposing a ``.loss`` tensor.
		loader: Sized iterable of batches; each batch is a dict of tensors.
		device: Device every batch tensor is moved to before the forward pass.

	Returns:
		The sum of the per-batch losses divided by the number of batches
		(a tensor, since the per-batch losses are tensors).

	Raises:
		ValueError: If ``loader`` contains no batches.
	"""
	num_batches = len(loader)
	if num_batches == 0:
		raise ValueError("loader contains no batches; cannot average the loss")

	# Evaluate with dropout disabled, then put the model back in whatever mode
	# it was in so a surrounding training loop is not affected.
	was_training = model.training
	model.eval()
	running_loss = 0
	try:
		# No gradients are needed for evaluation; skipping them saves memory and time.
		with torch.no_grad():
			for batch in loader:
				batch = {k: v.to(device) for k, v in batch.items()}
				running_loss += model(**batch).loss
	finally:
		model.train(was_training)

	return running_loss / num_batches

In [1]:
import torch

In [19]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): this repeats the device selection done in an earlier cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [6]:
def translate_en2vi(en_text, tokenizer_en2vi, model_en2vi, max_len) -> list:
	"""Translate English text to Vietnamese with beam search.

	Args:
		en_text: A single English sentence or a list of sentences.
		tokenizer_en2vi: Tokenizer matching ``model_en2vi``; it must expose
			``lang_code_to_id`` and ``batch_decode``.
		model_en2vi: Seq2seq model exposing ``generate`` and ``device``.
		max_len: Maximum number of source tokens; longer inputs are truncated.

	Returns:
		list[str]: One decoded translation per input sentence (a one-element
		list when ``en_text`` is a single string).
	"""
	if isinstance(en_text, (list, tuple)) and len(en_text) == 0:
		return []

	# Pad only up to the longest sentence in the batch; truncation still enforces max_len.
	encoded = tokenizer_en2vi(
		en_text,
		padding=True,
		truncation=True,
		max_length=max_len,
		return_tensors="pt",
	)

	# Send the inputs to wherever the model actually lives instead of relying
	# on a notebook-level `device` variable.
	target_device = model_en2vi.device
	output_ids = model_en2vi.generate(
		encoded["input_ids"].to(target_device),
		# Pass the mask explicitly so padded positions are ignored by the encoder.
		attention_mask=encoded["attention_mask"].to(target_device),
		decoder_start_token_id=tokenizer_en2vi.lang_code_to_id["vi_VN"],
		num_return_sequences=1,
		num_beams=5,
		early_stopping=True,
	)
	return tokenizer_en2vi.batch_decode(output_ids, skip_special_tokens=True)

# Smoke test: translate two short English sentences with a source limit of 32 tokens
# and print the resulting list of translations.
en_text = ["How are you? Are you okay?", "Nice to meet you"]
print(translate_en2vi(en_text, tokenizer, model, 32))

# en_text = "i haven't been to a public gym before when i exercise in a private space i feel more comfortable"
# print(translate_en2vi(en_text))

['Anh khỏe không?', 'Rất vui được gặp anh.']


In [7]:
# generation and calculation of BLEU score
def generate_and_calc_bleu(dataset, target, batch_size, tokenizer, model, device, max_len=128):
	"""Translate every sentence in ``dataset`` and score the result against ``target`` with BLEU.

	Args:
		dataset: List of raw English source sentences.
		target: List of reference Vietnamese sentences, aligned with ``dataset``.
		batch_size: Number of sentences translated per call to ``translate_en2vi``.
		tokenizer: Tokenizer handed to ``translate_en2vi``.
		model: Translation model; it is switched to evaluation mode.
		device: Unused; kept so existing calls keep working.
		max_len: Maximum number of source tokens per sentence.

	Returns:
		The value returned by ``compute_bleu`` for the whole dataset, or ``0.0``
		when ``dataset`` is empty.

	Raises:
		ValueError: If ``dataset`` and ``target`` differ in length.
	"""
	if len(dataset) != len(target):
		raise ValueError(
			"dataset and target must be aligned: got {} sources and {} references".format(
				len(dataset), len(target)
			)
		)

	model.eval()

	# Collect the translations of all batches; the score is computed once at the
	# end so it covers the whole dataset rather than only the first batch.
	vi_predict = []
	for i in range(0, len(dataset), batch_size):
		batch = dataset[i:i+batch_size]
		vi_predict.extend(translate_en2vi(batch, tokenizer, model, max_len))

	if not vi_predict:
		return 0.0

	# One single-element reference list per source sentence, aligned with vi_predict.
	vi_target = [[s] for s in target]
	return compute_bleu(vi_predict, vi_target)

In [9]:
# Ask PyTorch to release cached GPU memory that it is no longer using.
torch.cuda.empty_cache()

In [10]:
# Call the BLEU evaluation helper with the validation source/target sentences and batch_size=32.
generate_and_calc_bleu(valid_data_src, valid_data_trg, 32, tokenizer, model, device)

KeyboardInterrupt: 

In [1]:
# Import torch and check again whether a CUDA device is usable.
import torch; torch.cuda.is_available()

  return torch._C._cuda_getDeviceCount() > 0


False