In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# from data import SentenceDataset
# from w2v import Word2Vec
# from model import EncoderRNN, DecoderRNN
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import time

In [2]:
import sys
print(sys.version)

3.10.13 | packaged by conda-forge | (main, Dec 23 2023, 15:36:39) [GCC 12.3.0]


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
print(torch.version.cuda)


12.1


In [5]:
class SentenceDataset(Dataset):
    def __init__(self, src_sentence, tgt_sentence, tokenizer, max_length):
        self.src = src_sentence 
        self.tgt = tgt_sentence
        self.tokenizer = tokenizer
        self.max_length = max_length 

    def get_tokenized_sentences(self, sentence):
        tokenized_sentence = self.tokenizer(sentence, padding='max_length', truncation=True, return_tensors="pt", max_length=self.max_length)
        return tokenized_sentence['input_ids']

    def __len__(self):
        return len(self.src)
    
    def __getitem__(self, idx):
        tokenized_src = self.get_tokenized_sentences(self.src[idx])
        tokenized_tgt = self.get_tokenized_sentences(self.tgt[idx])
        return {
            'src': tokenized_src.squeeze(0),
            'tgt': tokenized_tgt.squeeze(0),
        }

In [6]:
class Word2Vec(nn.Module):
	def __init__(self, vocab_size, embed_size, BERT = False): 
		super(Word2Vec, self).__init__()
		if BERT:
			model = BertModel.from_pretrained('bert-base-multilingual-cased')
			self.embeddings = model.embeddings.word_embeddings
			self.embeddings.requires_grad_(False)
		else:	
			self.embeddings = nn.Embedding(vocab_size, embed_size)
			torch.nn.init.normal_(self.embeddings.weight, mean=0, std=0.02)
	def forward(self, x):
		x = self.embeddings(x)
		return x

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class EncoderRNN(nn.Module):
	def __init__(self,vocab_size, input_size, hidden_size, BERT, dropout=0.1):
		super(EncoderRNN, self).__init__()
		self.embedding = Word2Vec(vocab_size, input_size, BERT)
		self.hidden_size = hidden_size
		self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
		self.dropout = nn.Dropout(dropout)

	def forward(self, input):
		embedded = self.dropout(self.embedding(input))
		output, hidden = self.rnn(embedded)
		return output, hidden

# We should input both encoder and decoder is embedding vector, not input index

class DecoderRNN(nn.Module):
	def __init__(self, vocab_size, input_size, hidden_size, sos_token, max_length, BERT, generator):
		super(DecoderRNN, self).__init__()
		self.rnn= nn.RNN(input_size, hidden_size, batch_first=True)
		self.out = nn.Linear(hidden_size, vocab_size)
		self.sos_token = sos_token # Start of Sentence token
		self.max_length = max_length # Maximum length of the output sequence
		self.embedding = Word2Vec(vocab_size, input_size, BERT)
		self.generator = generator

	def forward(self, encoder_outputs, encoder_hidden, device, target_tensor=None):
		batch_size = encoder_outputs.size(0)
		# decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(self.sos_token)
		decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=device).fill_(self.sos_token)
		decoder_hidden = encoder_hidden
		decoder_outputs = []
		# target_tensor_size= target_tensor.size(1) # length of the target sequence
		generated_tokens = decoder_input

		for i in range(self.max_length): # input not include sos token, it range from 1 to max_length-1
			decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
			decoder_outputs.append(decoder_output)

			if target_tensor is not None:
				# Teacher forcing: Feed the target as the next input
				decoder_input = target_tensor[:, i].unsqueeze(-1)  # Teacher forcing
			else:
				# Without teacher forcing: use its own predictions as the next input
				# _, topi = decoder_output.topk(1) 
				decoder_output = decoder_output.squeeze(1) # decoder_output.view(-1, decoder_output.size(-1))  | // decoder_output: [bs, 1, vocab_size] -> [bs, vocab_size
				topk_pros, topk_ids  = decoder_output.topk(5, dim=-1) # topk_ids: [batch_size, 5]
				ix = torch.multinomial(topk_pros, num_samples=1, generator=self.generator) # sample from the topk_pros [batch_size, 1]
				xcol = torch.gather(topk_ids, -1, ix) # gather the topk_ids with the index ix

				decoder_input = xcol.detach()  # detach from history as input to the next time step
				generated_tokens = torch.cat((generated_tokens, decoder_input), dim=1)


		decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
		decoder_outputs.append(decoder_output)
		
		decoder_outputs = torch.cat(decoder_outputs, dim=1)
		return decoder_outputs, generated_tokens

	def forward_step(self, input, hidden):
		output = self.embedding(input)
		output = F.relu(output)
		output, hidden = self.rnn(output, hidden)
		output = self.out(output)
		return output, hidden

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [25]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load data
en = []
with open('/kaggle/input/machinetranslationenvi/train.en', 'r', encoding='utf-8') as file:
	for line in file:
		en.append(line.strip())  # strip() removes trailing newline characters

vi = []
with open('/kaggle/input/machinetranslationenvi/train.vi', 'r', encoding='utf-8') as file:
	for line in file:
		vi.append(line.strip())  # strip() removes trailing newline characters
		
en_valid = []
with open('/kaggle/input/machinetranslationenvi/tst2012.en', 'r', encoding='utf-8') as file:
	for line in file:
		en_valid.append(line.strip())  # strip() removes trailing newline characters

vi_valid = []
with open('/kaggle/input/machinetranslationenvi/tst2012.vi', 'r', encoding='utf-8') as file:
	for line in file:
		vi_valid.append(line.strip())  # strip() removes trailing newline characters

train_data_src = en[2269:(2269+4096)]
train_data_trg= vi[2269:(2269+4096)]
valid_data_src = en_valid[269:(269+512)]
valid_data_trg= vi_valid[269:(269+512)]
test_data_src = en_valid[4:(4+256)]
test_data_trg= vi_valid[4:(4+256)]

train_data = SentenceDataset(train_data_src, train_data_trg, tokenizer, max_length=64)
valid_data = SentenceDataset(valid_data_src, valid_data_trg, tokenizer, max_length=64)
test_data = SentenceDataset(test_data_src, test_data_trg, tokenizer, max_length=64)

In [26]:
train_data[0]['src'].shape

torch.Size([64])

In [61]:
train_loader = DataLoader(train_data, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_data, batch_size=64, shuffle=False)
test_loader = DataLoader(test_data, batch_size=32, shuffle=False)


In [28]:
class Seq2Seq(nn.Module):
	def __init__(self, config):
		super(Seq2Seq, self).__init__()

		self.encoder = EncoderRNN(config['vocab_size'],config['input_size'], config['hidden_size'], \
							 config['BERT'], config['dropout'])
		self.decoder = DecoderRNN(config['vocab_size'], config['input_size'], config['hidden_size'], \
							config['sos_token'], config['max_length'] ,config['BERT'], config['generator'] )
		self.device = config['device']
	
	def forward(self, src, tgt):
		encoder_output, encoder_hidden = self.encoder(src)
		decoder_output = self.decoder(encoder_output, encoder_hidden, self.device, tgt)
		return decoder_output # [bs, seqlen, vocab_size]

In [29]:
generator = torch.Generator(device=device)
generator.manual_seed(42+222)

<torch._C.Generator at 0x785fba49ea70>

In [30]:
BERT = False

In [55]:
config = {
    'vocab_size': tokenizer.vocab_size,
    'input_size': 768 if BERT else 128 ,
    'hidden_size': 256,
	'BERT': BERT,
	'dropout': 0.1,
	'sos_token': tokenizer.convert_tokens_to_ids('[CLS]'),
	'max_length': 64-2,
	'device' : device,
    'generator': generator

}

In [56]:
model = Seq2Seq(config).to(device)


In [57]:
# model = torch.compile(model)

In [58]:
optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=0.0001)

In [59]:
critertion = nn.CrossEntropyLoss().to(device)

In [42]:
MixedPrecision = torch.bfloat16 if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else torch.float16
print(MixedPrecision, "\n", device)

torch.float16 
 cuda


In [43]:
torch.cuda.is_bf16_supported()

False

In [62]:
def train (model, data, optimizer, critertion, device, epochs=1):
	model.train()
	start = time.time()
	running_loss = 0
	for j in range(epochs):
		for i, batch in enumerate(data):
			src = batch['src'].to(device)
			tgt = batch['tgt'].to(device)
			optimizer.zero_grad()
			with torch.autocast(device_type=device, dtype=MixedPrecision):
				output, _ = model(src, tgt[:, 1:-1])
				output = output.reshape(-1, output.size(-1))
				loss = critertion(output, tgt[:, 1:].contiguous().view(-1))
			loss.backward()
			optimizer.step()
			torch.cuda.synchronize()
			running_loss += (loss.item())
			if (i+1) % 10 == 0:
				print(f'Epoch: {j}, step: {i}, Loss: {loss.item()/i}')
	end = time.time()
	print(f'Time: {end-start}, Loss: {running_loss/len(data)}')

In [63]:
train(model, train_loader, optimizer, critertion, 'cuda', epochs=10)

Epoch: 0, step: 9, Loss: 0.8977607091267904
Epoch: 0, step: 19, Loss: 0.27186406286139236
Epoch: 0, step: 29, Loss: 0.12406638572955954
Epoch: 0, step: 39, Loss: 0.09667082321949494
Epoch: 0, step: 49, Loss: 0.06533774064511669
Epoch: 0, step: 59, Loss: 0.053951489723334876
Epoch: 1, step: 9, Loss: 0.3614708052741157
Epoch: 1, step: 19, Loss: 0.21360309500443309
Epoch: 1, step: 29, Loss: 0.11652185999113938
Epoch: 1, step: 39, Loss: 0.08890020541655712
Epoch: 1, step: 49, Loss: 0.0752775425813636
Epoch: 1, step: 59, Loss: 0.05640941555217161
Epoch: 2, step: 9, Loss: 0.39022061559889054
Epoch: 2, step: 19, Loss: 0.17647448338960348
Epoch: 2, step: 29, Loss: 0.11509426708879142
Epoch: 2, step: 39, Loss: 0.08288267331245618
Epoch: 2, step: 49, Loss: 0.057459072190888076
Epoch: 2, step: 59, Loss: 0.04983583143201925
Epoch: 3, step: 9, Loss: 0.33745402759975857
Epoch: 3, step: 19, Loss: 0.1468663215637207
Epoch: 3, step: 29, Loss: 0.11975921433547447
Epoch: 3, step: 39, Loss: 0.085497104204

In [64]:
import time
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def evaluation(model, data, criterion, device):
	model.eval()
	start = time.time()
	bleu_score = 0
	running_loss = 0
	total_samples = 0  # Keep track of total samples for averaging BLEU

	for i, batch in enumerate(data):
		src = batch['src'].to(device)
		tgt = batch['tgt'].to(device)
		with torch.no_grad():
			with torch.cuda.amp.autocast():  # Assuming you're using CUDA
				output, _ = model(src, tgt[:, 1:-1])
				output = output.reshape(-1, output.size(-1))
				loss = criterion(output, tgt[:, 1:].contiguous().view(-1))
			output = output.argmax(dim=-1)
			output = output.view(src.size(0), -1)
			# Calculate BLEU for each sentence and accumulate
			for ref, pred in zip(tgt[:, 1:], output):
				bleu_score += sentence_bleu([ref.cpu().numpy().tolist()], pred.cpu().numpy().tolist(), smoothing_function=SmoothingFunction().method4)
			running_loss += loss.item()
			total_samples += src.size(0)

	end = time.time()
	avg_bleu_score = bleu_score / total_samples  # Average BLEU over all samples
	print(f'Time: {end - start}, Loss: {running_loss / len(data)}, BLEU: {avg_bleu_score}')

In [65]:
evaluation(model, valid_loader, critertion, device)

Time: 1.6103034019470215, Loss: 2.4693203270435333, BLEU: 0.6203853913719672


In [66]:
def generate(model, sentence, tokenizer, device):
	model.eval()
	sentence = tokenizer(sentence, return_tensors='pt', padding='max_length', truncation=True, max_length=128)
	with torch.no_grad():
		_ , generated_token = model(sentence['input_ids'].to(device), tgt=None)
	return generated_token 

In [67]:
    xinchao = generate(model, "Hello", tokenizer, device)

In [68]:
tokenizer.decode(xinchao.squeeze(0))

'[CLS] Nhưng một là tôi, một một.n. [PAD] [PAD] [SEP]n [PAD] [PAD] [PAD]y, chúng tôi, tôi có một, ta là tôi là là có một có một chúng chúng tôi. [SEP]y chúng một chúng tôi.n và một là có tôi là có tôi là tôi là có một có thể'

In [69]:
tokenizer.decode(generate(model, "Even about seemingly personal and visceral things like who you &apos;re attracted to , you will start aping the beliefs of the people around you without even realizing that that &apos;s what you &apos;re doing .", tokenizer, device).squeeze(0))

'[CLS] Nhưng chúng chúng là chúng một, một chúng là tôi có thể, và chúng là có một có thể, một một có một một, chúng tôi, và tôi là chúng là là là chúng là có ta một, ta có thể một có là một, ta một là là là chúng chúng một, chúng'

In [70]:
tokenizer.decode(generate(model, "I love you", tokenizer, device).squeeze(0))

'[CLS] Chúng tôi có thể là chúng một chúng là là chúng chúng là chúng là là một, và chúng chúng chúng ta là một có thể, chúng là một, chúng chúng tôi có một một một là là tôi có tôi một một. [PAD]y, tôi có thể là tôi một. [PAD]n của bạn,'

In [71]:
tokenizer.decode(generate(model, "They had 348 different kinds of jam .", tokenizer, device).squeeze(0))

'[CLS] Nhưng một chúng là là là có một chúng tôi, tôi một.n. [PAD]y, một, ta có chúng chúng ta một một là có ta là là một, chúng ta có tôi một là chúng chúng là một chúng ta, và là tôi là chúng một chúng tôi là chúng tôi. ; [SEP]'

In [72]:
torch.save(model.state_dict(), '/kaggle/working/seq2seq3.pth')