In [None]:
import io
import json
import re
import unicodedata

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchtext
from google.colab import files
from torchtext.data.metrics import bleu_score
from tqdm.notebook import tqdm


In [None]:
# Download the training set.
!wget https://raw.githubusercontent.com/nitinpunjabi/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt

--2023-05-02 03:29:30--  https://raw.githubusercontent.com/nitinpunjabi/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5518306 (5.3M) [text/plain]
Saving to: ‘hun_eng_pairs_train.txt.4’


2023-05-02 03:29:32 (299 MB/s) - ‘hun_eng_pairs_train.txt.4’ saved [5518306/5518306]



In [None]:
# Read one "<hungarian><sep><english>" pair per line.
# The encoding is stated explicitly because the Hungarian side contains
# accented characters that a non-UTF-8 platform default would mis-decode.
with open('hun_eng_pairs_train.txt', encoding='utf-8') as file:
  train = [line.rstrip() for line in file]

In [None]:
train[:3],len(train)

(["Teszek rá, mit mondasz!<sep>I don't care what you say.",
  'Több olyan ember kell nekünk a csapatba, mint amilyen te vagy.<sep>We need more people like you on our team.',
  'Vigyázz a gyerekeimre!<sep>Take care of my children.'],
 88647)

In [None]:
# Separate the input (Hungarian) and target (English) sentences into separate lists.
SEPARATOR = '<sep>'
train_input, train_target = map(list, zip(*[pair.split(SEPARATOR) for pair in train]))

In [None]:
print(train_input[:3])
print(train_target[:3])

['Teszek rá, mit mondasz!', 'Több olyan ember kell nekünk a csapatba, mint amilyen te vagy.', 'Vigyázz a gyerekeimre!']
["I don't care what you say.", 'We need more people like you on our team.', 'Take care of my children.']


In [None]:
print("\u00E1", "\u0061\u0301")

á á


In [None]:
import unicodedata
import re
# Unicode normalization
def normalize_unicode(s):
    """Return `s` with its combining marks removed.

    The string is first decomposed with NFD so that an accented letter becomes
    a base letter followed by combining marks; every character whose Unicode
    category is 'Mn' (non-spacing mark) is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

In [None]:
def preprocess_sentence(s):
  """Clean one raw sentence for tokenisation.

  Steps, in order:
    1. strip accents via `normalize_unicode`;
    2. surround each of the characters ? . ! , ¿ with spaces;
    3. collapse every run of spaces and double-quote characters into a single
       space (the character class contains both `"` and a space);
    4. trim leading and trailing whitespace.
  """
  without_accents = normalize_unicode(s)
  spaced = re.sub(r"([?.!,¿])", r" \1 ", without_accents)
  collapsed = re.sub(r'[" "]+', " ", spaced)
  return collapsed.strip()

In [None]:
# Preprocess both the source and target sentences.
train_preprocessed_input = [preprocess_sentence(s) for s in train_input]
train_preprocessed_target = [preprocess_sentence(s) for s in train_target]

In [None]:
train_preprocessed_input[:3],train_preprocessed_target[:3],

(['Teszek ra , mit mondasz !',
  'Tobb olyan ember kell nekunk a csapatba , mint amilyen te vagy .',
  'Vigyazz a gyerekeimre !'],
 ["I don't care what you say .",
  'We need more people like you on our team .',
  'Take care of my children .'])

In [None]:
def tag_target_sentences(sentences):
  """Wrap every sentence as '<sos> sentence <eos>' and return the new list."""
  return [' '.join(('<sos>', sentence, '<eos>')) for sentence in sentences]

In [None]:
train_tagged_preprocessed_target = tag_target_sentences(train_preprocessed_target)
train_tagged_preprocessed_target[0]

"<sos> I don't care what you say . <eos>"

In [None]:
np.array(train_preprocessed_input).shape

(88647,)

In [None]:
# Characters deleted before splitting. '<' and '>' are deliberately absent so
# that the '<sos>', '<eos>' and '<pad>' markers survive tokenisation.
_TOKENISE_STRIP_CHARS = (
    ',.?!;:'             # sentence punctuation
    '"\'“”`'             # quotes
    '/*-_&#$%()+=@^~'    # symbols
    '[\\]{|}'            # brackets, backslash and pipe
    '\n\t\ufeff'         # newline, tab and byte-order mark
)
_TOKENISE_STRIP_TABLE = str.maketrans('', '', _TOKENISE_STRIP_CHARS)


def tokenise(line):
    """Split `line` into lower-cased tokens with punctuation removed.

    Every character listed in `_TOKENISE_STRIP_CHARS` is deleted (not replaced
    by a space), the result is lower-cased and then split on whitespace.

    Args:
        line: the sentence to tokenise (str).

    Returns:
        A list of token strings; empty when nothing is left after stripping.
    """
    return line.translate(_TOKENISE_STRIP_TABLE).lower().split()

In [None]:
inp=np.array
a=tokenise(train_preprocessed_target[1])
a=np.array([a])
a.reshape(9,1)
a[0]

array(['we', 'need', 'more', 'people', 'like', 'you', 'on', 'our', 'team'],
      dtype='<U6')

In [None]:
np.array([1,2,3]).shape

(3,)

In [None]:
sent=np.array([tokenise(train_preprocessed_input[0])])
sent=np.append(sent,'<pad>')
len(sent[0])

6

In [None]:
input=np.array([tokenise(train_preprocessed_input[0])])
pad=np.full((50-len(input[0])),'<pad>')
sent=np.concatenate((input.squeeze(0),pad),axis=0)

In [None]:
train_input=train_preprocessed_input[:1000]
train_target=train_tagged_preprocessed_target[:1000]
test_input=train_preprocessed_input[1000:1200]
test_target=train_tagged_preprocessed_target[1000:1200]

In [None]:
# Collect the distinct tokens of the source-side training sentences.
# set.update takes the token list directly, so the vocabulary holds plain
# `str` objects rather than NumPy string scalars.
de_vocab=set()
for line in train_input:
  de_vocab.update(tokenise(line))

In [None]:
# Collect the distinct tokens of the target-side training sentences
# (these include the '<sos>' / '<eos>' markers added by tagging).
# set.update takes the token list directly, so the vocabulary holds plain
# `str` objects rather than NumPy string scalars.
en_vocab=set()
for line in train_target:
  en_vocab.update(tokenise(line))

In [None]:
!pip install ordered-set
from ordered_set import OrderedSet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Reserve index 0 for '<pad>' and sort the remaining tokens so that the
# word <-> index mapping is the same on every kernel restart (the iteration
# order of a plain set of strings changes with hash randomisation).
# OrderedSet keeps the first occurrence, so re-running this cell is harmless.
vg=['<pad>']+sorted(de_vocab)
de_vocab=OrderedSet(vg)

ve=['<pad>']+sorted(en_vocab)
en_vocab=OrderedSet(ve)

In [None]:
de_word2ix = {word: i for i, word in enumerate(de_vocab)}
de_ix2word = {i: word for i, word in enumerate(de_vocab)}
en_word2ix = {word: i for i, word in enumerate(en_vocab)}
en_ix2word = {i: word for i, word in enumerate(en_vocab)}

In [None]:
def tensor(list):
  """Convert a (nested) Python sequence to a torch tensor by way of NumPy.

  The parameter is called `list` and therefore hides the builtin of the same
  name inside this function.
  """
  as_array = np.array(list)
  return torch.from_numpy(as_array)

In [None]:
def de_generate_index(line):
  """Tokenise `line` and map each token to its id in `de_word2ix`.

  Returns a 1-D tensor built with `tensor`. A token that is not a key of
  `de_word2ix` raises KeyError.
  """
  return tensor([de_word2ix[token] for token in tokenise(line)])

In [None]:
def en_generate_index(line):
  """Tokenise `line` and map each token to its id in `en_word2ix`.

  Returns a 1-D tensor built with `tensor`. A token that is not a key of
  `en_word2ix` raises KeyError.
  """
  return tensor([en_word2ix[token] for token in tokenise(line)])

In [None]:
len(train_input),len(train_target)

(1000, 1000)

In [None]:
input=train_input
target=train_target

In [None]:
de_generate_index(input[0]),en_generate_index(target[0])

(tensor([1032,  191, 1632,  200]),
 tensor([ 582,  145, 1131,  654, 1252,  994,  184,  779]))

In [None]:
def longest_seq(corpus):
  """Return the token count of the longest sentence in `corpus`.

  Args:
    corpus: iterable of sentence strings; each is measured after `tokenise`.

  Returns:
    The largest number of tokens found, or 0 for an empty corpus.
  """
  return max((len(tokenise(sent)) for sent in corpus), default=0)
lg=max(longest_seq(input),longest_seq(target))
lg

22

In [None]:
def pad(line,longest_seq_length):
  """Right-pad a 1-D index tensor with zeros up to `longest_seq_length`.

  Args:
    line: 1-D tensor of token ids.
    longest_seq_length: length of the returned tensor.

  Returns:
    A tensor of length `longest_seq_length` with the same dtype and device as
    `line`; the appended entries are 0.

  Raises:
    ValueError: if `line` is already longer than `longest_seq_length`.
  """
  missing = longest_seq_length - len(line)
  if missing < 0:
    raise ValueError(
        f"sequence of length {len(line)} does not fit into {longest_seq_length} positions")
  # Build the filler with the input's dtype so integer ids are not promoted
  # to floating point by the concatenation.
  filler = torch.zeros(missing, dtype=line.dtype, device=line.device)
  return torch.cat((line, filler))

In [None]:
a=list(tokenise(target[0]))
a.append('p')
a

['<sos>', 'i', 'dont', 'care', 'what', 'you', 'say', '<eos>', 'p']

In [None]:
def pad_sent(line):
  """Tokenise `line` and append '<pad>' tokens until it has `lg` tokens.

  A sentence that already has `lg` or more tokens is returned unpadded.
  """
  tokens = list(tokenise(line))
  tokens.extend(['<pad>'] * (lg - len(tokens)))
  return tokens

pad_sent(target[0])

['<sos>',
 'i',
 'dont',
 'care',
 'what',
 'you',
 'say',
 '<eos>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [None]:
def _index_batch(batch, batch_size, generate_index):
  """Turn sentences into a padded (lg, batch_size) LongTensor of token ids.

  Args:
    batch: a sequence of sentence strings, or a single sentence string.
    batch_size: how many sentences to take from the front of `batch`.
    generate_index: function mapping one sentence to a 1-D tensor of ids.
  """
  # A lone sentence may be handed over as a bare string; wrap it so both
  # calling conventions go through the same code path.
  sentences = [batch] if isinstance(batch, str) else batch
  rows = [pad(generate_index(sentences[i]), lg) for i in range(batch_size)]
  # stack -> (batch_size, lg); permute -> (lg, batch_size), i.e. sequence first.
  return torch.stack(rows).permute(1,0).type(torch.LongTensor)

def de_index(batch,batch_size):
  """Index and pad source sentences; returns a (lg, batch_size) LongTensor.

  `batch` is a sequence of sentences, or a single sentence string when
  `batch_size` is 1. Ids come from `de_generate_index`.
  """
  return _index_batch(batch, batch_size, de_generate_index)

def en_index(batch,batch_size):
  """Index and pad target sentences; returns a (lg, batch_size) LongTensor.

  `batch` is a sequence of sentences, or a single sentence string when
  `batch_size` is 1. Ids come from `en_generate_index`.
  """
  return _index_batch(batch, batch_size, en_generate_index)

In [None]:
de_index(input[:5],5).shape,de_index(input[0],1).shape

(torch.Size([22, 5]), torch.Size([22, 1]))

In [None]:
def initialize_embeddings(de_vocab_size,en_vocab_size ,embedding_dim):
    """
    Create one embedding table per language.

    Args:
    de_vocab_size: number of rows of the source embedding table
    en_vocab_size: number of rows of the target embedding table
    embedding_dim: width of each embedding vector (shared by both tables)

    Returns:
    source_embed: nn.Embedding of shape (de_vocab_size, embedding_dim)
    target_embed: nn.Embedding of shape (en_vocab_size, embedding_dim)
    """
    vocab_sizes = (de_vocab_size, en_vocab_size)
    # The generator is consumed in order, so the source table is built first.
    source_embed, target_embed = (
        nn.Embedding(size, embedding_dim) for size in vocab_sizes
    )
    return source_embed, target_embed

In [None]:
source_embed, target_embed = initialize_embeddings(de_vocab_size=len(de_vocab),en_vocab_size=len(en_vocab), embedding_dim=10)

In [None]:
target_embed(en_index(target[:5],5))

tensor([[[-0.6389,  0.8574,  0.0611,  ...,  0.3284,  1.0514, -0.1113],
         [-0.6389,  0.8574,  0.0611,  ...,  0.3284,  1.0514, -0.1113],
         [-0.6389,  0.8574,  0.0611,  ...,  0.3284,  1.0514, -0.1113],
         [-0.6389,  0.8574,  0.0611,  ...,  0.3284,  1.0514, -0.1113],
         [-0.6389,  0.8574,  0.0611,  ...,  0.3284,  1.0514, -0.1113]],

        [[-0.9306, -0.2380, -0.7814,  ...,  1.4645, -0.8832, -0.1308],
         [ 1.1216,  0.1193,  0.3441,  ..., -1.6527, -0.2511,  0.8579],
         [ 0.4574,  1.1295,  0.1139,  ...,  0.3077, -0.4936,  0.4929],
         [ 0.7366, -0.7042,  0.7799,  ..., -1.7981, -0.0621, -0.9072],
         [ 0.5469,  0.0551,  0.4180,  ..., -0.0647, -1.4863,  0.3180]],

        [[ 0.3007,  0.7624,  0.6758,  ...,  0.3128, -0.5408, -3.0785],
         [-0.4240, -0.4789,  0.9006,  ..., -0.9406,  0.0590,  0.0713],
         [-0.5066, -0.3683,  0.1227,  ...,  1.8732, -1.1272, -1.2324],
         [ 0.5487, -1.1625,  0.1413,  ...,  0.2714,  0.4950, -0.0463],
  

In [None]:
class Encoder(nn.Module):
    """Bidirectional LSTM encoder over embedded source sentences.

    Args:
        input_size: width of the embedding vectors fed to the LSTM.
        hidden_size: LSTM hidden size per direction.
        num_layers: number of stacked LSTM layers.
        encoder_dropout: dropout probability applied to the embeddings.

    NOTE(review): `forward` merges only `hidden[0:1]` and `hidden[1:2]`, i.e.
    the two directions of the first layer, so the returned state always has a
    leading dimension of 1 whatever `num_layers` is.
    """
    def __init__(self, input_size, hidden_size,num_layers,encoder_dropout):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers=num_layers
        # Keep the notebook-level source embedding as a sub-module. Without
        # this its weights are missing from `parameters()` and the optimizer
        # never updates them.
        self.embedding = source_embed
        self.lstm = nn.LSTM(input_size, hidden_size,num_layers, bidirectional=True)
        self.dropout=nn.Dropout(encoder_dropout)
        # Project the concatenated forward/backward states back to hidden_size.
        self.lin_hid=nn.Linear(hidden_size*2,hidden_size)
        self.lin_cell=nn.Linear(hidden_size*2,hidden_size)

    def forward(self, input_seq,batch_size):
        """Encode raw sentences.

        Args:
            input_seq: sentence(s) in the form accepted by `de_index`.
            batch_size: number of sentences in `input_seq`.

        Returns:
            output: (seq_len, batch_size, 2*hidden_size) per-step LSTM outputs.
            hidden: (1, batch_size, hidden_size) merged final hidden state.
            cell:   (1, batch_size, hidden_size) merged final cell state.
        """
        token_ids = de_index(input_seq,batch_size)  # (seq_len, batch_size)
        embedded = self.dropout(self.embedding(token_ids))  # (seq_len, batch_size, input_size)

        output, (hidden, cell) = self.lstm(embedded)
        # Concatenate forward and backward final states along the feature axis.
        hidden = self.lin_hid(torch.cat((hidden[0:1],hidden[1:2]), dim=2))
        cell = self.lin_cell(torch.cat((cell[0:1],cell[1:2]), dim=2))
        return output, hidden, cell

In [None]:
# Create an instance of the Encoder model with the pre-trained embedding matrix
encoder = Encoder(input_size=10, hidden_size=128,num_layers=1,encoder_dropout=0.5)
input_seq = input[:5] # example input sequence of shape (batch_size, sequence_length)
encoder_output, encoder_hidden, encoder_cell = encoder(input_seq,5)
encoder_output.shape,encoder_hidden.shape,encoder_cell.shape

(torch.Size([22, 5, 256]), torch.Size([1, 5, 128]), torch.Size([1, 5, 128]))

In [None]:
class Decoder(nn.Module):
    """Single-step LSTM decoder with additive attention over encoder outputs.

    Args:
        input_size: width of the target embedding vectors.
        hidden_size: LSTM hidden size (encoder outputs are 2*hidden_size wide).
        output_size: number of target-vocabulary classes to score.
        num_layers: number of stacked LSTM layers.
        decoder_dropout: dropout probability applied to the embeddings.
    """
    def __init__(self,input_size,hidden_size, output_size, num_layers,decoder_dropout):
        super(Decoder, self).__init__()

        self.num_layers=num_layers
        # Keep the notebook-level target embedding as a sub-module. Without
        # this its weights are missing from `parameters()` and the optimizer
        # never updates them.
        self.embedding = target_embed
        # LSTM input = attention context (2*hidden_size) + embedded token.
        self.lstm = nn.LSTM(hidden_size*2+input_size, hidden_size, num_layers, batch_first=False)

        self.output_layer = nn.Linear(hidden_size, output_size)
        self.tanh=nn.Tanh()
        self.softmax=nn.Softmax(dim=0)  # normalise over the sequence axis
        self.energy=nn.Linear(hidden_size*3,1)  # input: state (hidden_size) + encoder output (2*hidden_size)
        self.dropout=nn.Dropout(decoder_dropout)

    def forward(self, input_seq, encoder_output,encoder_hidden,encoder_cell):
        """Run one decoding step.

        Args:
            input_seq: (batch_size,) ids of the previous target tokens.
            encoder_output: (seq_len, batch_size, 2*hidden_size).
            encoder_hidden: (1, batch_size, hidden_size) state used both for
                the attention scores and as the LSTM's initial hidden state.
            encoder_cell: (1, batch_size, hidden_size) initial LSTM cell state.

        Returns:
            pred: (1, batch_size, output_size) unnormalised class scores.
            hidden, cell: the LSTM state after this step.
        """
        embedded = self.dropout(self.embedding(input_seq.unsqueeze(0)))  # (1, batch_size, input_size)
        # Take the attention length from the encoder output itself instead of
        # a notebook-level constant.
        seq_len = encoder_output.shape[0]
        attn_hidden = encoder_hidden.repeat(seq_len,1,1)  # (seq_len, batch_size, hidden_size)
        energy = self.tanh(self.energy(torch.cat((attn_hidden, encoder_output),dim=2)))
        attention= self.softmax(energy)  # (seq_len, batch_size, 1)
        attention=attention.permute(1,2,0)  # (batch_size, 1, seq_len)
        # (batch_size, 1, seq_len) x (batch_size, seq_len, 2*hidden_size)
        #   -> (batch_size, 1, 2*hidden_size) -> (1, batch_size, 2*hidden_size)
        context=torch.bmm(attention,encoder_output.permute(1,0,2)).permute(1,0,2)
        new_input=torch.cat((context,embedded),dim=2)

        output, (hidden, cell) = self.lstm(new_input, (encoder_hidden, encoder_cell))

        # Project the LSTM output to vocabulary scores. The leading length-1
        # step axis is kept because callers index the result as pred[0][...].
        pred = self.output_layer(output)
        return pred,hidden,cell

In [None]:
decoder = Decoder(input_size=10, hidden_size=128, output_size=len(en_vocab), num_layers=1,decoder_dropout=0.5)
input_seq = en_index(target[:5],5)[0] # replace with your input
 # use the final hidden state from the encoder
decoder_output, decoder_hidden,decoder_cell = decoder(input_seq,encoder_output, encoder_hidden,encoder_cell)
decoder_output.shape,decoder_hidden.shape,decoder_cell.shape

(torch.Size([1, 5, 1291]), torch.Size([1, 5, 128]), torch.Size([1, 5, 128]))

In [None]:
encoder = Encoder(input_size=10, hidden_size=128,num_layers=1,encoder_dropout=0.5)
input_seq = input[5] # example input sequence of shape (batch_size, sequence_length)
encoder_output, encoder_hidden, encoder_cell = encoder(input_seq,1)
encoder_output.shape,encoder_hidden.shape,encoder_cell.shape
decoder = Decoder(input_size=10, hidden_size=128, output_size=len(en_vocab), num_layers=1,decoder_dropout=0.5)
trg_seq = en_index(target[5],1)[0] # replace with your input
 # use the final hidden state from the encoder
decoder_output, decoder_hidden,decoder_cell = decoder(trg_seq,encoder_output, encoder_hidden,encoder_cell)
decoder_output.shape,decoder_hidden.shape,decoder_cell.shape

(torch.Size([1, 1, 1291]), torch.Size([1, 1, 128]), torch.Size([1, 1, 128]))

In [None]:
en_ix2word[torch.IntTensor.item(torch.argmax(decoder_output))]

'delicious'

In [None]:
trg_seq.shape

torch.Size([1])

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes with teacher forcing.

    Args:
        encoder: module called as `encoder(input_seq, batch_size)` returning
            `(encoder_output, hidden, cell)`.
        decoder: module called as `decoder(token_ids, encoder_output, hidden,
            cell)` returning `(scores, hidden, cell)`; it must expose an
            `output_layer` whose `out_features` is the target vocabulary size.
    """
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq,batch_size):
        """Score every target position given the previous gold token.

        Args:
            input_seq: source sentence(s), passed through to the encoder.
            target_seq: (target_len, batch_size) tensor of gold target ids.
            batch_size: number of sentences in the batch.

        Returns:
            (target_len, batch_size, vocab_size) tensor of scores. Row 0 is
            left at zero because decoding starts from `target_seq[0]`.
        """
        # Sizes are read from the arguments and the decoder rather than from
        # notebook-level globals.
        target_len = target_seq.shape[0]
        target_vocab_size = self.decoder.output_layer.out_features
        outputs = torch.zeros(target_len, batch_size, target_vocab_size)
        encoder_output, hidden, cell = self.encoder(input_seq,batch_size)
        decoder_input = target_seq[0]
        for t in range(1, target_len):
            # Feed the state produced by the previous step back in, so that
            # each step continues the sequence instead of restarting from the
            # encoder state.
            output, hidden, cell = self.decoder(decoder_input,encoder_output, hidden, cell)
            outputs[t] = output.squeeze(0)  # drop the length-1 step axis
            decoder_input = target_seq[t]   # teacher forcing: next input is the gold token
        return outputs

In [None]:
encoder = Encoder(input_size=10, hidden_size=128,num_layers=1,encoder_dropout=0.5)
decoder = Decoder(input_size=10, hidden_size=128, output_size=len(en_vocab), num_layers=1,decoder_dropout=0.5)
model=Seq2Seq(encoder,decoder)

out=model(input[:5],en_index(target[:5],5),batch_size=5)
out.shape

torch.Size([22, 5, 1291])

In [None]:
torch.argmax(out[1][0])

tensor(804)

In [None]:
num_epochs= 3
learning_rate = 0.001
batch_size = 5

In [None]:
encoder = Encoder(input_size=10, hidden_size=1024,num_layers=1,encoder_dropout=0.5)
decoder = Decoder(input_size=10, hidden_size=1024, output_size=len(en_vocab), num_layers=1,decoder_dropout=0.5)
model=Seq2Seq(encoder,decoder)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
loss_fn = nn.CrossEntropyLoss()

In [None]:
def create_target_tensor(target_sentence,batch_size):
    """Build a (batch_size, len(en_vocab)) multi-hot matrix of dtype long.

    Row n holds a 1 in every column whose token id occurs in
    `en_index(target_sentence[n], 1)`, and 0 elsewhere. Token order and
    repetition are not represented.

    NOTE(review): this is a bag-of-words encoding, not a per-position
    class-index target — confirm that is what the loss it is passed to expects.
    """
    multi_hot = torch.zeros(batch_size,len(en_vocab))
    for row in range(batch_size):
        token_ids = en_index(target_sentence[row],1)  # (lg, 1) padded ids
        for token_id in token_ids.flatten().tolist():
            multi_hot[row][token_id] = 1
    return multi_hot.type(torch.long)

In [None]:
create_target_tensor(target[:5],5).shape

torch.Size([5, 1291])

In [None]:
# nn.CrossEntropyLoss wants logits of shape (N, num_classes) and class indices
# of shape (N,). Flatten the (seq_len, batch) axes of both tensors so that the
# vocabulary is the class axis. Row 0 of `out` is never written by the model,
# so the first position is left out on both sides.
loss_fn(out[1:].reshape(-1, out.shape[-1]), en_index(target[:5],5)[1:].reshape(-1))

tensor(3.0907, grad_fn=<NllLoss2DBackward0>)

In [None]:
# Driving training loop
BATCH=30
PAD_IX=en_word2ix['<pad>']  # padding positions are excluded from the loss
model.train()
for epoch in range(1,num_epochs+1):
    total_epoch_loss=0
    # Iterate over full batches only; a trailing partial batch is skipped
    # because the indexing helpers are called with a fixed batch size.
    for i in tqdm(range(0,len(input)-BATCH+1,BATCH), desc=f"epoch {epoch}", leave=False):
        trg=en_index(target[i:i+BATCH],BATCH)  # (seq_len, BATCH) gold token ids

        optimizer.zero_grad()

        # 1. forward pass the inputs through the model -> (seq_len, BATCH, vocab)
        output=model(input[i:i+BATCH],trg,batch_size=BATCH)

        # 2. token-level cross-entropy: flatten (seq_len, BATCH) into one axis
        #    so the vocabulary is the class axis. Position 0 is dropped because
        #    the model never writes a prediction there.
        loss = F.cross_entropy(output[1:].reshape(-1,output.shape[-1]),
                               trg[1:].reshape(-1),
                               ignore_index=PAD_IX)
        total_epoch_loss += loss.item()

        # 3. backpropagate and clip the gradient norm
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        # 4. update the parameters
        optimizer.step()

    print(f"Epoch: [{epoch}/{num_epochs}] Epoch Loss: {total_epoch_loss}")

0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
Epoch: [0/3] Epoch Loss: 1.4875435717403889
0
30
60
90
120
150
180
210
240
270
300
330
360
390
420
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
Epoch: [1/3] Epoch Loss: 1.431684773415327
0
30
60
90
120
150
180
210
240
270
300
330
360
390
450
480
510
540
570
600
630
660
690
720
750
780
810
840
870
900
930
960
Epoch: [2/3] Epoch Loss: 1.4039060473442078


In [None]:
encoder_output,encoder_hidden, encoder_cell = model.encoder(input[998],1)
# Start decoding from the '<sos>' token. Its id is looked up in the vocabulary
# because a literal id is not guaranteed to exist in (or mean the same thing
# in) the vocabulary built by this run.
sos_token = torch.tensor([en_word2ix['<sos>']])
output, hidden, cell = model.decoder(sos_token,encoder_output,encoder_hidden,encoder_cell)

In [None]:
en_ix2word[torch.IntTensor.item(torch.argmax(output[0][0]))]

In [None]:
en_word2ix['<sos>']

In [None]:
test_input[0]

'A szerzodes ervenytelen , ha kenyszeritettek , hogy alairja .'

In [None]:
def translate_sentence(model, sentence,max_length=lg):
    """Greedily decode a translation of one source sentence.

    Args:
        model: object exposing `encoder` and `decoder` sub-modules with the
            call signatures used below.
        sentence: one source sentence string.
        max_length: maximum number of tokens to generate.

    Returns:
        The generated tokens as a list of strings, without the leading
        '<sos>'; the list ends with '<eos>' if the decoder produced it before
        `max_length` was reached.
    """
    sos_ix = en_word2ix['<sos>']
    eos_ix = en_word2ix['<eos>']

    # Switch to eval mode so dropout does not randomise the translation, and
    # put the model back into its previous mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            encoder_output, hidden, cell = model.encoder(sentence,1)
            out=[sos_ix]
            for _ in range(max_length):
                previous_word = torch.tensor([out[-1]])
                # Carry the decoder state from step to step; re-feeding the
                # encoder state would make every step ignore what was
                # generated before it.
                output, hidden, cell = model.decoder(previous_word,encoder_output,hidden,cell)
                best_guess = output[0][0].argmax().item()
                out.append(best_guess)

                if best_guess == eos_ix:
                    break
    finally:
        model.train(was_training)

    return [en_ix2word[idx] for idx in out[1:]]

In [None]:
tokenise(input[0])[0]

'teszek'

In [None]:
sent=input[0]
translate_sentence(model,sent)

['<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>',
 '<sos>']

In [None]:
a

In [None]:
for i in range(len(a)):
  a[i]='<pad>'
a,pad_sent(target[0])

In [None]:
# bleu_score works on corpora: the first argument is a list of candidate token
# lists and the second holds, for each candidate, a list of reference token
# lists. A single candidate/reference pair therefore needs the extra nesting.
bleu_score([a], [[pad_sent(target[989])]])

In [None]:
from torchtext.data.metrics import bleu_score

In [None]:
def bleu(data, model, german, english, device):
    """Compute the corpus BLEU score of `model` over `data`.

    Args:
        data: iterable of examples whose attributes include `src` and `trg`.
            # assumes `src` is a sentence accepted by `translate_sentence` and
            # `trg` is a list of reference tokens — TODO confirm
        model: model handed to `translate_sentence`.
        german, english, device: unused; kept so existing calls keep working.

    Returns:
        The value returned by `bleu_score(outputs, targets)`.
    """
    targets= []
    outputs= []

    for eg in data:
        fields = vars(eg)
        src = fields['src']
        trg = fields['trg']

        # translate_sentence takes only the model and the sentence (plus an
        # optional max_length).
        prediction = translate_sentence(model, src)
        # Drop the end marker only when it is actually there; a translation
        # cut off at max_length has no '<eos>' to remove.
        if prediction and prediction[-1] == '<eos>':
            prediction = prediction[:-1]

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

In [None]:
'''
trg=trg.type(torch.LongTensor)
out=out.type(torch.LongTensor)
scores = torch.zeros((31,5,10612))  # shape: (batch_size=10, num_classes=5)
labels = torch.ones((31,5,10612),dtype=torch.long)  # shape: (batch_size=10)

# Define the cross-entropy loss function

# Compute the loss
loss_fn(scores, labels)
'''