In [61]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import torch.nn.functional as F
import unicodedata
import io
import json
import re
from google.colab import files


In [62]:
# Download the training set.
!wget https://raw.githubusercontent.com/nitinpunjabi/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt

--2023-05-02 17:49:02--  https://raw.githubusercontent.com/nitinpunjabi/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5518306 (5.3M) [text/plain]
Saving to: ‘hun_eng_pairs_train.txt.1’


2023-05-02 17:49:03 (68.3 MB/s) - ‘hun_eng_pairs_train.txt.1’ saved [5518306/5518306]



In [63]:
with open('hun_eng_pairs_train.txt') as file:
  train = [line.rstrip() for line in file]

In [64]:
train[:3],len(train)

(["Teszek rá, mit mondasz!<sep>I don't care what you say.",
  'Több olyan ember kell nekünk a csapatba, mint amilyen te vagy.<sep>We need more people like you on our team.',
  'Vigyázz a gyerekeimre!<sep>Take care of my children.'],
 88647)

In [65]:
# Separate the input (Hungarian) and target (English) sentences into separate lists.
SEPARATOR = '<sep>'
train_input, train_target = map(list, zip(*[pair.split(SEPARATOR) for pair in train]))

In [66]:
print(train_input[:3])
print(train_target[:3])

['Teszek rá, mit mondasz!', 'Több olyan ember kell nekünk a csapatba, mint amilyen te vagy.', 'Vigyázz a gyerekeimre!']
["I don't care what you say.", 'We need more people like you on our team.', 'Take care of my children.']


In [67]:
print("\u00E1", "\u0061\u0301")

á á


In [68]:
import unicodedata
import re
# Unicode normalization
def normalize_unicode(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

In [69]:
def preprocess_sentence(s):
  s = normalize_unicode(s)
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = s.strip()
  return s

In [70]:
# Preprocess both the source and target sentences.
train_preprocessed_input = [preprocess_sentence(s) for s in train_input]
train_preprocessed_target = [preprocess_sentence(s) for s in train_target]

In [71]:
train_preprocessed_input[:3],train_preprocessed_target[:3],

(['Teszek ra , mit mondasz !',
  'Tobb olyan ember kell nekunk a csapatba , mint amilyen te vagy .',
  'Vigyazz a gyerekeimre !'],
 ["I don't care what you say .",
  'We need more people like you on our team .',
  'Take care of my children .'])

In [72]:
def tag_target_sentences(sentences):
  tagged_sentences = map(lambda s: (' ').join(['<sos>', s, '<eos>']), sentences)
  return list(tagged_sentences)

In [73]:
train_tagged_preprocessed_target = tag_target_sentences(train_preprocessed_target)
train_tagged_preprocessed_target[0]

"<sos> I don't care what you say . <eos>"

In [74]:
np.array(train_preprocessed_input).shape

(88647,)

In [75]:
def tokenise(line):
	ans=[]
	punc = [',', '.', '"', "'", '/', '*', ',', '?', '!', '-', '\n', '“', '”', '_', '&', '\ufeff', '&', ';', ":",'#','$','%','&','(',')','*','+','-','/',':',';','=','@',',','[\\]','^','_','`{|}~','\t']
	for el in line:
		if el in punc:
			line = line.replace(el, "")
	
	line=line.lower()
	return line.split()
	raise NotImplementedError

In [76]:
inp=np.array
a=tokenise(train_preprocessed_target[1])
a=np.array([a])
a.reshape(9,1)
a[0]

array(['we', 'need', 'more', 'people', 'like', 'you', 'on', 'our', 'team'],
      dtype='<U6')

In [77]:
np.array([1,2,3]).shape

(3,)

In [78]:
sent=np.array([tokenise(train_preprocessed_input[0])])
sent=np.append(sent,'<pad>')
len(sent[0])

6

In [79]:
input=np.array([tokenise(train_preprocessed_input[0])])
pad=np.full((50-len(input[0])),'<pad>')
sent=np.concatenate((input.squeeze(0),pad),axis=0)

In [80]:
train_input=train_preprocessed_input[:1000]
train_target=train_tagged_preprocessed_target[:1000]
test_input=train_preprocessed_input[1000:1200]
test_target=train_tagged_preprocessed_target[1000:1200]

In [81]:
de_vocab=set()
for line in train_input:
  sent=np.array([tokenise(line)])
  for x in sent[0]:
    de_vocab.add(x)

In [82]:
en_vocab=set()
for line in train_target:
  sent=np.array([tokenise(line)])
  for x in sent[0]:
    en_vocab.add(x)

In [83]:
!pip install ordered-set
from ordered_set import OrderedSet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [84]:
vg=list(de_vocab)
vg.insert(0,'<pad>')
de_vocab=OrderedSet(vg)

ve=list(en_vocab)
ve.insert(0,'<pad>')
en_vocab=OrderedSet(ve)

In [85]:
de_word2ix = {word: i for i, word in enumerate(de_vocab)}
de_ix2word = {i: word for i, word in enumerate(de_vocab)}
en_word2ix = {word: i for i, word in enumerate(en_vocab)}
en_ix2word = {i: word for i, word in enumerate(en_vocab)}

In [86]:
def tensor(list):
  return torch.from_numpy(np.array(list))

In [87]:
def de_generate_index(line):
  new_line=[]
  sent=tokenise(line)
  for x in sent:
   new_line.append(de_word2ix[x])
  return tensor(new_line)

In [88]:
def en_generate_index(line):
  new_line=[]
  sent=tokenise(line)
  for x in sent:
   new_line.append(en_word2ix[x])
  return tensor(new_line)

In [89]:
len(train_input),len(train_target)

(1000, 1000)

In [90]:
input=train_input
target=train_target

In [91]:
de_generate_index(input[0]),en_generate_index(target[0])

(tensor([1841, 1055,  951, 1527]),
 tensor([ 823,  148, 1086,  749,  458,   56, 1081,  269]))

In [92]:
def longest_seq(corpus):
  c=len(tokenise(corpus[0]))
  for sent in corpus:
    line=tokenise(sent)
    if c<len(line):c=len(line)
  return c
lg=max(longest_seq(input),longest_seq(target))
lg

22

In [93]:
def pad(line,longest_seq_length):
  pad=torch.zeros((longest_seq_length-len(line)))
  return torch.cat((line,pad))

In [94]:
a=list(tokenise(target[0]))
a.append('p')
a

['<sos>', 'i', 'dont', 'care', 'what', 'you', 'say', '<eos>', 'p']

In [95]:
def pad_sent(line):
  a=list(tokenise(line))
  for i in range(lg-len(a)):
    a.append('<pad>')
  return a

pad_sent(target[0])

['<sos>',
 'i',
 'dont',
 'care',
 'what',
 'you',
 'say',
 '<eos>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [96]:
def de_index(batch,batch_size):
  if(batch_size==1):return pad(de_generate_index(batch),lg).unsqueeze(1).type(torch.LongTensor)
  bs=pad(de_generate_index(batch[0]),lg)
  for i in range(1,batch_size):
    bs=torch.vstack((bs,pad(de_generate_index(batch[i]),lg)))
  bs=bs.permute(1,0)
  return bs.type(torch.LongTensor)

def en_index(batch,batch_size):
  if(batch_size==1):return pad(en_generate_index(batch),lg).unsqueeze(1).type(torch.LongTensor)
  bs=pad(en_generate_index(batch[0]),lg)
  for i in range(1,batch_size):
    bs=torch.vstack((bs,pad(en_generate_index(batch[i]),lg)))
  bs=bs.permute(1,0)
  return bs.type(torch.LongTensor)

In [97]:
de_index(input[:5],5).shape,de_index(input[0],1).shape

(torch.Size([22, 5]), torch.Size([22, 1]))

In [98]:
def initialize_embeddings(de_vocab_size,en_vocab_size ,embedding_dim):
    """
    Initialize source and target embeddings
    
    Args:
    vocab_size: size of the vocabulary
    embedding_dim: size of the embedding dimension
    
    Returns:
    source_embed: source embedding layer
    target_embed: target embedding layer
    """
    
    # initialize source embedding layer
    source_embed = nn.Embedding(de_vocab_size, embedding_dim)
    
    # initialize target embedding layer
    target_embed = nn.Embedding(en_vocab_size, embedding_dim)
    
    # return source and target embeddings
    return source_embed, target_embed

In [99]:
source_embed, target_embed = initialize_embeddings(de_vocab_size=len(de_vocab),en_vocab_size=len(en_vocab), embedding_dim=10)

In [100]:
target_embed(en_index(target[:5],5))

tensor([[[ 0.5236, -1.5899,  2.4777,  ..., -0.5857, -0.9114, -0.5862],
         [ 0.5236, -1.5899,  2.4777,  ..., -0.5857, -0.9114, -0.5862],
         [ 0.5236, -1.5899,  2.4777,  ..., -0.5857, -0.9114, -0.5862],
         [ 0.5236, -1.5899,  2.4777,  ..., -0.5857, -0.9114, -0.5862],
         [ 0.5236, -1.5899,  2.4777,  ..., -0.5857, -0.9114, -0.5862]],

        [[ 1.1399,  1.4239, -0.4426,  ...,  2.2362,  1.5027,  0.6351],
         [-0.6569,  0.6391,  0.5776,  ..., -0.4920, -0.5370, -0.7822],
         [-0.0512, -0.6869,  1.1995,  ...,  1.0416, -0.5122,  1.2388],
         [ 1.5079,  0.9764,  0.6924,  ...,  0.0760,  0.6000, -0.7063],
         [ 0.2427, -0.0870, -1.8900,  ..., -0.1658,  0.2836, -0.0509]],

        [[ 0.5204, -0.1418, -0.0350,  ...,  1.3545, -0.8827, -1.2806],
         [-0.1117, -0.7663, -1.3556,  ...,  0.7127, -1.6404, -0.0037],
         [ 0.3090, -0.9324, -0.4897,  ...,  1.7445,  0.6132, -0.5866],
         [ 0.2396, -0.8466,  1.5135,  ...,  1.8494,  0.3410,  1.5955],
  

In [101]:
input_seq = input[:5]
conv = nn.Conv1d(in_channels=10, out_channels=lg, kernel_size=3)
r=nn.ReLU()
pool = nn.MaxPool1d(3)
fc = nn.Linear(32, 5)
em=source_embed(de_index(input_seq,5)).permute(1,2,0)
p=pool(r(conv(em)))
out = F.pad(p, (4, 0, 0, 0, 0, 0))
p.shape,out.shape,em.shape

(torch.Size([5, 22, 6]), torch.Size([5, 22, 10]), torch.Size([5, 10, 22]))

In [102]:
em=torch.ones((22,5,10))
conv = nn.Conv1d(in_channels=10, out_channels=22, kernel_size=3)
pool=nn.MaxPool1d(3)
pol=pool(conv(em.permute(1,2,0)))
F.pad(pol, (4, 0, 0, 0, 0, 0)).shape

torch.Size([5, 22, 10])

In [103]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size,num_layers,encoder_dropout):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers=num_layers
        self.lstm = nn.LSTM(input_size, hidden_size,num_layers, bidirectional=True)
        self.dropout=nn.Dropout(encoder_dropout)
        self.lin_hid=nn.Linear(hidden_size*2,hidden_size)
        self.lin_cell=nn.Linear(hidden_size*2,hidden_size)
        self.conv = nn.Conv1d(in_channels=input_size, out_channels=lg, kernel_size=3)
        self.tanh=nn.Tanh()
        self.pool = nn.MaxPool1d(3)
        self.fc = nn.Linear(32, 5)
    def forward(self, input_seq,batch_size):
        #embedded=seq_len,batchsize,embeddingsize
        embedded = self.dropout(source_embed(de_index(input_seq,batch_size))).permute(1,2,0)#22,5,10-->5,10,22
        pool_out=self.pool(self.tanh(self.conv(embedded)))
        #conv(embedded)=5,22,20
        #pool=5,22,6
        embedded= F.pad(pool_out, (4, 0, 0, 0, 0, 0)).permute(1,0,2)#5,22,10-->22,5,10
        output, (hidden, cell) = self.lstm(embedded)
        hidden = self.lin_hid(torch.cat((hidden[0:1],hidden[1:2]), dim=2))
        cell = self.lin_cell(torch.cat((cell[0:1],cell[1:2]), dim=2))
        return output, hidden, embedded
        #output=seq_len,batchsize,2*hiddensize
        #hidden=2*numlayers,batchsizze,hiddensize=128
        #cell=2*numlayers,batchsizze,hiddensize=128

In [104]:
# Create an instance of the Encoder model with the pre-trained embedding matrix
encoder = Encoder(input_size=10, hidden_size=128,num_layers=1,encoder_dropout=0.5)
input_seq = input[:5] # example input sequence of shape (batch_size, sequence_length)
encoder_output, encoder_hidden, encoder_cell = encoder(input_seq,5)
encoder_output.shape,encoder_hidden.shape,encoder_cell.shape

(torch.Size([22, 5, 256]), torch.Size([1, 5, 128]), torch.Size([22, 5, 10]))

In [114]:
class Decoder(nn.Module):
    def __init__(self,input_size,hidden_size, output_size, num_layers,decoder_dropout):
        super(Decoder, self).__init__()

        self.num_layers=num_layers
        self.lstm = nn.LSTM(hidden_size*2+input_size, hidden_size, num_layers, batch_first=False)
        
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.tanh=nn.Tanh()
        self.softmax=nn.Softmax(dim=0)
        self.energy=nn.Linear(hidden_size*3,1)#hidden_size*2+hidden_size
        self.dropout=nn.Dropout(decoder_dropout)
    def forward(self, input_seq, encoder_output,encoder_hidden,encoder_cell):
        #input_seq:shape=batchsize
        #input_seq.unsqeeze(0):shape=1,batchsize
        embedded = self.dropout(target_embed(input_seq.unsqueeze(0)))#1,batchsize,embeddingsize=10
        attn_hidden = encoder_hidden.repeat(lg,1,1)
        energy = self.tanh(self.energy(torch.cat((attn_hidden, encoder_output),dim=2)))
        attention= self.softmax(energy)
        attention=attention.permute(1,2,0)
        #torch.bmm(attention,encoder_output.permute(1,0,2))=5,1,31*5,31,256=5,1,256
        context=torch.bmm(attention,encoder_output.permute(1,0,2)).permute(1,0,2)
        new_input=torch.cat((context,embedded),dim=2)

        output, (hidden, cell) = self.lstm(new_input, (encoder_hidden, encoder_cell))

        # Pass the LSTM output through the output layer to get the output
        pred = self.output_layer(output)
        pred.squeeze(0)
        return pred,hidden,cell
        #prde=output=1,en_vocabsize
        #hidden=num_layers,hiddensize
        #cell=num_layers,hiddensize

In [118]:
class Decoder(nn.Module):
    def __init__(self,input_size,hidden_size, output_size, num_layers,decoder_dropout):
        super(Decoder, self).__init__()

        self.num_layers=num_layers
        self.lstm = nn.LSTM(hidden_size*2+input_size, hidden_size, num_layers, batch_first=False)
        
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.relu=nn.ReLU()
        self.softmax=nn.Softmax(dim=0)
        self.energy=nn.Linear(hidden_size*3,1)#hidden_size*2+hidden_size
        self.dropout=nn.Dropout(decoder_dropout)
    def forward(self, input_seq, encoder_output,encoder_hidden,encoder_cell):
        #input_seq:shape=batchsize
        #input_seq.unsqeeze(0):shape=1,batchsize
        embedded = self.dropout(target_embed(input_seq.unsqueeze(0)))#1,batchsize,embeddingsize=10
        attn_hidden = encoder_hidden.repeat(lg,1,1)
        energy = self.relu(self.energy(torch.cat((attn_hidden, encoder_output),dim=2)))
        attention= self.softmax(energy)
        attention=attention.permute(1,2,0)
        #torch.bmm(attention,encoder_output.permute(1,0,2))=5,1,31*5,31,256=5,1,256
        context=torch.bmm(attention,encoder_output.permute(1,0,2)).permute(1,0,2)
        new_input=torch.cat((context,embedded),dim=2)

        output, (hidden, cell) = self.lstm(new_input, (encoder_hidden, encoder_cell))

        # Pass the LSTM output through the output layer to get the output
        pred = self.output_layer(output)
        pred.squeeze(0)
        return pred,hidden,cell
        #prde=output=1,en_vocabsize
        #hidden=num_layers,hiddensize
        #cell=num_layers,hiddensize

In [119]:
decoder = Decoder(input_size=10, hidden_size=128, output_size=len(en_vocab), num_layers=1,decoder_dropout=0.5)
input_seq = en_index(target[:5],5)[0] # replace with your input
 # use the final hidden state from the encoder
decoder_output, decoder_hidden,decoder_cell = decoder(input_seq,encoder_output, encoder_hidden,encoder_cell)
decoder_output.shape,decoder_hidden.shape,decoder_cell.shape

RuntimeError: ignored

In [None]:
encoder = Encoder(input_size=10, hidden_size=128,num_layers=1,encoder_dropout=0.5)
input_seq = input[5] # example input sequence of shape (batch_size, sequence_length)
encoder_output, encoder_hidden, encoder_cell = encoder(input_seq,1)
encoder_output.shape,encoder_hidden.shape,encoder_cell.shape
decoder = Decoder(input_size=10, hidden_size=128, output_size=len(en_vocab), num_layers=1,decoder_dropout=0.5)
trg_seq = en_index(target[5],1)[0] # replace with your input
 # use the final hidden state from the encoder
decoder_output, decoder_hidden,decoder_cell = decoder(trg_seq,encoder_output, encoder_hidden,encoder_cell)
decoder_output.shape,decoder_hidden.shape,decoder_cell.shape

In [None]:
en_ix2word[torch.IntTensor.item(torch.argmax(decoder_output))]

In [None]:
trg_seq.shape

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq,batch_size):
        target_len = lg
        target_vocab_size = len(en_vocab)
        outputs = torch.zeros(target_len, batch_size, target_vocab_size)
        encoder_output, encoder_hidden, encoder_cell = self.encoder(input_seq,batch_size)
        decoder_input = target_seq[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(decoder_input,encoder_output, encoder_hidden, encoder_cell)
            outputs[t] = output
            decoder_input = target_seq[t]
        return outputs#shape=seq_len,batchsize,en_vocabsize

In [None]:
encoder = Encoder(input_size=10, hidden_size=128,num_layers=1,encoder_dropout=0.5)
decoder = Decoder(input_size=10, hidden_size=128, output_size=len(en_vocab), num_layers=1,decoder_dropout=0.5)
model=Seq2Seq(encoder,decoder)

out=model(input[:5],en_index(target[:5],5),batch_size=5)
out.shape

In [None]:
torch.argmax(out[1][0])

In [None]:
num_epochs= 3
learning_rate = 0.001
batch_size = 5

In [None]:
encoder = Encoder(input_size=10, hidden_size=1024,num_layers=1,encoder_dropout=0.5)
decoder = Decoder(input_size=10, hidden_size=1024, output_size=len(en_vocab), num_layers=1,decoder_dropout=0.5)
model=Seq2Seq(encoder,decoder)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
loss_fn = nn.CrossEntropyLoss()

In [None]:
def create_target_tensor(target_sentence,batch_size):
    
    target_tensor = torch.zeros(batch_size,len(en_vocab))
    for n in range(batch_size):
      target=en_index(target_sentence[n],1)
      for i,word in enumerate(target):
        target_tensor[n][torch.IntTensor.item(word)]=1
    target_tensor=target_tensor.type(torch.long)
    return target_tensor

In [None]:
create_target_tensor(target[:5],5).shape

In [None]:
loss_fn(out.permute(1,0,2),create_target_tensor(target[:5],5))

In [None]:
# Driving training loop
BATCH=30
model.train()
#for epoch in tqdm(range(1,num_epochs+1)):
for epoch in range(3):
    total_epoch_loss=0
    # Iterate through train dataset
    for i in range(0,961,BATCH):
        # 1. forward pass the inputs through the model
        output =model(input[i:i+BATCH],en_index(target[i:i+BATCH],BATCH),batch_size=BATCH).permute(1,0,2)
        trg=create_target_tensor(target[i:i+BATCH],BATCH)

        optimizer.zero_grad()
        
        loss = loss_fn(output,trg)                                 
        total_epoch_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        # 4. update the parameters
        optimizer.step()
        print(i)

    print(f"Epoch: [{epoch}/{num_epochs}] Epoch Loss: {total_epoch_loss}")

In [None]:
encoder_output,encoder_hidden, encoder_cell = model.encoder(input[998],1)
output, hidden, cell = model.decoder(torch.tensor(7177).unsqueeze(0),encoder_output,encoder_hidden,encoder_cell)

In [None]:
en_ix2word[torch.IntTensor.item(torch.argmax(output[0][0]))]

In [None]:
en_word2ix['<sos>']

In [None]:
test_input[0]

In [None]:
def translate_sentence(model, sentence,max_length=lg):


    with torch.no_grad():
        encoder_output,encoder_hidden, encoder_cell = model.encoder(sentence,1)
    out=[89]
    for i in range(max_length):
        previous_word = torch.tensor(out[-1])

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word.unsqueeze(0),encoder_output,encoder_hidden,encoder_cell)
            best_guess=torch.IntTensor.item(torch.argmax(output[0][0]))
        out.append(best_guess)

        if torch.IntTensor.item(torch.argmax(output[0][0])) == en_word2ix['<eos>']:
            break

    translated_sen = [en_ix2word[idx] for idx in out]

    return translated_sen[1:]

In [None]:
sent=input[9]
translate_sentence(model,sent)

In [None]:
a

In [None]:
for i in range(len(a)):
  a[i]='<pad>'
a,pad_sent(target[0])

In [None]:
bleu_score(a, pad_sent(target[989]))

In [None]:
from torchtext.data.metrics import bleu_score

In [None]:
def bleu(data, model, german, english, device):
    targets= []
    outputs= []

    for eg in data:
        src = vars(eg)['src']
        trg = var(eg)['trg']

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1] # eos removed

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)

In [None]:
'''
trg=trg.type(torch.LongTensor)
out=out.type(torch.LongTensor)
scores = torch.zeros((31,5,10612))  # shape: (batch_size=10, num_classes=5)
labels = torch.ones((31,5,10612),dtype=torch.long)  # shape: (batch_size=10)

# Define the cross-entropy loss function

# Compute the loss
loss_fn(scores, labels)
'''