In [47]:
import numpy as np
import pandas as pd
import os
import shutil
import cv2
from tqdm.notebook import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
import spacy

In [48]:
# Download the training set.
!wget https://raw.githubusercontent.com/nitinpunjabi/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt

--2023-05-02 14:50:34--  https://raw.githubusercontent.com/nitinpunjabi/nlp-demystified/main/datasets/hun_eng_pairs/hun_eng_pairs_train.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5518306 (5.3M) [text/plain]
Saving to: ‘hun_eng_pairs_train.txt.1’


2023-05-02 14:50:35 (66.7 MB/s) - ‘hun_eng_pairs_train.txt.1’ saved [5518306/5518306]



In [49]:
with open('hun_eng_pairs_train.txt') as file:
  train = [line.rstrip() for line in file]

In [50]:
train[:3],len(train)

(["Teszek rá, mit mondasz!<sep>I don't care what you say.",
  'Több olyan ember kell nekünk a csapatba, mint amilyen te vagy.<sep>We need more people like you on our team.',
  'Vigyázz a gyerekeimre!<sep>Take care of my children.'],
 88647)

In [51]:
# Separate the input (Hungarian) and target (English) sentences into separate lists.
SEPARATOR = '<sep>'
train_input, train_target = map(list, zip(*[pair.split(SEPARATOR) for pair in train]))

In [52]:
print(train_input[:3])
print(train_target[:3])

['Teszek rá, mit mondasz!', 'Több olyan ember kell nekünk a csapatba, mint amilyen te vagy.', 'Vigyázz a gyerekeimre!']
["I don't care what you say.", 'We need more people like you on our team.', 'Take care of my children.']


In [53]:
print("\u00E1", "\u0061\u0301")

á á


In [54]:
import unicodedata
import re
# Unicode normalization
def normalize_unicode(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn')

In [55]:
def preprocess_sentence(s):
  s = normalize_unicode(s)
  s = re.sub(r"([?.!,¿])", r" \1 ", s)
  s = re.sub(r'[" "]+', " ", s)
  s = s.strip()
  return s

In [56]:
# Preprocess both the source and target sentences.
train_preprocessed_input = [preprocess_sentence(s) for s in train_input]
train_preprocessed_target = [preprocess_sentence(s) for s in train_target]

In [57]:
train_preprocessed_input[:3],train_preprocessed_target[:3],

(['Teszek ra , mit mondasz !',
  'Tobb olyan ember kell nekunk a csapatba , mint amilyen te vagy .',
  'Vigyazz a gyerekeimre !'],
 ["I don't care what you say .",
  'We need more people like you on our team .',
  'Take care of my children .'])

In [58]:
def tag_target_sentences(sentences):
  tagged_sentences = map(lambda s: (' ').join(['<sos>', s, '<eos>']), sentences)
  return list(tagged_sentences)

In [59]:
train_tagged_preprocessed_target = tag_target_sentences(train_preprocessed_target)
train_tagged_preprocessed_target[0]

"<sos> I don't care what you say . <eos>"

In [60]:
np.array(train_preprocessed_input).shape

(88647,)

In [61]:
def tokenise(line):
	ans=[]
	punc = [',', '.', '"', "'", '/', '*', ',', '?', '!', '-', '\n', '“', '”', '_', '&', '\ufeff', '&', ';', ":",'#','$','%','&','(',')','*','+','-','/',':',';','=','@',',','[\\]','^','_','`{|}~','\t']
	for el in line:
		if el in punc:
			line = line.replace(el, "")
	
	line=line.lower()
	return line.split()
	raise NotImplementedError

In [62]:
inp=np.array
a=tokenise(train_preprocessed_target[1])
a=np.array([a])
a.reshape(9,1)
a[0]

array(['we', 'need', 'more', 'people', 'like', 'you', 'on', 'our', 'team'],
      dtype='<U6')

In [63]:
np.array([1,2,3]).shape

(3,)

In [64]:
sent=np.array([tokenise(train_preprocessed_input[0])])
sent=np.append(sent,'<pad>')
len(sent[0])

6

In [65]:
input=np.array([tokenise(train_preprocessed_input[0])])
pad=np.full((50-len(input[0])),'<pad>')
sent=np.concatenate((input.squeeze(0),pad),axis=0)

In [66]:
de_vocab=set()
for line in train_preprocessed_input:
  sent=np.array([tokenise(line)])
  for x in sent[0]:
    de_vocab.add(x)

In [67]:
en_vocab=set()
for line in train_tagged_preprocessed_target:
  sent=np.array([tokenise(line)])
  for x in sent[0]:
    en_vocab.add(x)

In [68]:
!pip install ordered-set
from ordered_set import OrderedSet

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [69]:
vg=list(de_vocab)
vg.insert(0,'<pad>')
de_vocab=OrderedSet(vg)

ve=list(en_vocab)
ve.insert(0,'<pad>')
en_vocab=OrderedSet(ve)

In [70]:
de_word2ix = {word: i for i, word in enumerate(de_vocab)}
de_ix2word = {i: word for i, word in enumerate(de_vocab)}
en_word2ix = {word: i for i, word in enumerate(en_vocab)}
en_ix2word = {i: word for i, word in enumerate(en_vocab)}

In [71]:
def tensor(list):
  return torch.from_numpy(np.array(list))

In [72]:
def de_generate_index(line):
  new_line=[]
  sent=tokenise(line)
  for x in sent:
   new_line.append(de_word2ix[x])
  return tensor(new_line)

In [73]:
def en_generate_index(line):
  new_line=[]
  sent=tokenise(line)
  for x in sent:
   new_line.append(en_word2ix[x])
  return tensor(new_line)

In [74]:
input=train_preprocessed_input
target=train_tagged_preprocessed_target

In [75]:
de_generate_index(input[0]),en_generate_index(target[0])

(tensor([25186, 28191,  1284, 15678]),
 tensor([5641, 8688, 2164, 2229, 6049, 6620, 2671, 1527]))

In [76]:
def longest_seq(corpus):
  c=len(tokenise(corpus[0]))
  for sent in corpus:
    line=tokenise(sent)
    if c<len(line):c=len(line)
  return c
longest_seq(input),longest_seq(target)

(30, 31)

In [77]:
def pad(line,longest_seq_length):
  pad=torch.zeros((longest_seq_length-len(line)))
  return torch.cat((line,pad))

In [78]:
def de_index(batch,batch_size):
  if(batch_size==1):return pad(de_generate_index(batch),31).type(torch.LongTensor)
  bs=pad(de_generate_index(batch[0]),31)
  for i in range(1,batch_size):
    bs=torch.vstack((bs,pad(de_generate_index(batch[i]),31)))
  bs=bs.permute(1,0)
  return bs.type(torch.LongTensor)

def en_index(batch,batch_size):
  if(batch_size==1):return pad(en_generate_index(batch),31).type(torch.LongTensor)
  bs=pad(en_generate_index(batch[0]),31)
  for i in range(1,batch_size):
    bs=torch.vstack((bs,pad(en_generate_index(batch[i]),31)))
  bs=bs.permute(1,0)
  return bs.type(torch.LongTensor)

In [79]:
def initialize_embeddings(de_vocab_size,en_vocab_size ,embedding_dim):
    """
    Initialize source and target embeddings
    
    Args:
    vocab_size: size of the vocabulary
    embedding_dim: size of the embedding dimension
    
    Returns:
    source_embed: source embedding layer
    target_embed: target embedding layer
    """
    
    # initialize source embedding layer
    source_embed = nn.Embedding(de_vocab_size, embedding_dim)
    
    # initialize target embedding layer
    target_embed = nn.Embedding(en_vocab_size, embedding_dim)
    
    # return source and target embeddings
    return source_embed, target_embed

In [80]:
source_embed, target_embed = initialize_embeddings(de_vocab_size=len(de_vocab),en_vocab_size=len(en_vocab), embedding_dim=10)

In [81]:
target_embed(en_index(target[:5],5))

tensor([[[-0.0609, -1.3049, -0.3995,  ..., -1.2583,  2.4773,  1.6223],
         [-0.0609, -1.3049, -0.3995,  ..., -1.2583,  2.4773,  1.6223],
         [-0.0609, -1.3049, -0.3995,  ..., -1.2583,  2.4773,  1.6223],
         [-0.0609, -1.3049, -0.3995,  ..., -1.2583,  2.4773,  1.6223],
         [-0.0609, -1.3049, -0.3995,  ..., -1.2583,  2.4773,  1.6223]],

        [[ 1.7515,  0.7469,  2.5176,  ...,  0.1576,  2.4499,  1.3010],
         [ 0.6884,  3.2700,  1.0205,  ..., -0.3058, -0.3924,  0.3207],
         [-0.1351,  0.9520,  0.0486,  ..., -0.0538,  0.9967,  1.3971],
         [-0.1229, -2.0459,  0.8047,  ..., -0.4054,  2.6172,  0.6063],
         [ 0.7123,  0.7585, -1.3208,  ..., -1.3424,  0.4047,  0.1186]],

        [[ 1.3198, -0.7766, -1.6296,  ..., -1.9968, -0.9073, -0.3267],
         [ 0.0652, -1.5578,  1.2297,  ...,  0.6681, -0.9276, -0.8125],
         [ 0.9314,  2.6078,  0.2014,  ..., -1.1021,  0.8485, -0.9834],
         [-0.5093,  0.9105, -0.6157,  ..., -0.6821,  0.5606,  1.2277],
  

In [82]:
class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size,num_layers,encoder_dropout):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers=num_layers
        self.lstm = nn.LSTM(input_size, hidden_size,num_layers, bidirectional=True)
        self.dropout=nn.Dropout(encoder_dropout)
    def forward(self, input_seq,batch_size):
        embedded = self.dropout(source_embed(de_index(input_seq,batch_size)))
        output, (hidden, cell) = self.lstm(embedded)
        return output, hidden, cell
        #output=seq_len,batchsize,2*hiddensize
        #hidden=2*numlayers,batchsizze,hiddensize=128
        #cell=2*numlayers,batchsizze,hiddensize=128

In [83]:
# Create an instance of the Encoder model with the pre-trained embedding matrix
encoder = Encoder(input_size=10, hidden_size=128,num_layers=1,encoder_dropout=0.5)
input_seq = input[:5] # example input sequence of shape (batch_size, sequence_length)
encoder_output, encoder_hidden, encoder_cell = encoder(input_seq,5)
encoder_output.shape,encoder_hidden.shape,encoder_cell.shape

(torch.Size([31, 5, 256]), torch.Size([2, 5, 128]), torch.Size([2, 5, 128]))

In [42]:
class Decoder(nn.Module):
    def __init__(self,input_size,hidden_size, output_size, num_layers,decoder_dropout):
        super(Decoder, self).__init__()

        self.num_layers=num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False)
        
        self.output_layer = nn.Linear(hidden_size, output_size)
        self.dropout=nn.Dropout(decoder_dropout)
    def forward(self, input_seq, hidden,cell):
        #input_seq:shape=batchsize
        #input_seq.unsqeeze(0):shape=1,batchsize
        embedded = self.dropout(target_embed(input_seq.unsqueeze(0)))

        output, (hidden, cell) = self.lstm(embedded, (hidden, cell))

        # Pass the LSTM output through the output layer to get the output
        pred = self.output_layer(output)
        pred.squeeze(0)
        return pred,hidden,output
        #prde=output=1,en_vocabsize
        #hidden=num_layers,hiddensize
        #cell=num_layers,hiddensize

In [None]:
a=en_index(target[1],1)
target_embed(a[0].unsqueeze(0)).shape,a[0].unsqueeze(0).shape

(torch.Size([1, 10]), torch.Size([1]))

In [46]:
decoder = Decoder(input_size=10, hidden_size=128, output_size=len(en_vocab), num_layers=2,decoder_dropout=0.5)
input_seq = en_index(target[:5],5)[0] # replace with your input
 # use the final hidden state from the encoder
decoder_output, decoder_hidden,decoder_cell = decoder(input_seq, encoder_hidden,encoder_cell)
decoder_output.shape,decoder_hidden.shape,decoder_cell.shape

(torch.Size([1, 5, 10612]), torch.Size([2, 5, 128]), torch.Size([1, 5, 128]))

In [None]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq,batch_size):
        target_len = 31
        target_vocab_size = len(en_vocab)
        outputs = torch.zeros(target_len, batch_size, target_vocab_size)
        encoder_output, encoder_hidden, encoder_cell = self.encoder(input_seq,batch_size)
        decoder_input = target_seq[0]
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(decoder_input, encoder_hidden, encoder_cell)
            outputs[t] = output
            decoder_input = target_seq[t]
        return outputs#shape=seq_len,batchsize,en_vocabsize

In [None]:
encoder = Encoder(input_size=10, hidden_size=128,num_layers=1,encoder_dropout=0.5)
encoder_output, encoder_hidden, encoder_cell = encoder(input[:5],5)
decoder = Decoder(input_size=10, hidden_size=128, output_size=len(en_vocab), num_layers=2,decoder_dropout=0.5)
decoder_output, decoder_hidden,decoder_cell = decoder(en_index(target[:5],5)[0], encoder_hidden,encoder_cell)

In [None]:
encoder = Encoder(input_size=10, hidden_size=1024,num_layers=1,encoder_dropout=0.5)
decoder = Decoder(input_size=10, hidden_size=1024, output_size=len(en_vocab), num_layers=2,decoder_dropout=0.5)
model=Seq2Seq(encoder,decoder)

out=model(input[:5],en_index(target[:5],5),batch_size=5)
out.shape

torch.Size([31, 5, 10612])

In [None]:
out.permute(1,0,2)

In [None]:
num_epochs= 50
learning_rate = 0.001
batch_size = 64

In [None]:
encoder = Encoder(input_size=10, hidden_size=1024,num_layers=2,encoder_dropout=0.5)
decoder = Decoder(input_size=10, hidden_size=1024, output_size=len(en_vocab), num_layers=4,decoder_dropout=0.5)
model=Seq2Seq(encoder,decoder)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 
loss_fn = nn.CrossEntropyLoss()

In [None]:
en_index(target[:5],5).contiguous().view(-1).shape
out.view(-1, out.shape[-1]).shape

In [None]:
loss_fn(out.view(-1, out.shape[-1]),en_index(target[:5],5).contiguous().view(-1))

In [None]:
'''
trg=trg.type(torch.LongTensor)
out=out.type(torch.LongTensor)
'''

In [None]:
scores = torch.zeros((31,5,10612))  # shape: (batch_size=10, num_classes=5)
labels = torch.ones((31,5,10612),dtype=torch.long)  # shape: (batch_size=10)

# Define the cross-entropy loss function

# Compute the loss
loss_fn(scores, labels)

In [None]:
# Driving training loop
BATCH=20
model.train()
c=0
#for epoch in tqdm(range(1,num_epochs+1)):
for epoch in range(3):
    total_epoch_loss=0
    # Iterate through train dataset
    for i in range(0,100,BATCH):
        # 1. forward pass the inputs through the model
        output =model(input[i:i+BATCH],en_index(target[i:i+BATCH],BATCH),batch_size=BATCH).view(-1, out.shape[-1])
        trg=en_index(target[i:i+BATCH],BATCH).contiguous().view(-1)

        optimizer.zero_grad()
        
        loss = loss_fn(output,trg)                                 
        total_epoch_loss += loss.item()

        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1)
        # 4. update the parameters
        optimizer.step()
        print(i)

    print(f"Epoch: [{epoch}/{num_epochs}] Epoch Loss: {total_epoch_loss}")

0
20
40
60
80
Epoch: [0/50] Epoch Loss: 28.915438652038574
0
20
40
60
80
Epoch: [1/50] Epoch Loss: 10.098323345184326
0
20
40
60
80
Epoch: [2/50] Epoch Loss: 9.379982590675354


In [None]:
def translate_sentence(model, sentence,max_length=31):


    with torch.no_grad():
        encoder_output,encoder_hidden, encoder_cell = model.encoder(sentence,1)
    out=[en_word2ix['<sos>']]
    for i in range(max_length):
        previous_word = torch.tensor(out[-1])

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word,encoder_hidden,encoder_cell)
            best_guess = output.argmax(1).item()

        out.append(best_guess)

        if output.argmax(1).item() == en_word2ix['<eos>']:
            break

    translated_sen = [en_ix2word[idx] for idx in out]

    return translated_sen[1:]

In [None]:
sent=input[109]
translate_sentence(model,sent,)

['<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>',
 '<pad>']

In [None]:
'''
input=np.array([tokenise(train_preprocessed_input[0])])
pad=np.full((50-len(input[0])),'<pad>')
input=np.concatenate((input.squeeze(0),pad),axis=0)
for line in train_preprocessed_input:
  sent=np.array([tokenise(line)])
  pad=np.full((50-len(sent[0])),'<pad>')
  sent=np.concatenate((sent.squeeze(0),pad),axis=0)
  input=np.vstack([input, sent])
  '''

In [None]:
target=[]
for line in train_preprocessed_input:
  input.append(tokenise(line))

In [None]:
from torchtext.data import get_tokenizer
tokenizer = get_tokenizer("spacy")
