In [1]:
#importing all the necessary libraries
import random
import pandas as pd
import os
import torch
import nltk as nltk
import re
import tqdm
from collections import Counter
import pickle
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import numpy

In [2]:
#setting the pytorch device to cuda
# Check if CUDA (GPU support) is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"GPU: {torch.cuda.get_device_name(0)}")  # Print GPU name
else:
    device = torch.device("cpu")
    print("CUDA is not available. Training on CPU...")

GPU: NVIDIA GeForce GTX 1650 Ti


In [2]:
#loading the english-german euro dataset
CD = "./Seq2Seq Model/DataSet/EN-DE/"
#source language is english
SL = 'EN'
#translated language is german
TL = 'DE'
df = pd.read_csv("D:\projects\Seq2Seq Model\DataSet\EN-DE\EN-DE.txt", sep = "\t", header= None)[[0,1]].rename(columns = {0:SL, 1:TL})

  df = pd.read_csv("D:\projects\Seq2Seq Model\DataSet\EN-DE\EN-DE.txt", sep = "\t", header= None)[[0,1]].rename(columns = {0:SL, 1:TL})


In [3]:
df.head(5693624)

Unnamed: 0,EN,DE
0,Commission Regulation (EC) No 1788/2004,Verordnung (EG) Nr. 1788/2004 der Kommission
1,of 15 October 2004,vom 15. Oktober 2004
2,fixing the minimum selling prices for butter f...,zur Festsetzung der Mindestverkaufspreise für ...
3,"THE COMMISSION OF THE EUROPEAN COMMUNITIES,",DIE KOMMISSION DER EUROPÄISCHEN GEMEINSCHAFTEN —
4,Having regard to the Treaty establishing the E...,gestützt auf den Vertrag zur Gründung der Euro...
...,...,...
5693619,Third country code [1],Drittland-Code [1]
5693620,Standard import value,Pauschaler Einfuhrpreis
5693621,Country nomenclature as fixed by Commission Re...,Nomenklatur der Länder gemäß der Verordnung (E...
5693622,Code ‘999’ stands for ‘of other origin’.,Der Code „999“ steht für „Verschiedenes“.


In [None]:
#function that tokenises a given string and returns a list of individual words
def tokenize(text):
    text = text.lower()
    # removes punctuations and split into words
    tokens = re.findall(r'\b\w+\b', text)
    return tokens

In [None]:
#building the english and german dictionaries 
english_words = []
german_words = []
error_count = 0 
#before, we use the lists to store all the words inside the corpus
for i in tqdm.tqdm(range(len(df))):
    try:
        #loading the english and german text strings for a certain row
        english_text_array = df['EN'].values[i:i+1]
        english_text = english_text_array[0]
        german_text_array = df['DE'].values[i:i+1]
        german_text = german_text_array[0]
        
        
        #step to remove all the punctuations and non-words
        english_text = tokenize(english_text)
        german_text = tokenize(german_text)
    except Exception as e:
        error_count = error_count+1
        continue
    
    #if the try block of code runs then we can push the words into their respective lists
    english_words.extend(english_text)
    german_words.extend(german_text)
print("Number of times the pre-processing failed: ",error_count)
print("Number of elements in german token list is: ",len(german_words))

# building vocabulary and storing the most frequent words
def build_vocab(tokens,max_size):
    #Count the frequency of each item
    counter = Counter(tokens)
    # sort by frequency and then alphabetically 
    sorted_vocab = sorted(counter.items(), key= lambda x: (-x[1],x[0]))
    # create dictionary mapping each token to a unique indx
    # we leave 4 spaces for special token entries
    vocab_words = {word: idx for idx, (word, _) in enumerate(sorted_vocab,start = 4)}
    
    extra_words = (len(vocab_words)-max_size)+4
    for i in range(extra_words):
        vocab_words.popitem()
    
    # special tokens
    vocab_words['<PAD>'] = 0
    vocab_words['<SOS>'] = 1
    vocab_words['<EOS>'] = 2
    vocab_words['<UNK>'] = 3 
    return vocab_words


english_dict = build_vocab(english_words,20000)
german_dict = build_vocab(german_words,40000)
#saving the dictionary 
with open("./dictionary/english_dict.pickle", "wb") as file:
    pickle.dump(english_dict,file)
with open("./dictionary/german_dict.pickle", "wb") as file:
    pickle.dump(german_dict,file)

In [3]:
#loading the dictionaries
import pickle
with open("./dictionary/english_dict.pickle", "rb") as file:
    english_dict = pickle.load(file)
with open("./dictionary/german_dict.pickle", "rb") as file:
    german_dict = pickle.load(file)

In [4]:
class Encoder(nn.Module):
    def __init__(self,input_size, embedding_size, hidden_size,num_layers, p):
        super(Encoder,self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size,embedding_size)
        self.rnn = nn.LSTM(embedding_size,hidden_size, num_layers, dropout=p)
        
    def forward(self,x):
        embedding = self.dropout(self.embedding(x))
        outputs, (hidden, cell) = self.rnn(embedding)
        
        return hidden, cell

In [5]:
class Decoder(nn.Module):
    def __init__(self,input_size,embedding_size,hidden_size,output_size,num_layers,p):
        super(Decoder, self).__init__()
        self.hidden_Size = hidden_size
        self.num_layers = num_layers
        
        self.dropout = nn.Dropout(p)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p)
        self.fc = nn.Linear(hidden_size, output_size)
        
    def forward(self,x,hidden, cell):
        x = x.unsqueeze(0)
        
        embedding = self.dropout(self.embedding(x))
        outputs, (hidden,cell) = self.rnn(embedding,(hidden,cell))
        
        predictions = self.fc(outputs)
        predictions = predictions.squeeze(0)
        
        return predictions, hidden, cell

In [6]:
class Seq2Seq(nn.Module):
    def __init__(self,encoder,decoder):
        super(Seq2Seq,self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        
    def forward(self,source,target, teacher_force_ratio = 0.5):
        batch_size = source.shape[1]
        target_len = target.shape[0]
        
        target_vocab_len = len(english_dict)
        
        outputs = torch.zeros(target_len,batch_size,target_vocab_len).to(device)
        
        hidden, cell = self.encoder(source)
        #grab start token
        x = target[0]
        
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            
            outputs[t] = output
            
            best_guess = output.argmax(1)
            
            x = target[t] if random.random() < teacher_force_ratio else best_guess
            
        return outputs
            


In [7]:
#dataset class and dataloader
class english_german_translation(Dataset):
    def __init__(self,english_dict,german_dict):
        CD = "./Seq2Seq Model/DataSet/EN-DE/"
        #source language is english
        SL = 'EN'
        #translated language is german
        TL = 'DE'
        df = pd.read_csv("D:\projects\Seq2Seq Model\DataSet\EN-DE\EN-DE.txt", sep = "\t", header= None)[[0,1]].rename(columns = {0:SL, 1:TL})
        self.data = df["DE"].values
        self.target = df["EN"].values
        
        self.english_dict = english_dict
        self.german_dict = german_dict
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        x = self.data[idx]
        y = self.target[idx]
        
        x = self.string_to_tokens(x,self.german_dict,True)
        y = self.string_to_tokens(y,self.english_dict,False)
        
        return x,y
            
    def string_to_tokens(self,string,dict,gate):
        string = string.lower()
        # removes punctuations and split into words
        string = re.findall(r'\b\w+\b', string)
        indices = [dict.get(word,3) for word in string]
        if gate == True:
            indices = list(reversed(indices))
        tensor = torch.tensor(indices, dtype=torch.long)
        return tensor
        
def custom_collate_fn(batch):
    # Unzip the batch of tuples (x, y)
    source_sequences, target_sequences = zip(*batch)
    
    # Find the maximum lengths for padding
    max_source_len = max(len(seq) for seq in source_sequences)
    max_target_len = max(len(seq) for seq in target_sequences)
    
    # Initialize padded tensors with padding index (e.g., 0 or a custom padding index)
    pad_index = 0  # Adjust this if you have a specific padding index
    padded_sources = torch.full((len(batch), max_source_len), pad_index, dtype=torch.long)
    padded_targets = torch.full((len(batch), max_target_len), pad_index, dtype=torch.long)
    
    for i, (source, target) in enumerate(batch):
        padded_sources[i, :len(source)] = source
        padded_targets[i, :len(target)] = target
    
    # Transpose the padded sources and targets to (sequence_length, batch_size)
    padded_sources = padded_sources.transpose(0, 1)
    padded_targets = padded_targets.transpose(0, 1)
    
    return padded_sources, padded_targets


  df = pd.read_csv("D:\projects\Seq2Seq Model\DataSet\EN-DE\EN-DE.txt", sep = "\t", header= None)[[0,1]].rename(columns = {0:SL, 1:TL})


In [8]:
#Hyper parameters
num_epochs = 10
learning_Rate = 0.001
batch_size = 64


input_size_encoder = len(german_dict)
input_Size_Decoder = len(english_dict)
output_size = len(english_dict)
encoder_embedding_Size = 300
decoder_embedding_Size = 300
hidden_Size = 1024
num_layers = 2 
encoder_dropout = 0.5
decoder_dropout = 0.5

In [9]:
#intialising the dataset class and dataloader
dataset = english_german_translation(english_dict,german_dict)
train_dataloader = DataLoader(dataset,batch_size,shuffle=True, collate_fn=custom_collate_fn,drop_last=True)

In [11]:
#intialising our model
encoder_net = Encoder(input_size_encoder, encoder_embedding_Size, hidden_Size, num_layers, encoder_dropout).to(device)
decoder_net = Decoder(input_Size_Decoder, decoder_embedding_Size, hidden_Size,output_size, num_layers, decoder_dropout).to(device)
model = Seq2Seq(encoder_net,decoder_net).to(device)
pad_idx = english_dict['<PAD>']
criterion =  nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(),lr=learning_Rate)

In [13]:
loss_history = []
PATH = "./DataSet/Model/model.pth"
for epoch in range(num_epochs):
    print(f"Epoch [{epoch}/{num_epochs}")
    i = 0
    for batch_idx, (batch_data,batch_labels) in enumerate(train_dataloader):
        input_data = batch_data.to(device)
        target = batch_labels.to(device)
        
        output = model(input_data,target)
        output = output[1:].reshape(-1,output.shape[2])
        target = target[1:].reshape(-1)
        
        optimizer.zero_grad()
        loss = criterion(output,target)
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
        
        optimizer.step()
        
        # Print loss after each iteration
        print(f"Epoch [{epoch+1}/{num_epochs}], Batch [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")
        loss_history.append(loss.item())
        i = i+1
        if i ==10000:
            break
        
torch.save(model.state_dict(),PATH)
print("Model saved successfully.") 
        

Epoch [0/10
Epoch [1/10], Batch [1/88962], Loss: 9.9055
Epoch [1/10], Batch [2/88962], Loss: 9.8603
Epoch [1/10], Batch [3/88962], Loss: 9.5364
Epoch [1/10], Batch [4/88962], Loss: 8.2251
Epoch [1/10], Batch [5/88962], Loss: 7.6569
Epoch [1/10], Batch [6/88962], Loss: 7.4700
Epoch [1/10], Batch [7/88962], Loss: 7.5299
Epoch [1/10], Batch [8/88962], Loss: 7.2931
Epoch [1/10], Batch [9/88962], Loss: 7.2175
Epoch [1/10], Batch [10/88962], Loss: 7.1538
Epoch [1/10], Batch [11/88962], Loss: 7.3924
Epoch [1/10], Batch [12/88962], Loss: 7.2614
Epoch [1/10], Batch [13/88962], Loss: 7.3036
Epoch [1/10], Batch [14/88962], Loss: 7.0462
Epoch [1/10], Batch [15/88962], Loss: 6.9511
Epoch [1/10], Batch [16/88962], Loss: 6.8110
Epoch [1/10], Batch [17/88962], Loss: 7.1339
Epoch [1/10], Batch [18/88962], Loss: 7.0789
Epoch [1/10], Batch [19/88962], Loss: 6.8565
Epoch [1/10], Batch [20/88962], Loss: 7.2047
Epoch [1/10], Batch [21/88962], Loss: 7.0324
Epoch [1/10], Batch [22/88962], Loss: 7.2018
Epoch [

  df = pd.read_csv("D:\projects\Seq2Seq Model\DataSet\EN-DE\EN-DE.txt", sep = "\t", header= None)[[0,1]].rename(columns = {0:SL, 1:TL})


AttributeError: 'float' object has no attribute 'lower'

Model saved successfully.
