In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
# Select the compute backend once, in order of preference:
# CUDA GPU, then Apple Metal (MPS), then plain CPU.
# The getattr guard keeps this cell working on torch builds without `torch.backends.mps`.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
device

device(type='mps')

# Loading Data
The data for this project is a set of thousands of English-to-French translation pairs.

Each word will be one-hot encoded as follows:
<p align="center">
<img src="../images/word-encoding.png" style="width:450px;height:250px;">
</p>

Where -

**SOS**: Start of Sentence token

**EOS**: End of Sentence token

### How to achieve it?
We will need a unique index per word to use as the inputs and targets of the network later. To keep track of all this, we will use a helper class called `Lang`, which has word → index (`word2index`) and index → word (`index2word`) dictionaries, as well as a count of each word (`word2count`), which will be used to replace **rare** words later.

In [5]:
# Reserved vocabulary indices for the sentence-boundary markers:
# SOS marks the start of a sentence, EOS marks its end.
SOS_token, EOS_token = 0, 1

In [6]:
class Lang:
    """Vocabulary for one language.

    Attributes:
        name: Label given to this vocabulary at construction time.
        word2index: Maps each known word to its integer index.
        word2count: Maps each known word to the number of times it has been added.
        index2word: Reverse mapping from index to word; pre-filled with
            "SOS" at index 0 and "EOS" at index 1.
        n_words: Number of entries in ``index2word`` (starts at 2 for the
            two pre-filled markers) and therefore the next free index.
    """

    def __init__(self, name) -> None:
        self.name = name
        self.word2index = {}
        self.word2count = {}
        # Indices 0 and 1 are taken by the sentence-boundary markers.
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2

    def addWord(self, word):
        """Register one occurrence of ``word``, assigning it an index if it is new."""
        if word in self.word2index:
            # Already in the vocabulary: only the occurrence count changes.
            self.word2count[word] += 1
            return

        # First time this word is seen: give it the next free index.
        new_index = self.n_words
        self.word2index[word] = new_index
        self.index2word[new_index] = word
        self.word2count[word] = 1
        self.n_words = new_index + 1

    def addSentence(self, sentence):
        """Add every token of ``sentence``, splitting on single space characters."""
        for token in sentence.split(' '):
            self.addWord(token)

### Converting Unicode strings to ASCII (stripping accents)

In [7]:
def unicodeToAscii(s):
    """Return ``s`` with combining marks (accents and similar) removed.

    The string is decomposed with NFD normalization, which separates base
    characters from their combining marks, and every character whose Unicode
    category is 'Mn' (nonspacing mark) is dropped. Characters that have no
    such decomposition are returned unchanged, so the result is not
    guaranteed to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

### Lowercasing, trimming, and removing non-letter characters

In [8]:
def normalizeString(s):
    """Lowercase ``s`` and reduce it to space-separated letters and ``.``/``!``/``?`` marks.

    Steps:
        1. Lowercase, strip surrounding whitespace and pass the result through
           ``unicodeToAscii``.
        2. Insert a space before every ``.``, ``!`` or ``?`` so each mark
           becomes its own token.
        3. Replace every run of characters other than ASCII letters and those
           three marks with a single space.
        4. Strip the result, because step 3 can leave a space at either end
           (e.g. a string ending in digits).

    Args:
        s: The raw text to normalize.

    Returns:
        The normalized string, with no leading or trailing whitespace.
    """
    s = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation from the preceding word.
    s = re.sub(r"([.!?])", r" \1", s)
    # Collapse every run of other characters (digits, quotes, commas, spaces, ...) into one space.
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # Without this strip, a later split(' ') would produce empty-string tokens.
    return s.strip()

In [9]:
def readLangs(lang1, lang2, reverse = False, data_dir = '../datasets/english-french data'):
    """Read a tab-separated translation file and create two empty vocabularies.

    The file ``<data_dir>/<lang1>-<lang2>.txt`` is read as UTF-8. Each line is
    split on tab characters and every resulting field is passed through
    ``normalizeString``.

    Args:
        lang1: Language name used in the file name and for one of the ``Lang`` objects.
        lang2: Language name used in the file name and for the other ``Lang`` object.
        reverse: If True, the fields of every line are reversed and ``lang2``
            (instead of ``lang1``) names the input vocabulary.
        data_dir: Directory that contains the data file. The default is the
            location that was previously hard-coded.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)``. The two ``Lang`` objects
        are newly constructed and no words are added to them here. ``pairs``
        is a list with one list of normalized strings per line of the file.

    Raises:
        OSError: If the data file cannot be opened.
    """
    print('Reading Lines...')

    # Read the whole file inside a context manager so the handle is always closed.
    path = '%s/%s-%s.txt' % (data_dir, lang1, lang2)
    with open(path, encoding='utf-8') as data_file:
        lines = data_file.read().strip().split('\n')

    # Split every line into its tab-separated fields and normalize each field.
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Swap the direction of translation if requested.
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs