# Neural Machine Translator

In [None]:
# Move into the project folder and list its contents.
# NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
%cd /content/drive/MyDrive/Colab/NMT
!ls

/content/drive/MyDrive/Colab/NMT
NMT


In [None]:
# -p makes the cell safe to re-run: no error if the Data folder already exists.
!mkdir -p Data
!ls

Data  NMT
/content/drive/MyDrive/Colab/NMT/Data


In [None]:
# Work inside the Data folder so that files fetched with relative paths land there.
%cd /content/drive/MyDrive/Colab/NMT/Data

## Loading Dataset and Converting to model input format

### Downloading Wikipedia English-Spanish Dataset from OPUS

In [None]:
# The URL is quoted because it contains '?', which the shell would otherwise treat as a glob character.
# https is used directly; the http address only redirects to it.
!wget -O enes.zip "https://opus.nlpl.eu/download.php?f=Wikipedia/v1.0/moses/en-es.txt.zip"

--2021-04-21 06:11:01--  http://opus.nlpl.eu/download.php?f=Wikipedia/v1.0/moses/en-es.txt.zip
Resolving opus.nlpl.eu (opus.nlpl.eu)... 193.166.25.9
Connecting to opus.nlpl.eu (opus.nlpl.eu)|193.166.25.9|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://opus.nlpl.eu/download.php?f=Wikipedia/v1.0/moses/en-es.txt.zip [following]
--2021-04-21 06:11:02--  https://opus.nlpl.eu/download.php?f=Wikipedia/v1.0/moses/en-es.txt.zip
Connecting to opus.nlpl.eu (opus.nlpl.eu)|193.166.25.9|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://object.pouta.csc.fi/OPUS-Wikipedia/v1.0/moses/en-es.txt.zip [following]
--2021-04-21 06:11:03--  https://object.pouta.csc.fi/OPUS-Wikipedia/v1.0/moses/en-es.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1894458

### Unzipping and checking the line and word counts

In [None]:
# -o overwrites existing files without prompting, so re-running the cell does not block on a question.
!unzip -o enes.zip
# Line and word counts of both sides; the line counts must match for the files to be aligned.
!wc -lw Wikipedia.en-es.en Wikipedia.en-es.es

Archive:  enes.zip
  inflating: Wikipedia.en-es.en      
  inflating: Wikipedia.en-es.es      
  inflating: Wikipedia.en-es.ids     
  inflating: README                  
  1811428  35383771 Wikipedia.en-es.en
  1811428  36136487 Wikipedia.en-es.es
  3622856  71520258 total


In [None]:
# Read the .ids file to learn how many lines the corpus has.
# The list is stored as `idLines` so the builtin `id` is no longer overwritten.
with open('Wikipedia.en-es.ids', 'r', encoding='utf-8') as idFile:
    idLines = idFile.read().split('\n')

# Number of entries after splitting on '\n' (a trailing newline contributes one empty entry).
# Kept under the name `l` because the following cell iterates over range(l).
l = len(idLines)

In [None]:
# Merge the two aligned files into one tab-separated file: "<english>\t<spanish>" per line.
# Both inputs are streamed line by line, so the cell depends on no variable from an earlier
# cell and never holds the whole corpus (or one giant concatenated string) in memory.
# Everything is read and written as UTF-8, matching how dataPrepare reads the result.
with open('Wikipedia.en-es.en', 'r', encoding='utf-8') as enFile, \
        open('Wikipedia.en-es.es', 'r', encoding='utf-8') as esFile, \
        open('Data-en-es.txt', 'w', encoding='utf-8') as dataFile:
    # zip stops at the shorter file, so only lines that have a partner are written.
    for enLine, esLine in zip(enFile, esFile):
        # A tab inside a sentence would create an extra column, so it becomes a space.
        enSent = enLine.rstrip('\n').replace('\t', ' ')
        esSent = esLine.rstrip('\n').replace('\t', ' ')
        dataFile.write(enSent + '\t' + esSent + '\n')

## Importing Libraries and Installing Files

In [None]:
import torch
import torch.cuda
import unicodedata
import string
from random import shuffle
import math

### Checking for GPU

In [None]:
# Record once whether CUDA is available; sentToTensor reads this flag to decide
# whether tensors are moved to the GPU.
GPU = torch.cuda.is_available()
print(GPU)

True


## Preprocessing

### Cleaning the data
*   Unicode string to plain ASCII
*   Converting into lower case and removing punctuation
*   Removing sentence pairs in which either sentence has `maxLength` or more words

In [None]:
def cleanData(s):
    """Normalise a sentence: strip accents, drop punctuation, lower-case.

    The string is NFD-decomposed so that accents become separate combining
    marks (category 'Mn'), which are then discarded. A character is treated
    as punctuation when it is in ``string.punctuation`` or when its Unicode
    general category starts with 'P'; the latter also catches characters such
    as '¿', '¡', '«', '»' and typographic quotes/dashes that the ASCII-only
    list misses.

    Args:
        s: The raw sentence.

    Returns:
        The cleaned, lower-cased sentence. Whitespace is left untouched.
    """
    kept = []
    for c in unicodedata.normalize('NFD', s):
        category = unicodedata.category(c)
        if category == 'Mn':  # Mn stands for Nonspacing_Mark (the detached accents)
            continue
        if c in string.punctuation or category.startswith('P'):
            continue
        kept.append(c)
    return ''.join(kept).lower()

def filterPairs(pairs, maxLength):
    """Keep only the pairs whose two sentences both have fewer than maxLength words.

    Args:
        pairs: Iterable of (source sentence, target sentence) entries.
        maxLength: Exclusive upper bound on the whitespace-separated word count.

    Returns:
        A new list with the surviving pairs, in their original order.
    """
    kept = []
    for candidate in pairs:
        # Source side too long: reject without looking at the target side.
        if len(candidate[0].split()) >= maxLength:
            continue
        if len(candidate[1].split()) >= maxLength:
            continue
        kept.append(candidate)
    return kept

### Building the vocabulary (word ↔ index mappings) for the input and output languages

In [None]:
class Lang:
    """Vocabulary of a single language: word/index lookups and word frequencies.

    Indices 0, 1 and 2 are reserved for the 'SOS', 'EOS' and '<UNK>' tokens.
    Expected call order: countTokens() on every sentence, createCutoff() once,
    then addTokens() on every sentence.
    """

    def __init__(self, language):
        self.langName = language
        # word -> index; the three special tokens are pre-registered
        self.wordToIndex = {'SOS':0, 'EOS':1, '<UNK>':2}
        # word -> number of occurrences seen by countTokens
        self.wordToCount = {}
        # index -> word; inverse of wordToIndex
        self.indexToWord = {0:'SOS', 1:'EOS', 2:'<UNK>'}
        # next free index (equals the number of registered tokens)
        self.vocabSize = 3
        # words must occur strictly more often than this to enter the vocabulary;
        # -1 means every counted word is kept
        self.cutoff = -1

    # Counts the occurrence of each token and saves to wordToCount
    def countTokens(self, sent):
        for word in sent.split():
            self.wordToCount[word] = self.wordToCount.get(word, 0) + 1

    # Adds each unique token into wordToIndex and indexToWord attributes with unique index
    # Replaces words whose frequency is not above the cutoff with <UNK>
    # Returns the rewritten sentence; every token (including the last) is followed by one space
    def addTokens(self, sent):
        newWords = []
        for word in sent.split():
            # A word that was never counted is treated as having frequency 0
            # instead of raising KeyError.
            if self.wordToCount.get(word, 0) > self.cutoff:
                if word not in self.wordToIndex:
                    self.wordToIndex[word] = self.vocabSize
                    self.indexToWord[self.vocabSize] = word
                    self.vocabSize += 1
                newWords.append(word)
            else:
                newWords.append(self.indexToWord[2])

        return ''.join(word + ' ' for word in newWords)

    # Creates a cutoff to leave infrequent words
    # If more than vocSizeMax distinct words were counted, the cutoff becomes the count at
    # position vocSizeMax of the descending frequency list; otherwise it is left unchanged.
    def createCutoff(self, vocSizeMax):
        # sorted() returns the ordered list; list.sort() sorts in place and returns None,
        # which made the len() call below fail.
        frequency = sorted(self.wordToCount.values(), reverse=True)

        if len(frequency) > vocSizeMax:
            self.cutoff = frequency[vocSizeMax]

### Creating Input and Output Language datasets

In [None]:
# def langPrepare(lang1, lang2, path, rev=False):
#     sentToken = open(path, encoding='utf-8').read().split('\n')
#     pairs = [[cleanData(s) for s in t.split('\t')] for t in sentToken]

#     if reverse:
#         pairs = [list(reversed(p)) for p in pairs]
#         inpLang = Lang(lang2)
#         outLang = Lang(lang1)
#     else:
#         inpLang = Lang(lang1)
#         outLang = Lang(lang2)

#     return inpLang, outLang, pairs

In [None]:
def dataPrepare(lang1, lang2, path, vocSizeMax=50000, rev=False, maxLength=0, div=0.9):
    """Load a tab-separated parallel corpus and turn it into train/test pairs.

    Steps: read the file, clean both sides with cleanData, optionally swap the
    translation direction, optionally drop long pairs, count words, derive the
    frequency cutoff, replace infrequent words with '<UNK>', shuffle and split.

    Args:
        lang1: Name of the language in the first column of the file.
        lang2: Name of the language in the second column of the file.
        path: Path of the UTF-8 file with one "<sentence>\\t<sentence>" pair per line.
        vocSizeMax: Passed to Lang.createCutoff for both languages.
        rev: If True, the second column becomes the input language.
        maxLength: If non-zero, pairs are filtered with filterPairs(pairs, maxLength).
        div: Fraction of the shuffled pairs that goes into the training set.

    Returns:
        (inpLang, outLang, trainData, testData), where the two data sets are
        lists of (input sentence, output sentence) tuples.
    """
    # The context manager closes the file even if reading fails.
    with open(path, encoding='utf-8') as dataFile:
        rows = dataFile.read().split('\n')

    pairs = []
    for row in rows:
        fields = row.split('\t')
        # Blank rows (e.g. the one produced by a trailing newline) and rows with a
        # different number of columns cannot form a pair and would break pair[1] below.
        if len(fields) != 2:
            continue
        cleaned = [cleanData(s) for s in fields]
        # A side that is empty after cleaning carries no translation signal.
        if not cleaned[0].strip() or not cleaned[1].strip():
            continue
        pairs.append(cleaned)

    if rev:
        pairs = [list(reversed(p)) for p in pairs]
        inpLang = Lang(lang2)
        outLang = Lang(lang1)
    else:
        inpLang = Lang(lang1)
        outLang = Lang(lang2)

    if maxLength != 0:
        pairs = filterPairs(pairs, maxLength)

    # Frequencies must be complete before the cutoff is derived from them.
    for pair in pairs:
        inpLang.countTokens(pair[0])
        outLang.countTokens(pair[1])

    inpLang.createCutoff(vocSizeMax)
    outLang.createCutoff(vocSizeMax)

    pairs = [(inpLang.addTokens(pair[0]), outLang.addTokens(pair[1])) for pair in pairs]

    shuffle(pairs)

    splitIndex = math.ceil(div * len(pairs))
    trainData = pairs[:splitIndex]
    testData = pairs[splitIndex:]

    print(f"Train: {len(trainData)} Test: {len(testData)}")
    print("Counted Words -> Trimmed Vocabulary Sizes (w/ EOS and SOS tags):")
    print("%s, %s -> %s" % (inpLang.langName, len(inpLang.wordToCount),
                            inpLang.vocabSize))
    print("%s, %s -> %s" % (outLang.langName, len(outLang.wordToCount),
                            outLang.vocabSize))

    return inpLang, outLang, trainData, testData

### Converting sentences to index tensors and back

In [None]:
def sentToTensor(lang, sent):
    """Encode a sentence as a 1-D LongTensor of vocabulary indices.

    Words missing from lang.wordToIndex are mapped to the '<UNK>' index, and
    index 1 (the EOS token) is appended at the end. The tensor is moved to the
    GPU when the module-level GPU flag is true.
    """
    vocab = lang.wordToIndex
    indices = []
    for token in sent.split():
        if token not in vocab:
            token = "<UNK>"
        indices.append(vocab[token])
    indices.append(1)  # EOS Token

    # Build the long tensor and flatten it to one dimension
    encoded = torch.LongTensor(indices).view(-1)
    return encoded.cuda() if GPU else encoded

def tensorToSent(lang, tens):
    """Decode a tensor of vocabulary indices back into a space-separated sentence.

    Args:
        lang: Object whose indexToWord dict maps each index to its word.
        tens: Tensor of indices; each element is looked up via .item().

    Returns:
        The words joined by single spaces (special tokens are not removed).
    """
    # The parameter is `tens`; the previous body read an undefined name `tensor`.
    return ' '.join(lang.indexToWord[i.item()] for i in tens.data)

In [None]:
def pairToTensor(lang1, lang2, pair):
    """Encode both sentences of a pair with sentToTensor.

    pair[0] is encoded with lang1 and pair[1] with lang2; the two tensors are
    returned as a tuple in that order.
    """
    firstTensor = sentToTensor(lang1, pair[0])
    secondTensor = sentToTensor(lang2, pair[1])
    return firstTensor, secondTensor

### Creating batches to train using Mini Batch Gradient Descent

In [None]:
def miniBatch(pairs, lang1, lang2, size):
    """Shuffle the pairs and group them into batches of encoded tensors.

    Only full batches are produced: the last len(pairs) % size pairs of the
    shuffled order are left out.

    Args:
        pairs: Sequence of (input sentence, output sentence) pairs.
        lang1: Vocabulary used to encode the first sentence of each pair.
        lang2: Vocabulary used to encode the second sentence of each pair.
        size: Number of pairs per batch; must be positive.

    Returns:
        (batchList, longElemList, nBatches) where batchList[i] is a tuple
        (list of input tensors, list of output tensors) and longElemList[i] is
        (longest input length, longest output length) within batch i.

    Raises:
        ValueError: If size is not positive.
    """
    if size <= 0:
        raise ValueError("size must be a positive integer")

    # Shuffle a copy of the argument (not an unrelated global) so the caller's
    # list keeps its order.
    shuffled = list(pairs)
    shuffle(shuffled)

    nBatches = len(shuffled) // size

    # Two independent lists; chained assignment would make both names refer to one list.
    batchList = []
    longElemList = []

    for i in range(nBatches):
        chunk = shuffled[i * size:(i + 1) * size]
        encoded = [pairToTensor(lang1, lang2, pair) for pair in chunk]

        vInp = [inpTensor for inpTensor, _ in encoded]
        vOut = [outTensor for _, outTensor in encoded]

        lInp = max(len(t) for t in vInp)
        lOut = max(len(t) for t in vOut)

        batchList.append((vInp, vOut))
        longElemList.append((lInp, lOut))

    return batchList, longElemList, nBatches

### Padding with the `EOS` token so that every sequence in a batch has the same length

In [None]:
def padding(batch):
    """Pad the input and output sequences of one batch to a common length.

    batch[0] and batch[1] are each passed to torch.nn.utils.rnn.pad_sequence
    with padding value 1 (the index Lang assigns to 'EOS'). The two padded
    tensors are returned as a tuple (inputs, outputs).
    """
    padSequence = torch.nn.utils.rnn.pad_sequence
    paddedInp, paddedOut = (
        padSequence(sequences, padding_value=1) for sequences in (batch[0], batch[1])
    )
    return (paddedInp, paddedOut)
    return (inp, out)