In [1]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

### Dataset

In [2]:
import os

sentences_file_path = '../Data/Dutch-English-SentencePairs/eng-nld.txt'
# embeddings_file_path = '../Data/glove-6B-WordEmbeddings/glove.6B.100d.txt'

In [3]:
SOS_token = 0
EOS_token = 1


class Lang:
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # Count SOS and EOS

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.word2count[word] = 1
            self.index2word[self.n_words] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

In [4]:
# Preprocess

In [5]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters


def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

In [9]:
def readLangs(lang1, lang2, reverse=False):
    print("Reading lines...")

    # Read the file and split into lines
#     lines = open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8').read().strip().split('\n')
    lines = open(sentences_file_path, encoding='utf-8').read().strip().split('\n')
    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')[0:2]] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [12]:
MAX_LENGTH = 10

eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re ", 
    "hi"
)


def filterPair(p):
#     print(p)
    return len(p[0].split(' ')) < MAX_LENGTH and \
        len(p[1].split(' ')) < MAX_LENGTH and \
        p[1].startswith(eng_prefixes)


def filterPairs(pairs):
    return [pair for pair in pairs if filterPair(pair)]

In [13]:
def prepareData(lang1, lang2, reverse=False):
    input_lang, output_lang, pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(pairs))
    pairs = filterPairs(pairs)
    print("Trimmed to %s sentence pairs" % len(pairs))
    print("Counting words...")
    for pair in pairs:
        input_lang.addSentence(pair[0])
        output_lang.addSentence(pair[1])
    print("Counted words:")
    print(input_lang.name, input_lang.n_words)
    print(output_lang.name, output_lang.n_words)
    return input_lang, output_lang, pairs


input_lang, output_lang, pairs = prepareData('eng', 'nld', True)
print(random.choice(pairs))

Reading lines...
Read 72911 sentence pairs
Trimmed to 4855 sentence pairs
Counting words...
Counted words:
nld 2482
eng 2098
['ik ben maria s vriend .', 'i m mary s boyfriend .']


## Pre-process sentence data

#### Three lists of sentences must be generated for this LSTM network:


1.   input_sentences (consisting of pure Dutch sentences with no processing to be done)

2.   input_translated_sentences (consisting of translated English sentences with a starting token denoted as <begin\> which the encoder LSTM accepts)

3.   output_translated_sentences (consisting of translated English sentences with an ending token denoted as <end\> that is the target output of the neural network)


In [18]:
import re

# Designate a total sentence count for the LSTM to learn from.
SENTENCE_COUNT = 40000

input_sentences = []
input_translated_sentences = []
output_translated_sentences = []

count = 0
for line in open(sentences_file_path, encoding='utf-8'):
    if count < SENTENCE_COUNT:
        if '\t' in line:
            # Removes spaces/punctuation and splits each line based on tab.
            # First value: English phrase, Second value: Dutch phrase
            line = re.sub(r'[^\w\s]','', line)
            line_values = line.rstrip().split('\t')
            input_sentences.append(line_values[1])

            # Tokens are symbols for 'start' and 'end' respectively in Chinese.
            # This ensures no confusion between English words 'start' and 'end'.
            input_translated_sentence = '走 ' + line_values[0]
            output_translated_sentence = line_values[0] + ' 停'

            input_translated_sentences.append(input_translated_sentence)
            output_translated_sentences.append(output_translated_sentence)
            count += 1
    else:
        break

In [19]:
print(input_sentences[0:5])
print(input_translated_sentences[0:5])
print(output_translated_sentences[0:5])

['Lopen', 'Vooruit', 'Hoi', 'Hé', 'Hai']
['走 Go', '走 Go', '走 Hi', '走 Hi', '走 Hi']
['Go 停', 'Go 停', 'Hi 停', 'Hi 停', 'Hi 停']


### Tokenization

##### Input Sentences Tokenizer

In [20]:
from tensorflow.keras.preprocessing.text import Tokenizer

VOCABULARY_SIZE = 25000

# Instantiate a Tokenizer instance which takes in a max vocabulary size.
input_tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
input_tokenizer.fit_on_texts(input_sentences)

# Extract numerical sequences of each input sentence.
input_sequences = input_tokenizer.texts_to_sequences(input_sentences)

# Extract word:index dictionary of each word in the input sentences.
word_index_dict = input_tokenizer.word_index
word_count = len(word_index_dict) + 1

# Find maximum sentence length out of all input sentences.
max_input_length = max(len(seq) for seq in input_sequences)

2023-03-20 13:58:05.408693: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


##### Output Sentences Tokenizer

In [21]:
# Same process as before.
output_tokenizer = Tokenizer(num_words=VOCABULARY_SIZE)
output_tokenizer.fit_on_texts(input_translated_sentences + output_translated_sentences)

# Extract numerical sequences of each input/output translated sentence.
input_translated_sequences = output_tokenizer.texts_to_sequences(input_translated_sentences)
output_translated_sequences = output_tokenizer.texts_to_sequences(output_translated_sentences)

# Retrieve a word:index dictionary of each word in the input/output translated sentences.
translated_word_index_dict = output_tokenizer.word_index
translated_word_count = len(translated_word_index_dict) + 1

# Find maximum input length for padding.
max_output_length = max([len(seq) for seq in output_translated_sequences])

### Padding

In [22]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

encoder_input_sequences = pad_sequences(input_sequences, maxlen=max_input_length)
decoder_input_sequences = pad_sequences(input_translated_sequences, maxlen=max_output_length, padding='post')
decoder_output_sequences = pad_sequences(output_translated_sequences, maxlen=max_output_length, padding='post')

### Embedding

In [23]:
import numpy as np
from numpy import array, asarray, zeros

embeddings_dict = dict()
glove_file = open(embeddings_file_path)

for line in glove_file:
    records = line.split() # Turn into array with word on first position and embeddings as rest of line.
    word = records[0]
    vector_dim = asarray(records[1:], dtype='float32') # Take rest of embeddings out.
    embeddings_dict[word] = vector_dim # Add to embeddings_dict as word:embeddings.