In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np

# from torchtext.datasets import TranslationDataset, Multi30k
# from torchtext.data import Field, BucketIterator

# import spacy

import pickle
import random
import math
import os
import time
import nltk

In [3]:
def _read_lines(path):
    """Return every line of the UTF-8 text file at ``path`` (line endings kept)."""
    with open(path, "r", encoding="utf8") as handle:
        return handle.readlines()


# Parallel corpora: line k of the .en file pairs with line k of the .de file.
train_en = _read_lines("data/train.en")
train_de = _read_lines("data/train.de")
dev_en = _read_lines("data/dev.en")
dev_de = _read_lines("data/dev.de")

# Quick sanity check: both training files should report the same line count.
print(len(train_de))
print(len(train_en))

29001
29001


In [4]:
# Fetch the NLTK "punkt" tokenizer data (no-op when it is already installed).
nltk.download('punkt')

# Special tokens: sentence start, sentence end, padding, out-of-vocabulary.
start, end, pad, unk = "<sos>", "<eos>", "<pad>", "<unk>"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\evan_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Use the natural language toolkit to tokenize the words

In [5]:
start = "<sos>"
end = "<eos>"
pad = "<pad>"


def tokenize_corpus(lines):
    """Tokenise a corpus and report the length of its longest sentence.

    Each raw sentence is lower-cased, split with ``nltk.word_tokenize`` and
    wrapped in the ``start`` / ``end`` markers.

    Parameters
    ----------
    lines : list
        Raw sentences (``str``). Entries that are not strings are assumed to
        be token lists produced by an earlier run of this cell and are kept
        unchanged, so re-running the cell does not fail or add a second pair
        of markers.

    Returns
    -------
    tuple
        ``(sentences, max_length)`` where ``sentences`` is a list of token
        lists and ``max_length`` is the longest token count including the
        markers (``-1`` for an empty corpus).
    """
    sentences = []
    for line in lines:
        if isinstance(line, str):
            tokens = [start] + nltk.word_tokenize(line.lower()) + [end]
        else:
            tokens = line
        sentences.append(tokens)
    max_length = max((len(tokens) for tokens in sentences), default=-1)
    return sentences, max_length


# Each corpus name is rebound to its tokenised form; later cells rely on that.
train_en, max_length_en = tokenize_corpus(train_en)
train_de, max_length_de = tokenize_corpus(train_de)
dev_en, max_length_en_dev = tokenize_corpus(dev_en)
dev_de, max_length_de_dev = tokenize_corpus(dev_de)

# =============== WARNING ===============

## The following code cell removes the longest sentence pairs from the training set, one at a time, until the longest remaining English sentence is no longer than a threshold. The threshold is the value on the right-hand side of the inequality in the `while` loop condition.

### Removing the longest sentences from the training dataset to decrease the dimensionality of all the sentences.

In [5]:
# DEPRECATED METHOD

# max_idx, en_max_val = max(enumerate(train_en), key=lambda x: len(x[1]))
# _, de_max_val = max(enumerate(train_de), key=lambda x: len(x[1]))

# del_count = 0
# print(max_idx, len(en_max_val), len(de_max_val))

# while len(en_max_val) > 200:
#     del train_en[max_idx]
#     del train_de[max_idx]
    
#     max_idx, en_max_val = max(enumerate(train_en), key=lambda x: len(x[1]))
#     _, de_max_val = max(enumerate(train_de), key=lambda x: len(x[1]))
    
#     print(max_idx, len(en_max_val), len(de_max_val))
    
#     del_count += 1
    
# max_length_en = len(en_max_val)
# max_length_de = len(de_max_val)

In [6]:
# print("Deleted:", del_count)

# (label, value) pairs reported below: corpus sizes, then the longest
# tokenised sentence per language and split.
corpus_stats = (
    ("Number of Sentences (training):", len(train_en)),
    ("Number of Sentences (dev):", len(dev_en)),
    ("Max length English (training):", max_length_en),
    ("Max length German (training):", max_length_de),
    ("Max length English (dev):", max_length_en_dev),
    ("Max length German (dev):", max_length_de_dev),
)

for label, value in corpus_stats:
    print(label, value)

Number of Sentences (training): 29001
Number of Sentences (dev): 1015
Max length English (training): 42
Max length German (training): 46
Max length English (dev): 32
Max length German (dev): 35


## Choose how large you want the training and validation sets to be
## Don't run this cell if you want to use the entire dataset

In [6]:
# Upper bounds on how many sentence pairs to keep. A value larger than the
# corpus keeps every pair (the slice below simply returns the whole list).
TRAIN_SIZE = 100000
VAL_SIZE = 3500

# Fixed seed so the same subset (and order) is selected on every run.
SUBSAMPLE_SEED = 0

# Private generator: seeding it does not disturb the global `random` state.
_subsample_rng = random.Random(SUBSAMPLE_SEED)

# Draw one shuffled index list per split and apply it to BOTH languages so
# the English/German sentences stay aligned.
idxs_train = list(range(len(train_en)))
_subsample_rng.shuffle(idxs_train)
idxs_train = idxs_train[0:TRAIN_SIZE]

train_en = [train_en[i] for i in idxs_train]
train_de = [train_de[i] for i in idxs_train]

idxs_val = list(range(len(dev_en)))
_subsample_rng.shuffle(idxs_val)
idxs_val = idxs_val[0:VAL_SIZE]

dev_en = [dev_en[i] for i in idxs_val]
dev_de = [dev_de[i] for i in idxs_val]

## The following cell gets the frequency of words in the dataset

In [7]:
# Starting counts for the special tokens. No padding exists in the data yet,
# so '<pad>' is given a large count (and '<unk>' a count of 2) purely to keep
# them from being classed as uncommon later on.
_SPECIAL_TOKEN_COUNTS = (('<pad>', 100), ('<unk>', 2), ('<sos>', 0), ('<eos>', 0))

en_freq = dict(_SPECIAL_TOKEN_COUNTS)
de_freq = dict(_SPECIAL_TOKEN_COUNTS)

# Tally every token of the training and dev sets into its language's table.
for corpus, counts in ((train_en, en_freq),
                       (train_de, de_freq),
                       (dev_en, en_freq),
                       (dev_de, de_freq)):
    for sentence in corpus:
        for token in sentence:
            counts[token] = counts.get(token, 0) + 1


## Make a list of uncommon words

In [8]:
# Map each word that occurs fewer than two times to its position among the
# uncommon words (in frequency-table order).
en_uncommon = {
    word: position
    for position, word in enumerate(w for w, count in en_freq.items() if count < 2)
}
de_uncommon = {
    word: position
    for position, word in enumerate(w for w, count in de_freq.items() if count < 2)
}

# The unknown token itself must never be treated as an uncommon word.
for rare_words in (en_uncommon, de_uncommon):
    if '<unk>' in rare_words:
        print("deleted")
        del rare_words['<unk>']

## Replace the uncommon words with unknown tokens

In [9]:
def _new_vocab():
    """Return a fresh word -> index table holding only the special tokens."""
    return {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}


def encode_corpus(corpus, vocab, uncommon):
    """Convert tokenised sentences into lists of vocabulary indices.

    Parameters
    ----------
    corpus : list
        Sentences, each a list of token strings.
    vocab : dict
        Word -> index table. It is extended in place: a word that is not
        uncommon and not yet present receives the next free index.
    uncommon : dict
        Words to be replaced by the ``'<unk>'`` index. Only membership is
        tested and the mapping is left untouched, so later cells can still
        report on it.

    Returns
    -------
    list
        One list of integer indices per sentence, in corpus order.
    """
    unk_idx = vocab['<unk>']
    encoded = []
    for sent in corpus:
        idxes = []
        for w in sent:
            if w in uncommon:
                idxes.append(unk_idx)
                continue
            if w not in vocab:
                vocab[w] = len(vocab)
            idxes.append(vocab[w])
        encoded.append(idxes)
    return encoded


en_vocab = _new_vocab()
de_vocab = _new_vocab()

# Training data is encoded before the dev data of the same language, so the
# indices handed out to new words follow that order.
en_inputs = encode_corpus(train_en, en_vocab, en_uncommon)
del train_en  # token lists are no longer needed once encoded

en_val = encode_corpus(dev_en, en_vocab, en_uncommon)
del dev_en

de_inputs = encode_corpus(train_de, de_vocab, de_uncommon)
del train_de

de_val = encode_corpus(dev_de, de_vocab, de_uncommon)
del dev_de

In [12]:
# vocab={}
# vocab['<pad>'] = 0
# vocab['<unk>'] = 1 
# vocab['<sos>'] = 2
# vocab['<eos>'] = 3

# uncommon = {'then':0, 'thing':1}

# sample = ['<sos>', 'the', 'brown', 'cow', 'then', 'jumped', '<eos>']

# sidx = []

# for i, w in enumerate(sample):
#     if w in uncommon:
#         sample[i] = '<unk>'
#         del uncommon[w]
#     elif w not in vocab:
#         vocab[w] = len(vocab)
#     sidx.append(vocab[sample[i]])  
    
# print(vocab)
# print(uncommon)
# print(sample)
# print(sidx)

{'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3, 'the': 4, 'brown': 5, 'cow': 6, 'jumped': 7}
{'thing': 1}
['<sos>', 'the', 'brown', 'cow', '<unk>', 'jumped', '<eos>']
[2, 4, 5, 6, 1, 7, 3]


## Print examples of words that are infrequently used

In [32]:
# Report how many words are currently recorded as uncommon in each language.
print(len(en_uncommon), "of the", len(en_freq), "english words are only used once")
print(len(de_uncommon), "of the", len(de_freq), "german words are only used once")

en_idx_to_word = dict((v,k) for k,v in en_vocab.items())
de_idx_to_word = dict((v,k) for k,v in de_vocab.items())

# Number of example words to show per language.
NUM_EXAMPLES = 10

# en_uncommon / de_uncommon are keyed by the words themselves, so examples are
# drawn straight from the keys. Capping the sample size at the number of
# available words keeps the cell from failing when fewer than NUM_EXAMPLES
# (or none) are present.
for word in random.sample(list(en_uncommon), min(NUM_EXAMPLES, len(en_uncommon))):
    print(word)

for word in random.sample(list(de_uncommon), min(NUM_EXAMPLES, len(de_uncommon))):
    print(word)

23267 of the 58510 english words are only used once
71615 of the 129363 german words are only used once
cmos
super-cooperators
dnc
weighting
leat
herblock
you.
partible
trifling
cycads
kommunikationsgeräte
endozytose
rosabeth
unerschrockene
falschlaufenden
konsummiert
steuersätze
ghul
wirtschaftskräfte
verteilungsseite


## Remove infrequent words from the dictionary

In [8]:
# en_idx_to_word = dict((v,k) for k,v in en_vocab.items())
# de_idx_to_word = dict((v,k) for k,v in de_vocab.items())       

# print("english training")
# for sent in en_inputs:
#     for i, idx in enumerate(sent):
#         if idx in en_uncommon:
#             sent[i] = 1
#             del en_idx_to_word[idx]
        
# print("english validation")
# for sent in en_val:
#     for i, idx in enumerate(sent):
#         if idx in en_uncommon:
#             sent[i] = 1
#             del en_idx_to_word[idx]

# print("german training")
# for sent in de_inputs:
#     for i, idx in enumerate(sent):
#         if idx in de_uncommon:
#             sent[i] = 1
#             del de_idx_to_word[idx]
        
# print("german validation")
# for sent in de_val:
#     for i, idx in enumerate(sent):
#         if idx in de_uncommon:
#             sent[i] = 1
#             del de_idx_to_word[idx]            


english training
english validation
german training


KeyboardInterrupt: 

# ========== Warning ==========

## The following code cell sorts the training and validation sets by German sentence length (shortest to longest)

In [10]:
# Sentence lengths of the German side drive the ordering of both languages.
inplengths = [len(sentence) for sentence in de_inputs]
vallengths = [len(sentence) for sentence in de_val]

# Positions that arrange each split from shortest to longest German sentence.
sorted_indices_inp = np.argsort(inplengths)
sorted_indices_val = np.argsort(vallengths)

# Apply the same permutation to both languages so sentence pairs stay aligned.
de_inputs_sorted = [de_inputs[position] for position in sorted_indices_inp]
en_inputs_sorted = [en_inputs[position] for position in sorted_indices_inp]
de_val_sorted = [de_val[position] for position in sorted_indices_val]
en_val_sorted = [en_val[position] for position in sorted_indices_val]

de_inputs, en_inputs = de_inputs_sorted, en_inputs_sorted
de_val, en_val = de_val_sorted, en_val_sorted

In [11]:
# The last entry of de_inputs is reported as the longest sentence.
longest_de_length = len(de_inputs[-1])
print("longest sentence", longest_de_length)

longest sentence 768


## Check to see if the sorted data is correct.


**en_inputs[112862] and de_inputs[112862] should give:**

- <sos> i need your surname . symmetrical objects generally -- spell it for me . <eos> 
    
- <sos> ich brauche ihren nachnamen . symmetrische objekte haben grundsätzlich – bitte buchstabieren sie ihn für mich . <eos> 
    
**en_val[3021] and de_val[3021] should give:**

- <sos> but where would be the nearest aed to help this patient ? <eos> 
    
- <sos> aber wo wäre der nächste aed , um diesem patienten zu helfen ? <eos>     
    

In [10]:
# DEPRECATED EXAMPLE

# en_idx_to_word = dict((v,k) for k,v in en_vocab.items())
# de_idx_to_word = dict((v,k) for k,v in de_vocab.items())

# for i in en_inputs[112862]:
#     print(en_idx_to_word[i], end=' ')

# print("")

# for j in de_inputs[112862]:
#     print(de_idx_to_word[j], end=' ')
    
# print("")
    
# for i in en_val[3021]:
#     print(en_idx_to_word[i], end=' ')

# print("")

# for j in de_val[3021]:
#     print(de_idx_to_word[j], end=' ')
    

# ========== Warning ==========

## The following code cell adds padding to all of the sentences in both the training and validation datasets

In [11]:
# DEPRECATED METHOD

# for i, sentence in enumerate(en_inputs):
#     diff = max_length_en - len(sentence)
#     if diff == 0:
#         continue
#     pad_array = [0]*diff
#     sentence = sentence + pad_array
#     en_inputs[i] = sentence

# for i, sentence in enumerate(de_inputs):
#     diff = max_length_de - len(sentence)
#     if diff == 0:
#         continue
#     pad_array = [0]*diff
#     sentence = sentence + pad_array
#     de_inputs[i] = sentence
    
# for i, sentence in enumerate(en_val):
#     diff = max_length_en_dev - len(sentence)
#     if diff == 0:
#         continue
#     pad_array = [0]*diff
#     sentence = sentence + pad_array
#     en_val[i] = sentence

# for i, sentence in enumerate(de_val):
#     diff = max_length_de_dev - len(sentence)
#     if diff == 0:
#         continue
#     pad_array = [0]*diff
#     sentence = sentence + pad_array
#     de_val[i] = sentence

# ========== Warning ==========

## The following code cell shrinks the number of training sentences from 196k to 50k

In [12]:
# DEPRECATED METHOD
# 
# en_inputs = en_inputs[:50000]
# de_inputs = de_inputs[:50000]
# print(len(en_inputs))

## The following block saves the data that has been processed

In [10]:
# Output locations for the processed corpora.
EN_PICKLE_PATH = 'data/processed/english_unk_sorted_100k.pickle'
DE_PICKLE_PATH = 'data/processed/german_unk_sorted_100k.pickle'

# Create the output directory if needed; open(..., 'wb') does not create
# missing parent directories and would raise FileNotFoundError otherwise.
for _pickle_path in (EN_PICKLE_PATH, DE_PICKLE_PATH):
    os.makedirs(os.path.dirname(_pickle_path), exist_ok=True)

# Inverse vocabularies (index -> word).
en_idx_to_word = {idx: word for word, idx in en_vocab.items()}
de_idx_to_word = {idx: word for word, idx in de_vocab.items()}

# One bundle per language: inverse vocabulary plus the index-encoded
# training and dev sentences.
en_iwslt = {
    'idx2word': en_idx_to_word,
    'train': en_inputs,
    'dev': en_val,
}
de_iwslt = {
    'idx2word': de_idx_to_word,
    'train': de_inputs,
    'dev': de_val,
}

with open(EN_PICKLE_PATH, 'wb') as handle:
    pickle.dump(en_iwslt, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(DE_PICKLE_PATH, 'wb') as handle:
    pickle.dump(de_iwslt, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("German vocab length", len(de_iwslt['idx2word']))
print("English vocab length", len(en_iwslt['idx2word']))

German vocab length 8025
English vocab length 6028


## Examples of how to access some of the data

In [14]:
# An encoded German dev sentence, the two vocabulary sizes, and the index
# assigned to one English word.
print(de_val[44])
print(len(en_vocab))
print(len(de_vocab))
print(en_vocab['like'])

# Decode the first German dev sentence back into words.
for encoded_sentence in de_val[:1]:
    for word_idx in encoded_sentence:
        print(de_idx_to_word[word_idx], end=" ")
    print("\n")
    print("\n")


[2, 13736, 1101, 12731, 5770, 13990, 14, 3]
10386
14294
233
<sos> danke . <eos> 



## Check to make sure that the data has been saved correctly (make sure that the filenames match). If you get two Trues, the test is passed.

In [13]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


english = _load_pickle('data/processed/english_unk_sorted_100k.pickle')
german = _load_pickle('data/processed/german_unk_sorted_100k.pickle')

# Each comparison prints True when the reloaded bundle equals the in-memory one.
print(en_iwslt == english)
print(de_iwslt == german)

True
True


In [13]:
# First German training sentence from the reloaded pickle (a list of token indices).
print(german['train'][0])

[2, 865, 3]


In [14]:
# First German training sentence still held in memory, for comparison with the reloaded copy.
de_inputs[0]

[2, 865, 3]