## Language Translator

In [1]:
import nltk

From `nltk` we can download translated sentences between different languages. You can see the example between **English and French** below but feel free to try different combination as well.

In [2]:
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to
[nltk_data]     C:\Users\Capt.Otaku\AppData\Roaming\nltk_data...
[nltk_data]   Package comtrans is already up-to-date!


True

In [3]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-en-fr.txt')[0])

<AlignedSent: 'Resumption of the se...' -> 'Reprise de la sessio...'>


In [4]:
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

In [5]:
# 22 // 9

## Build
[Guide 1](https://hub.packtpub.com/create-an-rnn-based-python-machine-translation-system-tutorial/), 
[Guide 2](https://towardsdatascience.com/language-translation-with-rnns-d84d43b40571),
[Guide from mentor](https://www.analyticsvidhya.com/blog/2019/01/neural-machine-translation-keras/)

#### 1. Data Extraction

In [6]:
from nltk.corpus import comtrans
import string

In [7]:
# define extract function
def extract_corpus(translated_pairs='alignment-en-fr.txt'):
    als = comtrans.aligned_sents(translated_pairs)
    sent1 = [sent.words for sent in als]
    sent2 = [sent.mots for sent in als]
    return sent1, sent2

In [8]:
# extract and save corpus
source, target = extract_corpus()

#### 2. Data Cleaning

In [9]:
# convert to lowercase
def to_lower(sentence):
    lower = [word.lower() for word in sentence]
    return lower

# remove punctuations
def remove_punc(sentence):
    table = str.maketrans('', '', string.punctuation)
    stripped = [word.translate(table) for word in sentence]
    return stripped

# remove empty strings
def remove_empty(sentence):
    full = [word for word in sentence if len(word) >= 1]
    return full

# combine all cleaning methods above
def preprocess(sentence):
    prep = to_lower(sentence)
    prep = remove_punc(prep)
    prep = remove_empty(prep)
    return prep

In [10]:
# preprocess
clean_source = []
clean_target = []

for sent in source:
    clean_source.append(preprocess(sent))
for sent in target:
    clean_target.append(preprocess(sent))

In [11]:
# extract desire length indices
def keep_index(sentences, maxlen=20):
    keep = []
    for index, sentence in enumerate(sentences):
        if len(sentence) <= 20:
            keep.append(index)
    return keep

# filter out long sentences to lower computation level
def cutoff(sentences, indices):
    cut = []
    for index in indices:
        cut.append(sentences[index])
    return cut

In [12]:
# limit sentence length to 20 words
keep_id = keep_index(clean_source)
cut_source = cutoff(clean_source, keep_id)
cut_target = cutoff(clean_target, keep_id)

keep_id = keep_index(cut_target)
cut_source = cutoff(cut_source, keep_id)
cut_target = cutoff(cut_target, keep_id)

#### 3. Data Transformation

In [13]:
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer

In [14]:
def tokenize(sentences):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    return tokenizer.texts_to_sequences(sentences), tokenizer

def padding(source):
    return preprocessing.sequence.pad_sequences(source) # need maxlen?

In [15]:
# transform words to unique numeric ID
source_tokenized, source_tokenizer = tokenize(cut_source)
target_tokenized, target_tokenizer = tokenize(cut_target)

# add padding
pad_source = padding(source_tokenized)
pad_target = padding(target_tokenized)

In [32]:
len(target_tokenizer.word_index)

13988

#### Modeling (RNN)

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Dense, LSTM

In [17]:
# reshape for LSTM input
input_tokens = pad_source.reshape(*pad_source.shape, 1)
output_tokens = pad_target.reshape(*pad_target.shape, 1)

In [45]:
input_tokens.shape

(16834, 20, 1)

In [82]:
model = Sequential()
# model.add(Embedding(input_dim=len(target_tokenizer.word_index), output_dim=20))
model.add(LSTM(256, return_sequences=True, input_shape=input_tokens.shape[1:]))
model.add(Dense(128, activation='relu'))
model.add(Dense(1, activation='softmax'))
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

In [83]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 20, 256)           264192    
_________________________________________________________________
dense (Dense)                (None, 20, 128)           32896     
_________________________________________________________________
dense_1 (Dense)              (None, 20, 1)             129       
Total params: 297,217
Trainable params: 297,217
Non-trainable params: 0
_________________________________________________________________


#### Testing

In [88]:
history = model.fit(input_tokens, output_tokens, 
                    epochs=10, 
                    batch_size=1024,
                    validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [81]:
clear_session()

#### Convert RNN Results back to Translated Text

In [26]:
def logits_to_text(logits, tokenizer):
    """
    Turn logits from a neural network into text using the tokenizer
    :param logits: Logits from a neural network
    :param tokenizer: Keras Tokenizer fit on the labels
    :return: String that represents the text of the logits
    """
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

print('`logits_to_text` function loaded.')

`logits_to_text` function loaded.


In [60]:
def get_word(n, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == n:
            return word
    return None

In [61]:
def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    integers = [argmax(vector) for vector in prediction]
    target = list()
    for i in integers:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

In [24]:
predict_sequence()

TypeError: predict_sequence() missing 3 required positional arguments: 'model', 'tokenizer', and 'source'

In [None]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_sent2)
tokenizer.word_index

In [None]:
def id_to_word(RNN_result, tokenizer):
    pass

In [None]:
# online model
# def define_model(in_vocab, out_vocab, in_timesteps,out_timesteps, units):
#     model = Sequential()
#     model.add(Embedding(in_vocab, units, input_length=in_timesteps, mask_zero=True))
#     model.add(LSTM(units))
#     model.add(RepeatVector(out_timesteps))
#     model.add(LSTM(units, return_sequences=True))
#     model.add(Dense(out_vocab, activation='softmax'))
#     return model

In [None]:
# model = define_model(14662, 14662, 20, 20, 512)

In [None]:
# model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [None]:
# history = model.fit(pad_token1, pad_token2.reshape(pad_token2.shape[0], pad_token2.shape[1], 1),
#                     epochs=30, batch_size=512, verbose=1)

In [None]:
# preds = model.predict_classes(testX.reshape((testX.shape[0],testX.shape[1])))