In [1]:
# Mount Google Drive so that files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from __future__ import print_function

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Embedding
import numpy as np 
import pandas as pd
import re
import string

In [127]:
# Training configuration shared by the cells that build and fit the model.
batch_size = 32  # Batch size for training.
epochs = 100  # Number of epochs to train for.
latent_dim = 50  # Size of the LSTM state; also used as the width of both embedding layers.
num_samples = 10000  # Number of samples to train on.

In [80]:
# Load the sentence pairs. Column names are supplied explicitly:
# English sentence, Polish sentence, and the attribution/licence text.
lines = pd.read_table('/content/drive/MyDrive/Colab Notebooks/deep_learning_studia/rnn/pol.txt', names=['eng', 'pol', 'contributor'])

In [81]:
# Preview the raw table as loaded (pandas shortens the display of long frames).
lines

Unnamed: 0,eng,pol,contributor
0,Go.,Idź.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Hi.,Cześć.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...
2,Run!,Uciekaj!,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
3,Run.,Biegnij.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
4,Run.,Uciekaj.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
...,...,...,...
40460,No matter how much you try to convince people ...,"Nieważne, jak bardzo usiłujesz przekonać ludzi...",CC-BY 2.0 (France) Attribution: tatoeba.org #9...
40461,A child who is a native speaker usually knows ...,Dziecko zwykle wie o swoim języku ojczystym rz...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
40462,Since there are usually multiple websites on a...,Zwykle jest wiele stron internetowych na każdy...,CC-BY 2.0 (France) Attribution: tatoeba.org #9...
40463,"If you want to sound like a native speaker, yo...","Jeśli chcesz mówić jak rodzimy użytkownik, mus...",CC-BY 2.0 (France) Attribution: tatoeba.org #9...


In [82]:
# The attribution text is not needed for training; keep only the two sentence columns.
lines = lines.drop('contributor', axis=1)

In [83]:
# Keep only the first `num_samples` sentence pairs (the configured value, not a repeated literal).
# `.copy()` detaches the subset from the full table, so the column assignments made while
# cleaning the text operate on an independent frame rather than on a slice of the original.
lines = lines.iloc[:num_samples].copy()

In [84]:
# Preview the reduced table (only the `eng` and `pol` columns remain).
lines

Unnamed: 0,eng,pol
0,Go.,Idź.
1,Hi.,Cześć.
2,Run!,Uciekaj!
3,Run.,Biegnij.
4,Run.,Uciekaj.
...,...,...
9995,Where is your school?,Gdzie jest twoja szkoła?
9996,Where's my breakfast?,Gdzie jest moje śniadanie?
9997,Where's the bathroom?,Gdzie jest toaleta?
9998,Where's the bus stop?,Gdzie znajduje się przystanek autobusowy?


In [85]:
# Lowercase characters
lines.eng=lines.eng.apply(lambda x: x.lower())
lines.pol=lines.pol.apply(lambda x: x.lower())

In [86]:
# Take the length as 50
lines.eng=lines.eng.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' COMMA', x))
lines.pol=lines.pol.apply(lambda x: re.sub("'", '', x)).apply(lambda x: re.sub(",", ' COMMA', x))

In [87]:
# Remove every ASCII punctuation character from both language columns.
exclude = set(string.punctuation)


def _without_punctuation(text):
    """Return `text` with all characters contained in `exclude` removed."""
    return ''.join(filter(lambda ch: ch not in exclude, text))


lines.eng = lines.eng.map(_without_punctuation)
lines.pol = lines.pol.map(_without_punctuation)

In [88]:
# Remove digits
remove_digits = str.maketrans('', '', string.digits)
lines.eng=lines.eng.apply(lambda x: x.translate(remove_digits))
lines.pol=lines.pol.apply(lambda x: x.translate(remove_digits))

In [89]:
# Spot-check the cleaned text: lowercase, no punctuation, no digits.
lines.head()

Unnamed: 0,eng,pol
0,go,idź
1,hi,cześć
2,run,uciekaj
3,run,biegnij
4,run,uciekaj


In [90]:
# Add start and end tokens to target sequences
lines.pol = lines.pol.apply(lambda x : 'START_ '+ x + ' _END')

In [91]:
# Vocabulary of English
all_eng_words=set()
for eng in lines.eng:
    for word in eng.split():
        if word not in all_eng_words:
            all_eng_words.add(word)

In [92]:
# Vocabulary of Polish
all_pol_words=set()
for pol in lines.pol:
    for word in pol.split():
        if word not in all_pol_words:
            all_pol_words.add(word)

In [93]:
# Vocabulary sizes as (English, Polish).
len(all_eng_words), len(all_pol_words)

(3119, 6139)

In [94]:
# Max Length of source sequence
lenght_list=[]
for l in lines.eng:
    lenght_list.append(len(l.split(' ')))
max_length_eng = np.max(lenght_list)

In [95]:
# Max Length of target sequence
lenght_list=[]
for l in lines.pol:
    lenght_list.append(len(l.split(' ')))
max_length_pol = np.max(lenght_list)

In [96]:
# Alphabetically ordered vocabularies; the ordering makes the word -> id mapping deterministic.
input_words = sorted(all_eng_words)
target_words = sorted(all_pol_words)

In [97]:
# Calculate Vocab size for both source and target
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_pol_words)
num_decoder_tokens += 1 # For zero padding

In [98]:
# Create word to token dictionary for both source and target
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])

In [99]:
# Number of elements in a (sentences x 16 timesteps x target vocabulary) one-hot tensor;
# a quick check of how much memory such a tensor would need.
len(lines.pol)*16*num_decoder_tokens

982400000

In [100]:
# Look at ten random cleaned pairs. No random_state is passed, so the rows differ on every run.
lines.sample(10)

Unnamed: 0,eng,pol
1850,bring it back,START_ oddaj to _END
5032,you were invited,START_ zostałeś zaproszony _END
9722,time was running out,START_ czas się kończył _END
1529,is that snow,START_ czy to jest śnieg _END
3306,how is your dad,START_ jak się miewa twój tata _END
1210,tom cheered,START_ tom wiwatował _END
8629,toms eyes were red,START_ tom miał zaczerwienione oczy _END
124,i smoke,START_ palę _END
4940,what did she say,START_ co ona powiedziała _END
4459,i prefer reading,START_ wolę czytać _END


In [101]:
# Allocate the zero-padded training arrays.
# The widths come from the measured maximum sentence lengths instead of hard-coded numbers,
# so the arrays always fit the longest sentence whatever subset of the data is loaded.
#   encoder_input_data : (sentences, max source length)            source word ids
#   decoder_input_data : (sentences, max target length)            target word ids
#   decoder_target_data: (sentences, max target length, vocabulary) one-hot targets
encoder_input_data = np.zeros((len(lines.eng), max_length_eng), dtype='float32')
decoder_input_data = np.zeros((len(lines.pol), max_length_pol), dtype='float32')
decoder_target_data = np.zeros((len(lines.pol), max_length_pol, num_decoder_tokens), dtype='float32')

In [102]:
# Fill the padded arrays with word ids; positions past the end of a sentence stay 0.
for row, (source_sentence, target_sentence) in enumerate(zip(lines.eng, lines.pol)):
    source_ids = [input_token_index[token] for token in source_sentence.split()]
    encoder_input_data[row, :len(source_ids)] = source_ids

    target_ids = [target_token_index[token] for token in target_sentence.split()]
    decoder_input_data[row, :len(target_ids)] = target_ids

    # The training target is the decoder input shifted one step to the left:
    # it leaves out the first (start) token and is stored one-hot.
    for step, token_id in enumerate(target_ids[1:]):
        decoder_target_data[row, step, token_id] = 1.

In [103]:
# Inspect the padded source id matrix (0 marks padding).
encoder_input_data

array([[1149.,    0.,    0., ...,    0.,    0.,    0.],
       [1276.,    0.,    0., ...,    0.,    0.,    0.],
       [2258.,    0.,    0., ...,    0.,    0.,    0.],
       ...,
       [3017., 2721.,  235., ...,    0.,    0.,    0.],
       [3017., 2721.,  390., ...,    0.,    0.,    0.],
       [3017., 2721.,  390., ...,    0.,    0.,    0.]], dtype=float32)

In [132]:
# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb =  Embedding(num_encoder_tokens+1, latent_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None,))
dec_emb_layer = Embedding(num_decoder_tokens+1, latent_dim)
dec_emb = dec_emb_layer(decoder_inputs)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=encoder_states)

# Use a softmax to generate a probability distribution over the target vocabulary for each time step
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turnA
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])

In [133]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "model_20"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_35 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
input_36 (InputLayer)           [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding_22 (Embedding)        (None, None, 50)     156000      input_35[0][0]                   
__________________________________________________________________________________________________
embedding_23 (Embedding)        (None, None, 50)     307050      input_36[0][0]                   
___________________________________________________________________________________________

In [134]:
# Train the model: the decoder is fed the reference translation and learns to predict the
# same sequence shifted by one step.
# NOTE(review): validation_split holds out the last 20% of the rows; the table appears to be
# ordered from short to long sentences, so the validation set would be the longest ones — confirm.
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,
          batch_size=batch_size,
          epochs=epochs,
          validation_split=0.2)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7f83d0d0f668>

In [135]:
# define the encoder model 
# Inference-time encoder: maps a padded source sequence to the final LSTM states [h, c].
encoder_model = Model(encoder_inputs, encoder_states)
encoder_model.summary()

# Inference-time decoder. It reuses the trained embedding, LSTM and dense layers, but takes
# its initial state from explicit inputs so that it can be run one step at a time.
# The state width must equal the LSTM size, hence `latent_dim` rather than a literal.
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# This has to be the same `dec_emb_layer` object used in the training model, so that the
# learned embedding weights are shared.
final_dex2 = dec_emb_layer(decoder_inputs)
decoder_outputs2, state_h2, state_c2 = decoder_lstm(final_dex2, initial_state=decoder_states_inputs)
decoder_states2 = [state_h2, state_c2]
decoder_outputs2 = decoder_dense(decoder_outputs2)

# The sampling model takes the previous word id plus the current states and returns the
# distribution over Polish word ids together with the updated states for the next step.
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs2] + decoder_states2)

# Reverse lookups (id -> word) to turn predicted ids back into readable text.
# The names say "char", but the entries are whole words.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

Model: "model_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_35 (InputLayer)        [(None, None)]            0         
_________________________________________________________________
embedding_22 (Embedding)     (None, None, 50)          156000    
_________________________________________________________________
lstm_22 (LSTM)               [(None, 50), (None, 50),  20200     
Total params: 176,200
Trainable params: 176,200
Non-trainable params: 0
_________________________________________________________________


In [136]:
def decode_sequence(input_seq, max_chars=52):
    """Greedily decode one source sentence into target-language text.

    Args:
        input_seq: Source word ids for a single sentence (a batch of size 1), in the
            same layout as one row slice of `encoder_input_data`.
        max_chars: Decoding stops once the decoded string is longer than this many
            characters. The default keeps the limit this function has always used.

    Returns:
        The decoded words, each preceded by a single space. The '_END' marker is
        included when the model produced it.
    """
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)

    # Start with a target sequence of length 1 holding the start marker.
    target_seq = np.zeros((1, 1))
    target_seq[0, 0] = target_token_index['START_']

    # Sampling loop (to simplify, a batch of size 1 is assumed).
    decoded_sentence = ''
    while True:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)

        # Pick the most probable next word.
        sampled_token_index = int(np.argmax(output_tokens[0, -1, :]))
        sampled_word = reverse_target_char_index.get(sampled_token_index)
        if sampled_word is None:
            # Index 0 is the padding slot and has no word attached. Treat it as the
            # end of the sentence instead of failing with a KeyError.
            break
        decoded_sentence += ' ' + sampled_word

        # Exit condition: either the end marker was produced or the length limit is hit.
        if sampled_word == '_END' or len(decoded_sentence) > max_chars:
            break

        # Feed the sampled word and the updated states into the next step.
        target_seq = np.zeros((1, 1))
        target_seq[0, 0] = sampled_token_index
        states_value = [h, c]

    return decoded_sentence

In [140]:
# Decode a handful of sentences taken from the training data as a qualitative check.
for seq_index in [1414, 304, 4231, 8506, 7348, 6789, 5678]:
    # Slice (rather than index) so the batch dimension of size 1 is kept.
    input_seq = encoder_input_data[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print('-')
    # .iloc picks the row by position, matching the row order of `encoder_input_data`,
    # and yields the sentence text itself instead of a one-row Series.
    print('Input sentence:', lines.eng.iloc[seq_index])
    print('Decoded sentence:', decoded_sentence)

-
Input sentence: 1414    have courage
Name: eng, dtype: object
Decoded sentence:  poddawaj piłkę piłkę czytać iść pani _END
-
Input sentence: 304    try some
Name: eng, dtype: object
Decoded sentence:  moim to się pan do do do do do do do do do do do do do
-
Input sentence: 4231    everyone cheered
Name: eng, dtype: object
Decoded sentence:  okazji powietrza umrzeć pani pani _END
-
Input sentence: 8506    tom has a chauffeur
Name: eng, dtype: object
Decoded sentence:  łódkę życie jak iść pani _END
-
Input sentence: 7348    were closed today
Name: eng, dtype: object
Decoded sentence:  dwójka spójrz list iść iść do _END
-
Input sentence: 6789    is that your house
Name: eng, dtype: object
Decoded sentence:  twoim twoim mało mało lunch _END
-
Input sentence: 5678    she isnt married
Name: eng, dtype: object
Decoded sentence:  nie ma ma to się do _END
