In [1]:
import numpy as np
import pandas as pd
import keras
from keras.models import Model
from keras.layers import Input,Dense,LSTM,Bidirectional,Embedding
from keras.callbacks import ModelCheckpoint
import h5py

Using TensorFlow backend.


In [2]:
samples = 10000
path = 'fra.txt'
df = pd.read_csv(path,sep='\t', index_col = False)
df.columns = ['en','fr']
df.head()

Unnamed: 0,en,fr
0,Run!,Courez !
1,Wow!,Ça alors !
2,Fire!,Au feu !
3,Help!,À l'aide !
4,Jump.,Saute.


In [3]:
print(len(df))

145435


In [4]:
en = np.array(df['en'])[:samples]
fr = np.array(df['fr'])[:samples]

for i in range(len(fr)):
     fr[i]= '\t' + fr[i] + '\n'

In [5]:
#char-to-char
en_char_set = set()
fr_char_set = set()

for sent in en:
    for char in sent:
        en_char_set.add(char)
    
for sent in fr:
    for char in sent:
        fr_char_set.add(char)

In [6]:
en_char = sorted(list(en_char_set))
fr_char = sorted(list(fr_char_set))
len(fr_char)

92

In [7]:
encoder_tokens_len = len(en_char)
decoder_tokens_len = len(fr_char)

encoder_seq_length = max([len(line) for line in en])
decoder_seq_length = max([len(line) for line in fr])

In [8]:
en_char_idx = dict([(char,i) for i,char in enumerate(en_char)])
en_idx_char = dict([(i,char) for i,char in enumerate(en_char)])
fr_char_idx = dict([(char,i) for i,char in enumerate(fr_char)])
fr_idx_char = dict([(i,char) for i,char in enumerate(fr_char)])

fr_char_idx['\n']

1

In [9]:
encoder_input_data = np.zeros((len(en),encoder_seq_length,encoder_tokens_len),dtype='float32')
decoder_input_data = np.zeros((len(fr),decoder_seq_length,decoder_tokens_len),dtype='float32')
decoder_output_data = np.zeros((len(fr),decoder_seq_length,decoder_tokens_len),dtype='float32')

for i, (input_text,target_text) in enumerate(zip(en,fr)):
    for t,char in enumerate(input_text):
        encoder_input_data[i,t,en_char_idx[char]]=1
    for t,char in enumerate(target_text):
        decoder_input_data[i,t,fr_char_idx[char]]=1
        if t>0:
            decoder_output_data[i,t-1,fr_char_idx[char]]=1
        

In [10]:
units = 128
epochs = 100
batch_size = 256

In [11]:
#char-to-char model
encoder_input = Input(shape=(None,encoder_tokens_len))
encoder = LSTM(units,return_state=True)
encoder_output,state_h,state_c = encoder(encoder_input)

encoder_states = [state_h,state_c]

decoder_input = Input(shape=(None,decoder_tokens_len))
decoder = LSTM(units,return_sequences=True,return_state=True)
decoder_output,_,_ = decoder(decoder_input,initial_state=encoder_states)

decoder_dense = Dense(decoder_tokens_len,activation='softmax')
decoder_output = decoder_dense(decoder_output)

model = Model([encoder_input,decoder_input],decoder_output)

In [12]:
model.compile(loss='categorical_crossentropy',optimizer='adam')

In [13]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, None, 71)     0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, None, 92)     0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, 128), (None, 102400      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_2 (LSTM)                   [(None, None, 128),  113152      input_2[0][0]                    
                                                                 lstm_1[0][1]                     
          

In [18]:
text_model = model.fit([encoder_input_data,decoder_input_data],decoder_output_data,batch_size=batch_size,epochs=epochs, validation_split=0.2)

Train on 8000 samples, validate on 2000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
1024/8000 [==>...........................] - ETA: 37s - loss: 0.2490

KeyboardInterrupt: 

In [19]:
encoder_model_inf = Model(encoder_input, encoder_states)

decoder_state_input_h = Input(shape=(128,))
decoder_state_input_c = Input(shape=(128,))
decoder_input_states = [decoder_state_input_h, decoder_state_input_c]

decoder_out, decoder_h, decoder_c = decoder(decoder_input, 
                                                 initial_state=decoder_input_states)

decoder_states = [decoder_h , decoder_c]

decoder_out = decoder_dense(decoder_out)

decoder_model_inf = Model(inputs=[decoder_input] + decoder_input_states,
                          outputs=[decoder_out] + decoder_states )

In [20]:
def decode_seq(inp_seq):
    
    states_val = encoder_model_inf.predict(inp_seq)
    
    target_seq = np.zeros((1, 1, decoder_tokens_len))
    target_seq[0, 0, fr_char_idx['\t']] = 1
    
    translated_sent = ''
    stop_condition = False
    
    while not stop_condition:
        
        decoder_out, decoder_h, decoder_c = decoder_model_inf.predict(x=[target_seq] + states_val)
        
        max_val_index = np.argmax(decoder_out[0,-1,:])
        sampled_fra_char = fr_idx_char[max_val_index]
        translated_sent += sampled_fra_char
        
        if ( (sampled_fra_char == '\n') or (len(translated_sent) > decoder_seq_length)) :
            stop_condition = True
        
        target_seq = np.zeros((1, 1, decoder_tokens_len))
        target_seq[0, 0, max_val_index] = 1
        
        states_val = [decoder_h, decoder_c]
        
    return translated_sent

In [25]:
for seq_index in range(150,170):
    inp_seq = encoder_input_data[seq_index:seq_index+1]
    translated_sent = decode_seq(inp_seq)
    print('-')
    print('Input sentence:', en[seq_index])
    print('Decoded sentence:', translated_sent)

-
Input sentence: Open up.
Decoded sentence: Ouvre-toi !

-
Input sentence: Open up.
Decoded sentence: Ouvre-toi !

-
Input sentence: Perfect!
Decoded sentence: Vailui !

-
Input sentence: See you.
Decoded sentence: À plus !

-
Input sentence: Show me.
Decoded sentence: Montre-moi !

-
Input sentence: Show me.
Decoded sentence: Montre-moi !

-
Input sentence: Shut up!
Decoded sentence: Ferme-le !

-
Input sentence: Shut up!
Decoded sentence: Ferme-le !

-
Input sentence: Shut up!
Decoded sentence: Ferme-le !

-
Input sentence: Shut up!
Decoded sentence: Ferme-le !

-
Input sentence: Shut up!
Decoded sentence: Ferme-le !

-
Input sentence: So long.
Decoded sentence: Tais !

-
Input sentence: Take it.
Decoded sentence: Prenez !

-
Input sentence: Take it.
Decoded sentence: Prenez !

-
Input sentence: Tell me.
Decoded sentence: Dis-moi !

-
Input sentence: Tell me.
Decoded sentence: Dis-moi !

-
Input sentence: Tom won.
Decoded sentence: Tom a dichon.

-
Input sentence: Wake up!
Decoded s