In [1]:
import numpy as np
import pandas as pd
import keras
from keras.models import Model
from keras.layers import Input,Dense,LSTM,Bidirectional,Embedding,TimeDistributed,RepeatVector,Flatten
from keras.callbacks import ModelCheckpoint
import h5py

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [23]:
samples = 100000
path = 'fra.txt'
df = pd.read_csv(path,sep='\t', index_col = False)
df.columns = ['en','fr']
df.head()
print(len(df))

145435


In [56]:
en = np.array(df['en'])[:samples]
fr = np.array(df['fr'])[:samples]

print(en[-5:],fr[-5:])
print(len(en))

['My aunt inherited the huge estate.' 'My belief is that he will succeed.'
 'My best friend dances really well.' 'My best friend stole my boyfriend.'
 'My boss is keeping me pretty busy.'] ["Ma tante a hérité de l'immense propriété."
 "Je crois qu'il connaîtra le succès."
 'Mon meilleur ami danse vraiment bien.'
 "Ma meilleure amie m'a piqué mon petit copain."
 'Mon patron ne me laisse pas chômer.']
100000


In [39]:
en_word_set = set()
fr_word_set = set()

for sent in en:
    for word in sent.split():
        en_word_set.add(word)
    
for sent in fr:
    for word in sent.split():
        fr_word_set.add(word)
print(en[100].split())

['Go', 'away!']


In [40]:
en_word = sorted(list(en_word_set))
fr_word = sorted(list(fr_word_set))
print(len(fr_word))

28656


In [42]:
encoder_tokens_len = len(en_word)
decoder_tokens_len = len(fr_word)

encoder_seq_length = max([len(line.split()) for line in en])
decoder_seq_length = max([len(line.split()) for line in fr])

print(decoder_seq_length)
print(encoder_seq_length)

16
10


In [28]:
en_word_idx = dict([(word,i) for i,word in enumerate(en_word)])
fr_word_idx = dict([(word,i) for i,word in enumerate(fr_word)])

In [48]:
# encoder_input_data = np.zeros((len(en),encoder_seq_length,encoder_tokens_len),dtype='float32')
# decoder_input_data = np.zeros((len(fr),decoder_seq_length,decoder_tokens_len),dtype='float32')
# decoder_output_data = np.zeros((len(fr),decoder_seq_length,decoder_tokens_len),dtype='float32')

encoder_input_data = np.zeros((len(en),encoder_seq_length),dtype='float32')
decoder_input_data = np.zeros((len(fr),decoder_seq_length),dtype='float32')
decoder_output_data = np.zeros((len(fr),decoder_seq_length),dtype='float32')

for i, (input_text,target_text) in enumerate(zip(en,fr)):
    for t,word in enumerate(input_text.split()):
        encoder_input_data[i,t]=en_word_idx[word]
    for t,word in enumerate(target_text.split()):
        decoder_input_data[i,t]=fr_word_idx[word]
        if t>0:
            decoder_output_data[i,t-1]=fr_word_idx[word]
print(decoder_output_data.shape)
        

(100000, 16)


In [30]:
units = 128
epochs = 10
batch_size = 256

In [49]:
encoder_input = Input(shape=(encoder_seq_length,))
encoder_embed = Embedding(encoder_tokens_len,units,input_length=encoder_seq_length)(encoder_input)
encoder_bi = Bidirectional(LSTM(units,return_sequences = True))
encoder_output = encoder_bi(encoder_embed)
encoder = LSTM(units,return_state=True)
encoder,state_h,state_c = encoder(encoder_output)

encoder_states = [state_h,state_c]

# decoder_input = Input(shape=(decoder_seq_length,))
# decoder_embed = Embedding(decoder_tokens_len,units,input_length=decoder_seq_length)(decoder_input)
decoder = RepeatVector(decoder_seq_length)(encoder) #LSTM(units,return_sequences = True,return_state = True)
decoder = LSTM(units,return_sequences=True)(decoder)
decoder_output = TimeDistributed(Dense(decoder_seq_length,activation='relu'))(decoder)
decoder_output = Flatten()(decoder_output)
decoder_output = Dense(decoder_seq_length,activation='relu')(decoder_output)
# decoder_output,_,_ = decoder(decoder_embed,initial_state=encoder_states)
# decoder_dense = TimeDistributed(Dense(decoder_tokens_len,activation='softmax'))
# decoder_output = decoder_dense(decoder_output)

model = Model(encoder_input,decoder_output)

In [53]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [54]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 10)                0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 10, 128)           2118400   
_________________________________________________________________
bidirectional_7 (Bidirection (None, 10, 256)           263168    
_________________________________________________________________
lstm_20 (LSTM)               [(None, 128), (None, 128) 197120    
_________________________________________________________________
repeat_vector_7 (RepeatVecto (None, 16, 128)           0         
_________________________________________________________________
lstm_21 (LSTM)               (None, 16, 128)           131584    
_________________________________________________________________
time_distributed_7 (TimeDist (None, 16, 16)            2064      
__________

In [55]:
checkpoint = ModelCheckpoint('check01',monitor='val_acc',verbose=1, save_best_only=True,mode='max')
callbacks_list=[checkpoint]
text_model = model.fit([encoder_input_data],decoder_output_data,batch_size=batch_size,epochs=epochs,verbose=1, validation_split=0.2,callbacks=callbacks_list)

Train on 80000 samples, validate on 20000 samples
Epoch 1/10

Epoch 00001: val_acc improved from -inf to 0.19460, saving model to check01
Epoch 2/10

Epoch 00002: val_acc improved from 0.19460 to 0.20260, saving model to check01
Epoch 3/10

Epoch 00003: val_acc did not improve from 0.20260
Epoch 4/10

Epoch 00004: val_acc did not improve from 0.20260
Epoch 5/10

KeyboardInterrupt: 