In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import GRU, LSTM, Dense, TimeDistributed, Input, RepeatVector, Embedding
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
en=pd.read_csv('small_vocab_en.csv', header=None, sep='\t')
fr=pd.read_csv('small_vocab_fr.csv', header=None, sep='\t')

en=en.rename(columns={0:"English"})
fr=fr.rename(columns={0:"French"})

df = en.join(fr)

In [3]:
df['French'] = df['French'].apply(lambda x: ' '.join(['sos', x, 'eos']))

In [4]:
train_en, test_en, train_fr, test_fr = train_test_split(df['English'], df['French'], test_size=0.2)

In [5]:
en_token = text.Tokenizer()
fr_token = text.Tokenizer()
en_token.fit_on_texts(list(train_en))
fr_token.fit_on_texts(list(train_fr))

In [21]:
def preprocess(series,tokenizer, vocab, maxlen, reverse=False, train=False):
    seq = tokenizer.texts_to_sequences(series)
    
    if train:
        vocab = len(tokenizer.word_index) + 1
        maxlen = max([len(sen) for sen in seq])
    
    seq = sequence.pad_sequences(seq, maxlen=maxlen, padding='post')
    if reverse:
        seq = seq[:, ::-1]
    return (seq, vocab, maxlen)

In [7]:
en_seq, en_vocab, en_len = preprocess(train_en, en_token, reverse=True)
fr_seq, fr_vocab, fr_len = preprocess(train_fr, fr_token)

In [None]:
fr_x = fr_seq[:,:-1]
fr_cat = to_categorical(fr_seq, num_classes=fr_vocab)
fr_y = fr_cat[:,1:,:]

In [10]:
#encoder
en_inp = Input(shape=(en_len,))
tr_en_emb = Embedding(en_vocab, 96, input_length=en_len)
tr_emb = tr_en_emb(en_inp)
tr_en_gru = GRU(48, return_state=True)
_, en_state = tr_en_gru(tr_emb)

#decoder
de_inp = Input(shape=(fr_len-1,))
tr_de_emb = Embedding(fr_vocab, 96, input_length=fr_len-1)
tr_de_emb_ = tr_de_emb(de_inp)
tr_de_gru = GRU(48, return_sequences=True)
de_out = tr_de_gru(tr_de_emb_, initial_state=en_state)

#prediction
tr_dense = Dense(fr_vocab, activation='softmax')
de_pred = TimeDistributed(tr_dense)(de_out)

model = Model(inputs=[en_inp, de_inp], outputs=de_pred, name="TeacherForcing")
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

Model: "TeacherForcing"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 22)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 15, 96)       19200       ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 22, 96)       33216       ['input_2[0][0]']                
                                                                                     

In [11]:
# Implement callbacks to handle overfitting
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
model_save = ModelCheckpoint('best_model.hdf5', save_best_only=True)

history = model.fit([en_seq, fr_x], fr_y, batch_size=64, epochs=30, validation_split=0.2, callbacks=[early_stopping, model_save])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [12]:
#encoder
en_inp = Input(shape=(en_len,))
en_emb = Embedding(en_vocab, 96, input_length=en_len)
emb = en_emb(en_inp)
en_gru = GRU(48, return_state=True)
_, en_state = en_gru(emb)
encoder = Model(inputs=en_inp, outputs=en_state)

en_emb.set_weights(tr_en_emb.get_weights())
en_gru.set_weights(tr_en_gru.get_weights())

#decoder
de_inp = Input(shape=(1,))
de_emb = Embedding(fr_vocab, 96, input_length=fr_len-1)
de_emb_ = de_emb(de_inp)
de_inp_state = Input(shape=(48,))
de_gru = GRU(48, return_state=True)
de_out, de_out_state = de_gru(de_emb_, initial_state=de_inp_state)

#prediction
dense = Dense(fr_vocab, activation='softmax')
de_pred = dense(de_out)

de_emb.set_weights(tr_de_emb.get_weights())
de_gru.set_weights(tr_de_gru.get_weights())
dense.set_weights(tr_dense.get_weights())

decoder = Model(inputs=[de_inp, de_inp_state], outputs=[de_pred, de_out_state], name="Translation")

In [54]:
lis = [test_en[7170]]
en_new, _, _ = preprocess(lis, en_token, en_vocab, en_len, reverse=True)
de_s_t = encoder.predict(en_new)
de_new, _, _ =preprocess(['sos'], fr_token, fr_vocab, fr_len)

fr_sent = ''

for i in range(fr_len):
    print(decoder.predict([de_new, de_s_t]))
    pred = decoder.predict([de_new, de_s_t])
    de_prob, de_s_t = pred[0], pred[1]
    
    try:
        de_w = fr_token.index_word[np.argmax(de_prob, axis=-1)[0]]
    except:
        continue
    
    de_new = preprocess([de_w], fr_token, fr_vocab, fr_len)
    
    if de_w == 'eos':break
    
    fr_sent += de_w + ' '

[array([[1.04314193e-01, 7.97056110e-09, 5.12304045e-12, 4.53365535e-13,
        1.17453780e-08, 7.49214094e-11, 1.21905641e-05, 1.04647205e-12,
        7.61238515e-01, 3.31550954e-12, 9.14068642e-06, 3.94917129e-12,
        2.07047854e-02, 3.50291209e-08, 4.79963717e-07, 3.85369722e-11,
        3.26137693e-11, 2.37944904e-14, 5.99843580e-13, 3.68029522e-12,
        6.61693322e-09, 6.69743260e-17, 1.83020294e-15, 3.95958627e-16,
        2.42556825e-06, 9.09641150e-14, 8.69623040e-10, 7.61472760e-11,
        3.07925343e-14, 7.33144218e-14, 4.37942159e-04, 8.67038565e-08,
        2.64472146e-12, 7.72085566e-16, 3.40975945e-12, 2.69009363e-11,
        5.06725868e-12, 1.08122543e-13, 1.74982607e-15, 9.17021171e-05,
        5.94996987e-03, 3.77015422e-05, 4.09189990e-19, 2.29495937e-14,
        1.87519089e-09, 5.73398135e-11, 4.29622152e-16, 4.15466446e-12,
        3.49698162e-13, 5.91300175e-13, 6.94742267e-12, 1.68264870e-13,
        1.90349714e-09, 6.32683328e-09, 3.86832996e-11, 3.71211

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'numpy.ndarray'>", '(<class \'tuple\'> containing values of types {"<class \'numpy.ndarray\'>", "<class \'int\'>"})'}), <class 'NoneType'>