In [1]:
import pandas as pd
import re
import keras.backend
from keras.models import Model, load_model
from keras.layers import Masking,Reshape,Input, LSTM,Bidirectional ,Dense, Embedding,Lambda,Activation,Add,Flatten
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import numpy as np
import os.path

import jieba
from nltk import word_tokenize
import os
os.environ["CUDA_VISIBLE_DEVICES"]="2"
checkpoint_path = 'weights.hdf5'
from keras import backend as K

Using TensorFlow backend.


In [2]:
from keras import backend
backend.tensorflow_backend._get_available_gpus()


['/job:localhost/replica:0/task:0/device:GPU:0']

### details in the data 
1.Chinese sentense will contain 'space' in it. It seems useless for us to understand the meaning as there is no 'space' in a chinese sentense in comparison to english.

2.Currently, I keep the punctuations ,such as ', . ?', in the sentence

3.Consider chinese as a phrase or word?
   -> use jieba to cut the phrase out 
4.Add start token and end token

5.use additional embedding to work with pre-train embedding

6.There are some words not in glove -> use special embedding along with the pretrain embedding to train those words.


In [3]:
cmn_ = []
with open('cmn.txt',encoding='UTF8') as f:
    for line in f:
        line = re.sub(r'[A-Z]',lambda x: x.group().lower(),line)
        [english,chinese] = line.replace('\n','').split('\t')
        chinese =  ['\t'] + list(jieba.cut(chinese.replace(' ',''))) + ['\n']

        english = word_tokenize(english)
        english = ['\t'] + english + ['\n']
        cmn_.append([english,chinese])


Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.207 seconds.
Prefix dict has been built succesfully.


In [4]:
cmn_data = pd.DataFrame(cmn_,columns=['english','chinese'])
del cmn_

In [5]:
en_token_set = []
for i_list in cmn_data["english"]:
    en_token_set += i_list
en_token_set = list(sorted(set(en_token_set)))
en_token_set.insert(0,'<unk>')

In [6]:
ch_token_set = []
for i_list in cmn_data["chinese"]:
    ch_token_set += i_list
ch_token_set = list(sorted(set(ch_token_set)))
ch_token_set.insert(0,'<unk>')

In [7]:
num_encoder_tokens = len(en_token_set)
num_decoder_tokens = len(ch_token_set)
epochs = 100
batch_size = 256
latent_dim = 256 # for chinese
num_samples = len(cmn_data)
max_encoder_seq_length = max([len(txt) for txt in cmn_data['english']])
max_decoder_seq_length = max([len(txt) for txt in cmn_data['chinese']])


 #### pre-train embedding

In [8]:
embeddings_index = {}
GLOVE_DIR = '.'
EMBEDDING_DIM = 100
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'),encoding='UTF8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


#### check if each english word is in glove

In [9]:
not_in_embedding = set()
for i in en_token_set:
    if i not in embeddings_index:
        not_in_embedding.add(i)
num_not_in_embedding = len(not_in_embedding)

In [10]:
embedding_matrix = np.zeros((num_encoder_tokens, EMBEDDING_DIM))
decode_embedding_matrix = np.zeros((num_decoder_tokens, EMBEDDING_DIM))
special_embedding_matrix = np.zeros((num_not_in_embedding, EMBEDDING_DIM))
# unk,start,end token will be the first both in special embedding and embedding
cur = 0
special_token = []
embedding_index_copy = dict()
for i,word in enumerate(en_token_set):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i-cur] = embedding_vector
        embedding_index_copy[word] = embedding_vector
    else:
        special_token.append(word)
        cur+=1
embedding_index = embedding_index_copy
del embedding_index_copy

In [11]:
#put the special tokens behind the back
special_list = []
for i in special_token:
    en_token_set.remove(i)
    special_list.append(i)
en_token_set = special_list + en_token_set
del special_list

## Create keras model

#### Firstly,  prepare input data.
Create space[0,1,2] for unknown,start,end token at start.


In [12]:
input_token_index = dict(
    [(word, i) for i, word in enumerate(en_token_set)])
target_token_index = dict(
    [(word, i) for i, word in enumerate(ch_token_set)])

#### the last dimension of input data will be 1 rather than one-hot(num_tokens)
Input to embedding layers should be a integer
-> this (batch,timestep,input) will be expanded to (batch,timestep,input,embed_size)

use reshape layer to discard the unwanted dimension 
-> (batch,timestep,embed_size)

In [91]:
import keras.utils
import numpy as np   

class mygenerator(keras.utils.Sequence):
    def __init__(self, x_set, y_set, batch_size):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))
    @staticmethod
    def get_encoder_input_example(input_text):
        example = np.zeros((max_encoder_seq_length,1))
        for t, word in enumerate(input_text):
            example[t, 0] = input_token_index[word]
        return example
    @staticmethod
    def get_decoder_input_example(target_text):
        example = np.zeros((max_decoder_seq_length,1))
        for t, word in enumerate(target_text):
            example[t, 0] = target_token_index[word]
        return example
    @staticmethod
    def get_decoder_output_example(target_text):
        example = np.zeros((max_decoder_seq_length,num_decoder_tokens))
        for t, word in enumerate(target_text):
            if t > 0:
                example[t-1, target_token_index[word]] = 1.
        return example
    def __getitem__(self, idx):
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]

        x_encode_input = [self.get_encoder_input_example(encoder_input) for encoder_input in batch_x]
        y_decode_input = [self.get_decoder_input_example(decoder_input) for decoder_input in batch_y]
        y_decode_output = [self.get_decoder_output_example(decoder_output) for decoder_output in batch_y]
        return [np.array(x_encode_input),np.array(y_decode_input)], np.array(y_decode_output)

    # read your data here using the batch lists, batch_x and batch_y
 

In [14]:
def create_model(checkpoint_path = None):
    # Define an input sequence and process it.
    encoder_inputs = Input(shape=(max_encoder_seq_length, 1))
    embedding_layer = Embedding(input_dim=num_encoder_tokens,
                            output_dim=EMBEDDING_DIM,
                            mask_zero = True,
                            weights = [embedding_matrix],
                            input_length = max_encoder_seq_length,
                            trainable = False)
    special_embedding_layer = Embedding(len(special_token),
                            EMBEDDING_DIM,
                            mask_zero = True,
                            weights = [special_embedding_matrix],
                            input_length = max_encoder_seq_length,
                            trainable = True)

    valid_words = num_encoder_tokens - len(special_token)

    # Create a vector of special tokens, e.g: [0,0,1,0,3,0,0]
    special_tokens_input = Lambda(lambda x: x - valid_words)(encoder_inputs)
    special_tokens_input = Activation('relu')(special_tokens_input)

    # Apply both 'normal' embeddings and special token embeddings
    embedded_sequences = embedding_layer(encoder_inputs)
    embedded_special = special_embedding_layer(special_tokens_input)
   
    # Add the matrices
    embedded_sequences = Add()([embedded_sequences, embedded_special])
    embedded_sequences = Lambda(lambda x: x, output_shape=lambda s:s)(embedded_sequences)
    embedded_sequences = Reshape((-1,EMBEDDING_DIM))(embedded_sequences)
    encoder = LSTM(latent_dim, return_state=True)
    encoder_outputs, state_h, state_c = encoder(embedded_sequences)
    # We discard `encoder_outputs` and only keep the states.
    encoder_states = [state_h, state_c]

    # Set up the decoder, using `encoder_states` as initial state.
    decoder_inputs = Input(shape=(None, 1))
    decode_embedding_layer = Embedding(num_decoder_tokens,
                            100,
                            mask_zero = True,
                            weights = [decode_embedding_matrix],
                            input_length = max_decoder_seq_length,
                            trainable = True)

    # We set up our decoder to return full output sequences,
    # and to return internal states as well. We don't use the
    # return states in the training model, but we will use them in inference.
    decoder_seq = decode_embedding_layer(decoder_inputs)
    decoder_seq = Lambda(lambda x: x, output_shape=lambda s:s)(decoder_seq)
    
    decoder_seq = Reshape((-1,EMBEDDING_DIM))(decoder_seq)
    decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
    decoder_outputs, _, _ = decoder_lstm(decoder_seq,
                                         initial_state=encoder_states)

    decoder_dense = Dense(num_decoder_tokens, activation='softmax')
    decoder_outputs = decoder_dense(decoder_outputs)

    # Define the model that will turn
    # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
    model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

    # Define sampling models
    encoder_model = Model(encoder_inputs, encoder_states)

    decoder_state_input_h = Input(shape=(latent_dim,))
    decoder_state_input_c = Input(shape=(latent_dim,))
    decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
    decoder_outputs, state_h, state_c = decoder_lstm(
        decoder_seq, initial_state=decoder_states_inputs)
    decoder_states = [state_h, state_c]
    decoder_outputs = decoder_dense(decoder_outputs)
    decoder_model = Model(
        [decoder_inputs] + decoder_states_inputs,
        [decoder_outputs] + decoder_states )


    return model,encoder_model,decoder_model

In [15]:
# Run training
K.set_learning_phase(True)
model,encoder_model,decoder_model = create_model(checkpoint_path=checkpoint_path)
model.load_weights('weights.hdf5')


In [27]:
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['accuracy'])
checkpoints_callback = ModelCheckpoint(filepath=checkpoint_path,
                                       monitor='acc',
                                       verbose=1,
                                       save_best_only=True)
early_stopping = EarlyStopping(monitor='loss',patience=5)
reduceLRonPlateau = ReduceLROnPlateau(monitor='loss')
mygen = mygenerator(cmn_data['english'],cmn_data['chinese'],batch_size=128)

In [25]:
history = model.fit_generator(generator=mygen,use_multiprocessing=True,
                              epochs = 5,
          callbacks=[checkpoints_callback,early_stopping,reduceLRonPlateau])

Epoch 1/5


  str(node.arguments) + '. They will not be included '


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [50]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq)
    t = states_value
    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 32, 1))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, 0] = target_token_index['\t']
    
    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ''
    index = 0
    while not stop_condition:
#         print(target_seq.shape)
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value)
#         print(output_tokens.shape)
        # Sample a token
        temp = ""
        for indd in range(31):
            sampled_token_index = np.argmax(output_tokens[0, indd, :])
            sampled_char = ch_token_set[sampled_token_index]
            temp += sampled_char
#         print(temp)
        sampled_token_index = np.argmax(output_tokens[0, index, :])
        sampled_char = ch_token_set[sampled_token_index]
        decoded_sentence += sampled_char
        index+=1
         # Exit condition: either hit max length
        # or find stop character.
        if (sampled_char == '\n' or
           index > max_decoder_seq_length):
            stop_condition = True
#         print(decoded_sentence)
        # Update the target sequence (of length 1).
#         target_seq = np.zeros((1, 1, 1))
        target_seq[0, index, 0] = sampled_token_index
        # Update states
#         states_value = [h, c]
    
    return decoded_sentence

In [93]:
for i in [4077, 2122, 3335, 1464, 8956, 7168, 3490,
        4495, 5100, 119]:
    i-=1
    
    input_seq = mygen.get_encoder_input_example(mygen,cmn_data['english'].iloc[i])
    input_seq = np.array(input_seq)
    input_seq = np.expand_dims(input_seq,0)
    # print(np.array(input_seq).shape)
    decoded_sentence = decode_sequence([input_seq])
    with open('output.txt','a',encoding='UTF8') as f:
        input_line = "Input sequence: "
        for e in cmn_data['english'].iloc[i]:
            input_line += e + ' '
        output_line = "Output sequence "
        for fi in decoded_sentence:
            output_line += fi + ' '
        print(([input_line,output_line]))
              

['Input sequence: \t he is afraid of snakes . \n ', 'Output sequence 他 怕 那 只 狗 。 \n ']
['Input sequence: \t i miss you so much . \n ', 'Output sequence 我 同 意 你 。 \n ']
["Input sequence: \t we 're going by train . \n ", 'Output sequence 我 們 應 該 去 野 餐 。 \n ']
['Input sequence: \t the sky is clear . \n ', 'Output sequence 這 個 瓶 子 裝 滿 了 。 \n ']
['Input sequence: \t wearing a suit , he stood out . \n ', 'Output sequence 他 的 名 字 為 了 我 已 故 的 教 育 有 人 知 道 。 \n ']
['Input sequence: \t she made a serious mistake . \n ', 'Output sequence 她 是 個 很 棒 的 女 人 。 \n ']
['Input sequence: \t have you eaten dinner ? \n ', 'Output sequence 你 有 晚 飯 嗎 ？ \n ']
['Input sequence: \t what do you want to be ? \n ', 'Output sequence 你 想 做 什 麼 ？ \n ']
['Input sequence: \t tom is going to help us . \n ', 'Output sequence 汤 姆 想 告 诉 我 们 我 们 的 位 置 。 \n ']
["Input sequence: \t he 's lazy . \n ", 'Output sequence 他 非 常 親 切 。 \n ']


In [None]:
decoder_model.summary()

### You can test it yourself by input to input_seq

We can look into the output. There's definitely a problem of overfitting.

If it is in the dataset -> correct output

not in the dataset -> non-sense (even if each word is in our vocabulary)

In [88]:
input_seq = ['he','is','good']
input_seq = ['\t'] + input_seq + ['\n']
input_ = np.zeros((1,38,1))
for i in range(len(input_seq)):
    if input_seq[i] in input_token_index:
        input_[0,i,0] = input_token_index[input_seq[i]]
    else:
        input_[0,i,0] = input_token_index['<unk>']
        print(i)
decoded_sentence = decode_sequence(input_)
print(decoded_sentence)

他是个好人。



In [None]:
import random
input_seq = cmn_data['english'].iloc[random.randint(1000,1500)]
len_input = len(input_seq)
input_ = np.zeros((1,38,1))
for i in range(len_input):
    if input_seq[i] in input_token_index:
        input_[0,i,0] = input_token_index[input_seq[i]]
    else:
        input_[0,i,0] = input_token_index['<unk>']
decoded_sentence = decode_sequence(input_)
print(input_seq)
print(decoded_sentence)