In [3]:
# python3
# -*- coding: utf-8 -*-


import numpy as np
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Embedding, LSTM, Dense, Input, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
import matplotlib.pyplot as plt
import json
import os


In [4]:
import tensorflow as tf
# Expose only GPU 0 to CUDA for this process. This is set before the
# TensorFlow session below is created.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from keras.backend.tensorflow_backend import set_session
# TF1-style session configuration: let this process claim at most 30% of the
# GPU's memory.
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.3
# Register the configured session as the one the Keras backend uses.
set_session(tf.Session(config=config))

# Number of context tokens per sample; generateTrainData uses the token that
# follows each SEQ_LENGTH-long window as the prediction target.
SEQ_LENGTH = 3     
# Nominal limit on the vocabulary size. Note that the indices produced by
# mapWords are not truncated to this value.
MAX_NB_WORDS = 10000    
# Despite the "EMBEDDING" names, these three values are passed to model_lstm's
# LSTM layers as their unit counts (LSTM_1, LSTM_2 and LSTM_3 respectively);
# the Embedding layer's width is a literal 256 inside model_lstm.
EMBEDDING_DIM = 512     
EMBEDDING_DIM_2 = 512     
EMBEDDING_DIM_3 = 256
# Mini-batch size and number of epochs handed to model.fit in model_lstm.
BATCH_SIZE = 1024   
EPOCHS = 100    


In [5]:
def cutWords(file_name):
    """
    Read a UTF-8 text file and split it into single-character tokens.

    Every line break is replaced by the full stop '。' so that each line of
    the file ends with a sentence terminator.

    :param file_name: path of the text file to read
    :return: list of one-character strings, in file order
    """
    with open(file_name, 'r', encoding='utf8') as source:
        raw_text = source.read()
    # Use the full stop as the end-of-sentence marker in place of newlines.
    flattened = '。'.join(raw_text.split('\n'))
    return [char for char in flattened]

def mapWords(cut_word_list):
    """
    Build the token <-> index lookup tables and dump both to disk as JSON.

    Indices 0 and 1 are reserved for the "PAD" and "UNK" entries; the distinct
    tokens of ``cut_word_list`` are numbered from 2 upwards in sorted order.
    The two tables are written to ./word_to_index_word.txt and
    ./index_to_word_word.txt. JSON object keys are always strings, so the
    integer keys of ``index_to_word`` are stored as strings in the second file.

    :param cut_word_list: list of character-level tokens
    :return: (word_to_index, index_to_word), two dicts that are inverses of
             each other
    """
    word_to_index = {}
    next_index = 2
    for token in sorted(set(cut_word_list)):
        word_to_index[token] = next_index
        next_index += 1
    word_to_index["PAD"] = 0   # padding
    word_to_index["UNK"] = 1   # unknown token

    index_to_word = {}
    for token, position in word_to_index.items():
        index_to_word[position] = token

    # Serialise both tables before any file is opened.
    outputs = (
        ('./word_to_index_word.txt', json.dumps(word_to_index)),
        ('./index_to_word_word.txt', json.dumps(index_to_word)),
    )
    for path, payload in outputs:
        with open(path, 'w', encoding='utf8') as out:
            out.write(payload)

    return word_to_index, index_to_word

In [None]:
def generateTrainData(cut_word_list, word_to_index):
    """
    Turn a token sequence into shuffled (context, next-token) training pairs.

    Every run of SEQ_LENGTH + 1 consecutive tokens yields one sample: the
    indices of the first SEQ_LENGTH tokens are the input and the index of the
    last token is the target, which is then one-hot encoded.

    :param cut_word_list: sequence of tokens (e.g. the characters returned by
        cutWords)
    :param word_to_index: token -> integer index mapping; a token missing from
        it is encoded with the mapping's "UNK" entry when that entry exists
    :return: X_train, X_val, y_train, y_val. X_* are integer arrays with
        SEQ_LENGTH columns, y_* are one-hot arrays with one column per
        vocabulary index; 10% of the samples go to the validation split
    :raises ValueError: if the sequence is too short to form a single sample
    :raises KeyError: if a token is missing from ``word_to_index`` and the
        mapping has no "UNK" entry
    """
    # A sequence of n tokens contains n - SEQ_LENGTH windows of SEQ_LENGTH + 1
    # tokens (the last one starts at n - SEQ_LENGTH - 1), so that many start
    # positions are needed to include the final window.
    n_windows = len(cut_word_list) - SEQ_LENGTH
    if n_windows <= 0:
        raise ValueError(
            "need more than %d tokens to build a training sample, got %d"
            % (SEQ_LENGTH, len(cut_word_list)))

    # Encode the corpus once instead of re-encoding every overlapping window.
    if "UNK" in word_to_index:
        unk_index = word_to_index["UNK"]
        encoded = [word_to_index.get(token, unk_index) for token in cut_word_list]
    else:
        encoded = [word_to_index[token] for token in cut_word_list]

    data_index = np.array(
        [encoded[start: start + SEQ_LENGTH + 1] for start in range(n_windows)])
    # Shuffles the rows (samples) in place; uses NumPy's global, unseeded RNG.
    np.random.shuffle(data_index)

    X = data_index[:, :SEQ_LENGTH]
    # Fix the one-hot width to the vocabulary size. Without num_classes the
    # width would be (largest target seen) + 1, which shrinks whenever the
    # highest-indexed token never occurs as a target.
    n_classes = max(word_to_index.values()) + 1
    y = np_utils.to_categorical(data_index[:, SEQ_LENGTH], num_classes=n_classes)

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

    return X_train, X_val, y_train, y_val

In [None]:
def model_lstm(X_train, X_val, y_train, y_val, word_to_index):
    """
    Build, train and save a stacked bidirectional-LSTM next-token classifier.

    Architecture: Embedding (width 256) -> BiLSTM(EMBEDDING_DIM) ->
    BiLSTM(EMBEDDING_DIM_2) -> BiLSTM(EMBEDDING_DIM_3) -> softmax.

    Side effects: TensorBoard logs are written to ./tensorboard_log/, the
    model with the lowest validation loss is checkpointed to
    ./model_epoch50_2lstm_1dense_seq50_phrase_based_best.h5 and the model as
    it stands after the final epoch is written to
    ./model_epoch50_2lstm_1dense_seq50_phrase_based_last.h5.

    :param X_train: integer token indices with SEQ_LENGTH columns
    :param X_val: validation inputs, same layout as X_train
    :param y_train: one-hot targets; the last axis is the class axis
    :param y_val: validation targets, same layout as y_train
    :param word_to_index: token -> index mapping that was used to encode X
    :return: the History object returned by ``model.fit``
    """
    # Size the layers from the data that is actually fed in. The inputs are
    # never truncated to MAX_NB_WORDS, so capping the embedding table at that
    # value would leave indices without a row for large vocabularies; the
    # softmax width must equal the one-hot width of the targets.
    vocab_size = max(word_to_index.values()) + 1
    n_classes = np.shape(y_train)[-1]

    x_train_in = Input((SEQ_LENGTH,), dtype='int32', name="x_train")
    embedding_layer = Embedding(vocab_size, 256, input_length=SEQ_LENGTH)(x_train_in)
    print("embedding layer is::", embedding_layer)
    print("build model.....")

    # return_sequences=True makes a layer emit the whole sequence, which the
    # following LSTM needs as input; the last LSTM feeds a Dense layer and
    # therefore leaves it off.
    lstm_1 = Bidirectional(LSTM(EMBEDDING_DIM, name="LSTM_1", return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(embedding_layer)
    lstm_2 = Bidirectional(LSTM(EMBEDDING_DIM_2, name="LSTM_2", return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(lstm_1)
    lstm_3 = Bidirectional(LSTM(EMBEDDING_DIM_3, name="LSTM_3", dropout=0.2, recurrent_dropout=0.2))(lstm_2)
    dense = Dense(n_classes, activation="softmax", name="Dense_1")(lstm_3)

    model = Model(inputs=x_train_in, outputs=dense)
    # summary() prints the table itself and returns None, so it is not
    # wrapped in print().
    model.summary()

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    print("Train....")

    best_model_path = './model_epoch50_2lstm_1dense_seq50_phrase_based_best.h5'
    last_model_path = './model_epoch50_2lstm_1dense_seq50_phrase_based_last.h5'

    # Save TensorBoard info.
    tensorboard = TensorBoard(log_dir='./tensorboard_log/')
    # Keep only the model with the lowest validation loss.
    checkpoint = ModelCheckpoint(filepath=best_model_path,
                                 monitor='val_loss', mode='min', save_best_only=True, save_weights_only=False, period=1, verbose=1)
    # Multiply the learning rate by 0.1 after 5 epochs without val_loss improvement.
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)
    callback_list = [tensorboard, checkpoint, reduce_lr]

    history_record = model.fit(X_train, y_train,
                               batch_size=BATCH_SIZE,
                               validation_data=(X_val, y_val),
                               epochs=EPOCHS,
                               callbacks=callback_list)

    # The final-epoch model goes to its own file: writing it to
    # best_model_path would overwrite the best checkpoint saved above.
    model.save(last_model_path)
    return history_record

In [None]:
def plotAccuray(history_record):
    """
    Plot training/validation accuracy and loss curves, one figure each.

    :param history_record: object whose ``history`` attribute is a dict of
        per-epoch metric lists (as returned by ``model.fit``); it must contain
        "loss", "val_loss" and the accuracy metric under either
        "acc"/"val_acc" or "accuracy"/"val_accuracy"
    :return: None
    :raises KeyError: if an expected metric is missing from the history
    """
    history = history_record.history
    # Keras releases before 2.3 log metrics=['accuracy'] as "acc"; from 2.3 on
    # the metric is logged under the name it was requested with, "accuracy".
    acc_key = "acc" if "acc" in history else "accuracy"
    accuracy_train = history[acc_key]
    accuracy_val = history["val_" + acc_key]
    loss_train = history["loss"]
    loss_val = history["val_loss"]
    epochs = range(len(accuracy_train))

    plt.plot(epochs, accuracy_train, 'bo', label='Training accuracy')
    plt.plot(epochs, accuracy_val, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()

    # Start a second figure so the loss curves do not share the accuracy axes.
    plt.figure()
    plt.plot(epochs, loss_train, 'bo', label='Training loss')
    plt.plot(epochs, loss_val, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()


In [None]:
# End-to-end run: read the corpus, build the vocabulary, create the
# train/validation split, then train the model.

# Training corpus, given relative to the notebook's working directory.
file_name = "./train_data/all_5.txt"
# Character-level tokens of the whole file (newlines become '。').
cut_word_list = cutWords(file_name)
# Lookup tables in both directions; mapWords also writes them to disk as JSON.
word_to_index, index_to_word = mapWords(cut_word_list)
# Sliding-window samples, split 90% / 10% into training and validation sets.
X_train, X_val, y_train, y_val = generateTrainData(cut_word_list, word_to_index)
# Trains and saves the model; the returned history is what plotAccuray expects.
history_record = model_lstm(X_train, X_val, y_train, y_val, word_to_index)