In [1]:
# python3
# -*- coding: utf-8 -*-

import numpy as np
from keras.utils import np_utils
from sklearn.model_selection import train_test_split
from keras.models import Model, load_model
from keras.layers import Embedding, LSTM, Dense, Input, Dropout, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import TensorBoard
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
import matplotlib.pyplot as plt
import json
import os
import gc

Using TensorFlow backend.


In [2]:
import keras
import tensorflow as tf
a = tf.test.is_built_with_cuda()
b = tf.test.is_gpu_available(
    cuda_only=False,
    min_cuda_compute_capability=None
)     
print(a)
print(b)


True
True


In [3]:
import tensorflow as tf
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
from keras.backend.tensorflow_backend import set_session
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.8
# set_session(tf.Session(config=config))
from deap import base, creator, tools, algorithms
from sklearn.metrics import mean_squared_error
from deap import base, creator, tools, algorithms
from scipy.stats import bernoulli
from bitstring import BitArray
# SEQ_LENGTH = 3     
MAX_NB_WORDS = 10000    
# EMBEDDING_DIM = 512     
# EMBEDDING_DIM_2 = 512     
# EMBEDDING_DIM_3 = 256
BATCH_SIZE = 256
EPOCHS = 30


In [4]:
def cutWords(file_name):
    with open(file_name, 'r', encoding='utf8') as f:
        content = f.read().replace('\n', '。')   # 使用句号作为句子的结束符
        f.close()
    return list(content)

def mapWords(cut_word_list):
    """
     set word2index and index2word to build dictionary
    :param cut_word_list: Character-level token
    :return:word2index和index2word， key <=> value
    """
    vocabulary = sorted(list(set(cut_word_list)))
    word_to_index = dict((w, i+2) for i, w in enumerate(vocabulary))
    word_to_index["PAD"] = 0   # 填补
    word_to_index["UNK"] = 1   # unknown
    index_to_word = dict((index, word) for word, index in word_to_index.items())

    word_to_index_json = json.dumps(word_to_index)
    index_to_word_json = json.dumps(index_to_word)
    with open('./word_to_index_word.txt', 'w', encoding='utf8') as w:
        w.write(word_to_index_json)
        w.close()
    with open('./index_to_word_word.txt', 'w', encoding='utf8') as w:
        w.write(index_to_word_json)
        w.close()
    # print("len of word_to_index::", len(word_to_index))
    # print("len of index_to_word::", len(index_to_word))
    return word_to_index, index_to_word

In [5]:
def generateTrainData(cut_word_list, word_to_index, SEQ_LENGTH):
    """
    :return:X_train, X_val, y_train, y_val：training and validation
    """
    # 生成训练数据
    X_data = []
    y_data = []
    data_index = []
    n_all_words = len(cut_word_list)
    for i in range(0, n_all_words - SEQ_LENGTH - 1):
        seq_x_y = cut_word_list[i: i+SEQ_LENGTH + 1]   # SEQ_LENGTH Chinese characters correspond to the next (SEQ_LENGTH+1)th Chinese characters
        index_x_y = [word_to_index[elem] for elem in seq_x_y]    
        data_index.append(index_x_y)
#     np.random.shuffle(data_index)
    for i in range(0, len(data_index)):
        X_data.append(data_index[i][:SEQ_LENGTH])
        y_data.append(data_index[i][SEQ_LENGTH])

    #list => tensor
    del data_index
    gc.collect()
    X = np.reshape(X_data, (len(X_data), SEQ_LENGTH))
    del X_data
    gc.collect()
    
    y = np_utils.to_categorical(y_data)
    del y_data
    gc.collect()
    
    
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1, shuffle=False)
    del X, y
    gc.collect()
    
    return X_train, X_val, y_train, y_val

In [6]:
file_name = "./train_data/all_5.txt"
cut_word_list = cutWords(file_name)
word_to_index, index_to_word = mapWords(cut_word_list)

In [8]:
def model_lstm(ga_individual_solution):
    """
    Using Tensorboard as call_back 
    """
#     file_name = "./train_data/all_5.txt"
#     cut_word_list = cutWords(file_name)
#     word_to_index, index_to_word = mapWords(cut_word_list)

    SEQ_LENGTH_bits = BitArray(ga_individual_solution[0:3])
    SEQ_LENGTH = SEQ_LENGTH_bits.uint+1
    
    EMBEDDING_DIM_bits = BitArray(ga_individual_solution[3:]) 
    EMBEDDING_DIM = EMBEDDING_DIM_bits.uint + 1
    
    X_train, X_val, y_train, y_val = generateTrainData(cut_word_list, word_to_index, SEQ_LENGTH)
#     Hidden_size_1_bits = BitArray(ga_individual_solution[8:17]) 
#     Hidden_size_1 = Hidden_size_1_bits.uint
#     Hidden_size_1 = Hidden_size_1+1 if Hidden_size_1==0 else Hidden_size_1
    
#     Hidden_size_2_bits = BitArray(ga_individual_solution[17:]) 
#     Hidden_size_2 = Hidden_size_2_bits.uint
#     Hidden_size_2 = Hidden_size_2+1 if Hidden_size_2==0 else Hidden_size_2
    
#     Hidden_size_3_bits = BitArray(ga_individual_solution[26:]) 
#     Hidden_size_3 = Hidden_size_3_bits.uint
#     Hidden_size_3 = Hidden_size_3+1 if Hidden_size_3==0 else Hidden_size_3
    
    Hidden_size_1 = 512
    Hidden_size_2 = 512
    Hidden_size_3 = 256
    
    print('\nSEQ_LENGTH: ', SEQ_LENGTH, ', EMBEDDING_DIM: ', EMBEDDING_DIM)#, ', Hidden_size_1: ', Hidden_size_1, ', Hidden_size_2 ', Hidden_size_2, ', Hidden_size_3', Hidden_size_3)
    
    #print(X_train.dtype, y_train.dtype)
    nb_words = min(MAX_NB_WORDS, len(word_to_index))
    input_shape = (SEQ_LENGTH,)
    x_train_in = Input(input_shape, dtype='int32', name="x_train")

    # word_index存储的是所有vocabulary的映射关系
    embedding_layer = Embedding(nb_words, EMBEDDING_DIM, input_length=SEQ_LENGTH)(x_train_in)
    print("embedding layer is::", embedding_layer)
    print("build model.....")

    # return_sequences=True表示返回的是序列，否则下面的LSTM无法使用，但是如果下一层不是LSTM，则可以不写
    lstm_1 = Bidirectional(LSTM(Hidden_size_1, name="LSTM_1", return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(embedding_layer)
    #drop_1=Dropout(0.2)(lstm_1)
    lstm_2 = Bidirectional(LSTM(Hidden_size_2, name="LSTM_2", return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(lstm_1)#(drop_1)
    #drop_2=Dropout(0.2)(lstm_2)
    lstm_3 = Bidirectional(LSTM(Hidden_size_3, name="LSTM_3", dropout=0.2, recurrent_dropout=0.2))(lstm_2)#(drop_2)
    #drop_3=Dropout(0.2)(lstm_3)
    dense = Dense(nb_words, activation="softmax", name="Dense_1")(lstm_3)#(drop_3)

    model = Model(inputs=x_train_in, outputs=dense)
    #print(model.summary())

    adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(loss='categorical_crossentropy',
                  optimizer=adam,
                  metrics=['accuracy'])
    print("Train....")

    # save tensorboard info
#     tensorboard = TensorBoard(log_dir='./tensorboard_log/')
#     # save best model.
#     checkpoint = ModelCheckpoint(filepath='./model_epoch50_2lstm_1dense_seq50_phrase_based_best.h5',
#                                  monitor='val_loss', mode='min', save_best_only=True, save_weights_only=False, period=1, verbose=1)
#     reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)
#     callback_list = [tensorboard, checkpoint, reduce]

    history_record = model.fit(X_train, y_train,
                            batch_size=BATCH_SIZE,
                            validation_data=(X_val, y_val),
                            epochs=EPOCHS
                             )
    #model.save('./model_epoch50_2lstm_1dense_seq50_phrase_based_best.h5')
#     evaluate_val = model.evaluate(X_train, y_train, batch_size=BATCH_SIZE)
    print('\nloss: ', history_record.history['loss'][-1], ', val_loss: ', history_record.history['val_loss'][-1])
#     print(model.metrics_names)
    del X_train, X_val, y_train, y_val
    gc.collect()
    del x_train_in, embedding_layer, lstm_1, lstm_2, lstm_3, dense, model
    gc.collect()
    
    loss = history_record.history['loss'][-1]
    val_loss = history_record.history['val_loss'][-1]
    del history_record
    gc.collect()


    return loss, val_loss

In [9]:
population_size = 6
num_generations = 6
gene_length = 11

# As we are trying to minimize the Loss and the Validation_loss, that's why using -1.0. 
# In case, when you want to maximize accuracy for instance, use 1.0
creator.create('FitnessMulti', base.Fitness, weights = (-1.0, -1.0))
creator.create('Individual', list , fitness = creator.FitnessMulti)

toolbox = base.Toolbox()
toolbox.register('binary', bernoulli.rvs, 0.5)
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.binary, n = gene_length)
toolbox.register('population', tools.initRepeat, list , toolbox.individual)

toolbox.register('mate', tools.cxOrdered)
toolbox.register('mutate', tools.mutShuffleIndexes, indpb = 0.6)
toolbox.register('select', tools.selRoulette)
toolbox.register('evaluate', model_lstm)

population = toolbox.population(n = population_size)
r = algorithms.eaSimple(population, toolbox, cxpb = 0.4, mutpb = 0.2, ngen = num_generations, verbose = False)


SEQ_LENGTH:  7 , EMBEDDING_DIM:  74
embedding layer is:: Tensor("embedding_1/embedding_lookup/Identity:0", shape=(?, 7, 74), dtype=float32)
build model.....
Train....
Train on 137200 samples, validate on 15245 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  1.6089082196294044 , val_loss:  7.185605713311631

SEQ_LENGTH:  8 , EMBEDDING_DIM:  44
embedding layer is:: Tensor("embedding_2/embedding_lookup/Identity:0", shape=(?, 8, 44), dtype=float32)
build model.....
Train....
Train on 137199 samples, validate on 15245 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 

Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  1.8734254245416622 , val_loss:  6.993621157505114

SEQ_LENGTH:  7 , EMBEDDING_DIM:  130
embedding layer is:: Tensor("embedding_3/embedding_lookup/Identity:0", shape=(?, 7, 130), dtype=float32)
build model.....
Train....
Train on 137200 samples, validate on 15245 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  1.3856628734516332 , val_loss:  7.271111948242263

SEQ_LENGTH:  7 , EMBEDDING_DIM:  229
embedding layer is:: Tensor("embedding_4/embedding_lookup/Identity:0", shape=(?, 7, 229), dtype=float32)
build model.....
Train....
Train on 137200 samples, validate on 15245 samples
Epoch 1/30
Epoch 2/

Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  2.1699892168954547 , val_loss:  7.001917962145907

SEQ_LENGTH:  1 , EMBEDDING_DIM:  152
embedding layer is:: Tensor("embedding_7/embedding_lookup/Identity:0", shape=(?, 1, 152), dtype=float32)
build model.....
Train....
Train on 137205 samples, validate on 15246 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  4.091650811084822 , val_loss:  6.002639337332249

SEQ_LENGTH:  6 , EMBEDDING_DIM:  249
embedding layer is:: Tensor("embedding_8/embedd

Epoch 30/30

loss:  2.52865066091641 , val_loss:  6.79207150839946

SEQ_LENGTH:  1 , EMBEDDING_DIM:  119
embedding layer is:: Tensor("embedding_10/embedding_lookup/Identity:0", shape=(?, 1, 119), dtype=float32)
build model.....
Train....
Train on 137205 samples, validate on 15246 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  4.09645114433122 , val_loss:  5.990752801328417

SEQ_LENGTH:  2 , EMBEDDING_DIM:  31
embedding layer is:: Tensor("embedding_11/embedding_lookup/Identity:0", shape=(?, 2, 31), dtype=float32)
build model.....
Train....
Train on 137205 samples, validate on 15245 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8

Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  2.5597767844248267 , val_loss:  6.7199353356954115

SEQ_LENGTH:  1 , EMBEDDING_DIM:  152
embedding layer is:: Tensor("embedding_12/embedding_lookup/Identity:0", shape=(?, 1, 152), dtype=float32)
build model.....
Train....
Train on 137205 samples, validate on 15246 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  4.065287337319322 , val_loss:  6.013496304721825

SEQ_LENGTH:  1 , EMBEDDING_DIM:  152
embedding layer is:: Tensor("embedding_13/embedding_lookup/Identity:0", shape=(?, 1, 152), dtype=float32)
build model.....
Train....
Train on 137205 samples, validate on 15246 samples
Epoch

Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  4.072361211747888 , val_loss:  5.984848031468487

SEQ_LENGTH:  1 , EMBEDDING_DIM:  152
embedding layer is:: Tensor("embedding_14/embedding_lookup/Identity:0", shape=(?, 1, 152), dtype=float32)
build model.....
Train....
Train on 137205 samples, validate on 15246 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  4.076203750683891 , val_loss:  5.980354868425842

SEQ_LENGTH:  1 , EMBEDDING_DIM:  152
embedding layer is:: Tensor("embedding_15/embedding_lookup/Identity:0", shape=(?, 1, 152), dtype=float32)
b

Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  4.086204052385003 , val_loss:  6.007884079181161

SEQ_LENGTH:  1 , EMBEDDING_DIM:  152
embedding layer is:: Tensor("embedding_16/embedding_lookup/Identity:0", shape=(?, 1, 152), dtype=float32)
build model.....
Train....
Train on 137205 samples, validate on 15246 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  4.082714736792234 , val_loss:  5.99499081570724

SEQ_LENGTH:  6 , EMBEDDING_DIM:  105
embedding layer is:: Tensor("embeddi

Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  1.283999028878791 , val_loss:  7.374849812137607

SEQ_LENGTH:  5 , EMBEDDING_DIM:  169
embedding layer is:: Tensor("embedding_18/embedding_lookup/Identity:0", shape=(?, 5, 169), dtype=float32)
build model.....
Train....
Train on 137202 samples, validate on 15245 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30



loss:  0.9445690258341929 , val_loss:  7.6297400824043855

SEQ_LENGTH:  2 , EMBEDDING_DIM:  88
embedding layer is:: Tensor("embedding_19/embedding_lookup/Identity:0", shape=(?, 2, 88), dtype=float32)
build model.....
Train....
Train on 137205 samples, validate on 15245 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30

loss:  2.2274053323799317 , val_loss:  6.940214381385295


In [10]:
best_individuals = tools.selBest(population,k = 1)

SEQ_LENGTH = None
EMBEDDING_DIM = None

for bi in best_individuals:
    SEQ_LENGTH_bits = BitArray(bi[0:3])
    SEQ_LENGTH = SEQ_LENGTH_bits.uint

    EMBEDDING_DIM_bits = BitArray(bi[3:]) 
    EMBEDDING_DIM = EMBEDDING_DIM_bits.uint
    
    print('\nSEQ_LENGTH: ', SEQ_LENGTH, ', EMBEDDING_DIM: ', EMBEDDING_DIM)#, ', Hidden_size_1: ', Hidden_size_1, ', Hidden_size_2 ', Hidden_size_2)#, ', Hidden_size_3', Hidden_size_3)


SEQ_LENGTH:  4 , EMBEDDING_DIM:  168


In [4]:
def loadParam(model_file, word2index_file, index2word_file):
    """
    load model and word2index_file, index2word_file
    :param model_file:
    :param word2index_file:
    :param index2word_file:
    :return:
    """
    # get model.
    model = load_model(model_file)
    # get the word2index and index2word data.
    with open(word2index_file, 'r', encoding='utf8') as f:
        json_obj = f.read()
        word2index = json.loads(json_obj)
        f.close()
    with open(index2word_file, 'r', encoding='utf8') as f:
        json_obj = f.read()
        index2word = json.loads(json_obj)
        f.close()
    index2word_new = {}
    for key, value in index2word.items():
        index2word_new[int(key)] = value
    return model, word2index, index2word_new

def sample(preds, diversity = 1.0):
    """
    get the max probability index.
    :param preds: prediction
    :param diversity:
    :return:
    """
    preds = np.asarray(preds).astype("float64")
    preds = np.log(preds + 1e-10) / diversity
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

def lyrics_generate(start, model, word2index, index2word, SEQ_LENGTH, generate_maxlen):
    """
    generate lyrics according start sentence.
    :param start: startWith sentence
    :param model:
    :param word2index:
    :param index2word:
    :param maxlen: the length of generating sentence.
    :return:
    """
    sentence = start[:SEQ_LENGTH]   
    diversity = 1.0
    while len(sentence) < generate_maxlen:
        
        x_pred = np.zeros((1, SEQ_LENGTH))    

        min_index = max(0, len(sentence) - SEQ_LENGTH)    
        for idx in range(min_index, len(sentence)):
            x_pred[0, SEQ_LENGTH - len(sentence) + idx] = word2index.get(sentence[idx], 1)  

        preds = model.predict(x_pred, verbose=0)[0]   
        next_index = sample(preds, diversity)   
        next_word = index2word[next_index]
        if not (next_word == '。' and sentence[-1] == '。'):   
            sentence = sentence + next_word  
    return sentence

In [22]:
BATCH_SIZE = 512
EPOCHS = 50
SEQ_LENGTH = 5
EMBEDDING_DIM = 169

X_train, X_val, y_train, y_val = generateTrainData(cut_word_list, word_to_index, SEQ_LENGTH)

Hidden_size_1 = 512
Hidden_size_2 = 512
Hidden_size_3 = 256

print('\nSEQ_LENGTH: ', SEQ_LENGTH, ', EMBEDDING_DIM: ', EMBEDDING_DIM, ', Hidden_size_1: ', Hidden_size_1, ', Hidden_size_2 ', Hidden_size_2, ', Hidden_size_3', Hidden_size_3)

nb_words = min(MAX_NB_WORDS, len(word_to_index))
input_shape = (SEQ_LENGTH,)
x_train_in = Input(input_shape, dtype='int32', name="x_train")

# word_index存储的是所有vocabulary的映射关系
embedding_layer = Embedding(nb_words, EMBEDDING_DIM, input_length=SEQ_LENGTH)(x_train_in)
print("embedding layer is::", embedding_layer)
print("build model.....")

# return_sequences=True表示返回的是序列，否则下面的LSTM无法使用，但是如果下一层不是LSTM，则可以不写
lstm_1 = Bidirectional(LSTM(Hidden_size_1, name="LSTM_1", return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(embedding_layer)
#drop_1=Dropout(0.2)(lstm_1)
lstm_2 = Bidirectional(LSTM(Hidden_size_2, name="LSTM_2", return_sequences=True, dropout=0.2, recurrent_dropout=0.2))(lstm_1)#(drop_1)
#drop_2=Dropout(0.2)(lstm_2)
lstm_3 = Bidirectional(LSTM(Hidden_size_3, name="LSTM_3", dropout=0.2, recurrent_dropout=0.2))(lstm_2)#(drop_2)
#drop_3=Dropout(0.2)(lstm_3)
dense = Dense(nb_words, activation="softmax", name="Dense_1")(lstm_3)#(drop_3)

model = Model(inputs=x_train_in, outputs=dense)
print(model.summary())

adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])
print("Train....")

# save tensorboard info
tensorboard = TensorBoard(log_dir='./tensorboard_log/')
# save best model.
checkpoint = ModelCheckpoint(filepath='./model_epoch50_2lstm_1dense_seq50_phrase_based_best.h5',
                             monitor='val_loss', mode='min', save_best_only=True, save_weights_only=False, period=1, verbose=1)
reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5)
callback_list = [tensorboard, checkpoint, reduce]

history_record = model.fit(X_train, y_train,
                        batch_size=BATCH_SIZE,
                        validation_data=(X_val, y_val),
                        epochs=EPOCHS,
                        callbacks=callback_list
                         )
model.save('./model_epoch50_2lstm_1dense_seq50_phrase_based_best.h5')

del X_train, X_val, y_train, y_val
gc.collect()
del x_train_in, embedding_layer, lstm_1, lstm_2, lstm_3, dense, model
gc.collect()


SEQ_LENGTH:  5 , EMBEDDING_DIM:  169 , Hidden_size_1:  512 , Hidden_size_2  512 , Hidden_size_3 256
embedding layer is:: Tensor("embedding_3/embedding_lookup/Identity:0", shape=(?, 5, 169), dtype=float32)
build model.....
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
x_train (InputLayer)         (None, 5)                 0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 5, 169)            576628    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 5, 1024)           2793472   
_________________________________________________________________
bidirectional_8 (Bidirection (None, 5, 1024)           6295552   
_________________________________________________________________
bidirectional_9 (Bidirection (None, 512)               2623488   
___________________________________________________


Epoch 00033: val_loss did not improve from 5.67287
Epoch 34/50

Epoch 00034: val_loss did not improve from 5.67287
Epoch 35/50

Epoch 00035: val_loss did not improve from 5.67287
Epoch 36/50

Epoch 00036: val_loss did not improve from 5.67287
Epoch 37/50

Epoch 00037: val_loss did not improve from 5.67287
Epoch 38/50

Epoch 00038: val_loss did not improve from 5.67287
Epoch 39/50

Epoch 00039: val_loss did not improve from 5.67287
Epoch 40/50

Epoch 00040: val_loss did not improve from 5.67287
Epoch 41/50

Epoch 00041: val_loss did not improve from 5.67287
Epoch 42/50

Epoch 00042: val_loss did not improve from 5.67287
Epoch 43/50

Epoch 00043: val_loss did not improve from 5.67287
Epoch 44/50

Epoch 00044: val_loss did not improve from 5.67287
Epoch 45/50

Epoch 00045: val_loss did not improve from 5.67287
Epoch 46/50

Epoch 00046: val_loss did not improve from 5.67287
Epoch 47/50

Epoch 00047: val_loss did not improve from 5.67287
Epoch 48/50

Epoch 00048: val_loss did not improve f

0

In [8]:
SEQ_LENGTH = 5
model_file = './model_epoch50_2lstm_1dense_seq50_phrase_based_best.h5'
word2index_file = './word_to_index_word.txt'
index2word_file = './index_to_word_word.txt'
model, word2index, index2word = loadParam(model_file, word2index_file, index2word_file)
generate_maxlen = 200

In [11]:
start = "边塞"
print(lyrics_generate(start, model, word2index, index2word, SEQ_LENGTH, generate_maxlen))

边塞前花染红尘。帝澜我我梦里。藏我纵然年华。或当老酒自我。烬窗下。满琴意。任罪为行。用明月极胸丘。人人绘骨过往。最低心。像天灵兰唱。烈与墨怀覆。为君共赏眷轻怨念。心年谁敌解。乐绎与你长何。年少转途娥。斜阳寒寒。结紫罗缠绵重规连。像一端十尘。弦风急。可有平生不曾负得。桃花坞。结归去。如得当日孤行中。此后。童法枯藤。抚剑阁卷。埋葬了谁的称慕。衷的心酒寻画。等松风泣夜。说人抚琴枯凉。痛伴透过数泊的儿望。


In [19]:
def plotAccuray(history_record):
    """
    plot the accuracy and loss line. 
    :param history_record:
    :return:
    """
    accuracy_train = history_record.history["acc"]
    accuracy_val= history_record.history["val_acc"]
    loss_train = history_record.history["loss"]
    loss_val = history_record.history["val_loss"]
    epochs = range(len(accuracy_train))
    plt.plot(epochs, accuracy_train, 'bo', label='Training accuracy')
    plt.plot(epochs, accuracy_val, 'b', label='Validation accuracy')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.figure()
    plt.plot(epochs, loss_train, 'bo', label='Training loss')
    plt.plot(epochs, loss_val, 'b', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    plt.show()


In [None]:
# file_name = "./train_data/all_5.txt"
# cut_word_list = cutWords(file_name)
# word_to_index, index_to_word = mapWords(cut_word_list)
# X_train, X_val, y_train, y_val = generateTrainData(cut_word_list, word_to_index)
# history_record = model_lstm(X_train, X_val, y_train, y_val, word_to_index)