# import lib

In [None]:
# Reference example: https://github.com/sunriver/keras-word-char-embd/blob/master/keras_wc_embd/wrapper.py
# Build the model with the Keras functional API.

# load data

# preprocess data

In [64]:
from collections import Counter
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import one_hot
import numpy as np

In [65]:

# Step 0: toy corpus of three pre-tokenised sentences (each a list of word
# strings, with mixed case and different lengths) used as the only training data.
sentences = [
    ['All', 'work', 'and', 'no', 'play'],
    ['makes', 'Jack', 'a', 'dull', 'boy'],
     ['Please', 'tell', 'me', 'how', 'to', 'play', 'this', 'game'],
]

# sentences_nums = len(sentences)


def onehot_and_pad_words(sentences: list):
    """Encode tokenised sentences as padded sequences of vocabulary indices.

    Pipeline: collect the lower-cased vocabulary, give every word an index in
    first-seen order, map each sentence to its index sequence, then pad all
    sequences to the length of the longest sentence.

    Args:
        sentences: list of sentences, each sentence being a list of word
            strings.

    Returns:
        A 3-tuple ``((sentences_num, max_sentence_len), word_seqs, vocab_size)``:
        the number of sentences and the longest sentence length, the padded
        index sequences as a NumPy array (one row per sentence), and the
        number of distinct lower-cased words.

    Raises:
        ValueError: if ``sentences`` is empty (``max`` of an empty sequence).
    """
    # Step 1: vocabulary of distinct lower-cased words, in first-seen order.
    word_counts = Counter(word.lower() for sen in sentences for word in sen)
    vocab = list(word_counts)

    # Step 2: word -> index lookup table (indices start at 0).
    word2ids = {word: index for index, word in enumerate(vocab)}

    # Step 3: turn every sentence into its sequence of word indices.
    # The vocabulary keys are lower-cased, so the lookup must lower-case too;
    # otherwise capitalised words ('All', 'Jack', ...) miss the table.
    X_sentences = [
        [word2ids[word.lower()] for word in sentence]
        for sentence in sentences
    ]

    # Step 4: align all sequences to the longest sentence.
    # NOTE(review): pad_sequences is used with its defaults, which pad with 0;
    # index 0 is also assigned to the first vocabulary word, so padding and
    # that word are indistinguishable downstream. Confirm whether index 0
    # should be reserved for padding.
    max_sentence_len = max(map(len, sentences))
    word_seqs = pad_sequences(X_sentences, maxlen=max_sentence_len)

    sentences_num = len(sentences)
    return (sentences_num, max_sentence_len), np.asarray(word_seqs), len(vocab)



def onehot_and_pad_chars(sentences : list):
    """Encode every word of every sentence as a zero-padded row of character ids.

    Characters are lower-cased and numbered from 0 in first-seen order. The
    result is a 3-D grid (sentence, word, character). Because the data is 3-D
    it is padded by hand here: short words are right-padded with 0 and short
    sentences are completed with all-zero word rows.

    Note that id 0 is both the pad value and the id of the first character
    seen.

    Args:
        sentences: list of sentences, each a list of word strings.

    Returns:
        ``((sentences_num, sentences_max_len, words_max_len), chars_seqs,
        charset_size)`` where ``chars_seqs`` is the NumPy array of character
        ids and ``charset_size`` is the number of distinct lower-cased
        characters.
    """
    # Step 1: frequency table of lower-cased characters (printed for inspection).
    char_counts = Counter(
        char.lower()
        for sentence in sentences
        for word in sentence
        for char in word
    )
    print(char_counts)
    chars_set = list(char_counts)

    # Step 2: character -> id lookup table.
    char2ids = {symbol: position for position, symbol in enumerate(chars_set)}

    # Step 3: dimensions of the padded grid.
    sentences_num = len(sentences)
    sentences_max_len = max(len(sentence) for sentence in sentences)
    words_max_len = max(len(word) for sentence in sentences for word in sentence)

    # Step 4: build each sentence block row by row, padding as we go.
    chars_seqs = []
    for sentence in sentences:
        word_rows = []
        for word in sentence:
            ids = [char2ids.get(char.lower()) for char in word]
            word_rows.append(ids + [0] * (words_max_len - len(ids)))
        missing_words = sentences_max_len - len(sentence)
        word_rows.extend([0] * words_max_len for _ in range(missing_words))
        chars_seqs.append(word_rows)

    shape = (sentences_num, sentences_max_len, words_max_len)
    return shape, np.asarray(chars_seqs), len(chars_set)



    

# define model

In [66]:


from keras.models import Model
from keras.layers import Embedding
from keras.layers import Softmax
from keras.layers import Bidirectional
from keras.layers import LSTMCell
from keras.layers import LSTM
from keras.layers import Input
from keras.layers import Concatenate
from keras.layers import TimeDistributed
from keras.layers import Dense
from keras.optimizers import Adam
from keras.losses import categorical_crossentropy
from keras.metrics import categorical_accuracy
from keras.losses import sparse_categorical_crossentropy
from keras.metrics import sparse_categorical_accuracy
from keras.callbacks import TensorBoard



In [70]:
#input_shape
def get_inputs_outputs_layer2(
     word_dict_len,
     doc_words_max_len,
     char_dict_len,
     word_chars_max_len
    ):
    """Build the input and output tensors of the word + character model.

    This variant declares its inputs with ``Input(shape=...)`` (batch
    dimension implicit).

    Two branches are built and joined per word:
      * word branch: word ids -> 20-dim word embedding;
      * char branch: char ids -> 15-dim char embedding -> bidirectional LSTM
        applied to each word's characters through ``TimeDistributed``.
    The concatenated per-word features go through a second LSTM and a
    2-unit softmax ``Dense`` layer.

    Args:
        word_dict_len: size of the word vocabulary (word embedding ``input_dim``).
        doc_words_max_len: number of words per (padded) sentence.
        char_dict_len: size of the character set (char embedding ``input_dim``).
        word_chars_max_len: number of characters per (padded) word.

    Returns:
        ``([input_word, input_char], layer_softmax)``: the two input tensors
        and the softmax output tensor, ready to be passed to ``Model``.
    """
    # Word branch: one id per word in the sentence.
    input_word = Input(shape=(doc_words_max_len,), name='input_word')
    layer_word_embed = Embedding(input_dim=word_dict_len, output_dim=20, name="embed_word")(input_word)

    # Character branch: one id per character of every word.
    input_char = Input(shape=(doc_words_max_len, word_chars_max_len), name='input_char')
    char_embed_out_dim = 15
    layer_char_embed = Embedding(input_dim=char_dict_len, output_dim=char_embed_out_dim, name="embed_char")(input_char)
    # No input_shape on the inner LSTM: the layer is called on an existing
    # tensor, so its input shape is inferred, and the 3-tuple previously
    # passed here did not describe what the LSTM receives under TimeDistributed.
    layer_char_bilstm = Bidirectional(LSTM(units=15))

    # Run the BiLSTM over the characters of each word independently.
    layer_char_timedistributed = TimeDistributed(layer=layer_char_bilstm, name="time_distributed")(layer_char_embed)

    layer_concat = Concatenate(name='concate')([layer_word_embed, layer_char_timedistributed])
    # 3-D -> 2-D: this LSTM reduces the per-word sequence to a single vector.
    layer_lstm2 = LSTM(units=5, name='LSTM2')(layer_concat)
    layer_softmax = Dense(units=2, activation='softmax', name='Softmax')(layer_lstm2)

    return [input_word, input_char], layer_softmax


#batch_shape
def get_inputs_outputs_layer(
     word_dict_len,
     doc_words_max_len,
     char_dict_len,
     word_chars_max_len
    ):
    """Build the input and output tensors of the word + character model.

    This variant declares its inputs with ``Input(batch_shape=...)``, spelling
    out the batch dimension explicitly as ``None``.

    Two branches are built and joined per word:
      * word branch: word ids -> 20-dim word embedding;
      * char branch: char ids -> 15-dim char embedding -> bidirectional LSTM
        applied to each word's characters through ``TimeDistributed``.
    The concatenated per-word features go through a second LSTM and a
    2-unit softmax ``Dense`` layer.

    Args:
        word_dict_len: size of the word vocabulary (word embedding ``input_dim``).
        doc_words_max_len: number of words per (padded) sentence.
        char_dict_len: size of the character set (char embedding ``input_dim``).
        word_chars_max_len: number of characters per (padded) word.

    Returns:
        ``([input_word, input_char], layer_softmax)``: the two input tensors
        and the softmax output tensor, ready to be passed to ``Model``.
    """
    # Word branch: one id per word in the sentence.
    input_word = Input(batch_shape=(None, doc_words_max_len), name='input_word')
    layer_word_embed = Embedding(input_dim=word_dict_len, output_dim=20, name="embed_word")(input_word)

    # Character branch: one id per character of every word.
    input_char = Input(batch_shape=(None, doc_words_max_len, word_chars_max_len), name='input_char')
    char_embed_out_dim = 15
    layer_char_embed = Embedding(input_dim=char_dict_len, output_dim=char_embed_out_dim, name="embed_char")(input_char)
    # No input_shape on the inner LSTM: the layer is called on an existing
    # tensor, so its input shape is inferred, and the 3-tuple previously
    # passed here did not describe what the LSTM receives under TimeDistributed.
    layer_char_bilstm = Bidirectional(LSTM(units=15))

    # Run the BiLSTM over the characters of each word independently.
    layer_char_timedistributed = TimeDistributed(layer=layer_char_bilstm, name="time_distributed")(layer_char_embed)

    layer_concat = Concatenate(name='concate')([layer_word_embed, layer_char_timedistributed])
    # 3-D -> 2-D: this LSTM reduces the per-word sequence to a single vector.
    layer_lstm2 = LSTM(units=5, name='LSTM2')(layer_concat)
    layer_softmax = Dense(units=2, activation='softmax', name='Softmax')(layer_lstm2)

    return [input_word, input_char], layer_softmax


# Encode the toy corpus at word level: padded word-index sequences plus the
# sizes needed to dimension the model.
(sentences_num, max_sentence_len), word_seqs, word_dict_len = onehot_and_pad_words(sentences)

# Encode the same corpus at character level. sentences_num is rebound here;
# both encoders derive it from len(sentences).
(sentences_num, sentences_max_len, words_max_len), chars_seqs, char_dict_len = onehot_and_pad_chars(sentences)

# Alternative builder that declares the inputs with batch_shape instead of shape:
# inputs, outputs = get_inputs_outputs_layer(word_dict_len, max_sentence_len, char_dict_len, words_max_len)

inputs, outputs = get_inputs_outputs_layer2(word_dict_len, max_sentence_len, char_dict_len, words_max_len)

model = Model(inputs = inputs, outputs = outputs)

# Note: the loss and metric chosen here must match the softmax layer, which
# outputs two values; the sparse variants take integer class labels.
# https://blog.csdn.net/qq_20011607/article/details/89213908
model.compile(optimizer = Adam(), loss = sparse_categorical_crossentropy, metrics=[sparse_categorical_accuracy])
model.summary()

    

Counter({'a': 9, 'l': 9, 'e': 6, 'o': 5, 'k': 3, 'p': 3, 'y': 3, 'm': 3, 's': 3, 't': 3, 'w': 2, 'n': 2, 'd': 2, 'h': 2, 'r': 1, 'j': 1, 'c': 1, 'u': 1, 'b': 1, 'i': 1, 'g': 1})
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_char (InputLayer)         (None, 8, 6)         0                                            
__________________________________________________________________________________________________
input_word (InputLayer)         (None, 8)            0                                            
__________________________________________________________________________________________________
embed_char (Embedding)          (None, 8, 6, 15)     315         input_char[0][0]                 
__________________________________________________________________________________________________
embed_word (Embedding)        

In [71]:
# Make sure both model inputs are NumPy arrays (the encoders above already
# return arrays, so these conversions are defensive).
word_seqs = np.asarray(word_seqs)
chars_seqs = np.asarray(chars_seqs)
# One integer class label per sentence of the toy corpus (three sentences).
labels = np.asarray([ 0, 1, 1])

def batch_generator():
    """Yield the whole toy dataset as one batch, forever.

    Each item is ``([word_seqs, chars_seqs], labels)``, built from the
    module-level arrays: the two model inputs in the order the model expects
    them, followed by the targets.
    """
    while True:
        batch_inputs = [word_seqs, chars_seqs]
        yield batch_inputs, labels


def get_callbacks():
    """Return the training callbacks: a single TensorBoard logger writing to ./logs."""
    tensorboard_options = dict(
        log_dir='./logs',              # log directory
        histogram_freq=0,              # how often (in epochs) to compute histograms; 0 disables
        write_graph=True,              # whether to store the network graph
        write_grads=True,              # whether to visualise gradient histograms
        write_images=True,             # whether to visualise parameters
        embeddings_freq=0,
        embeddings_layer_names=None,
        embeddings_metadata=None,
    )
    return [TensorBoard(**tensorboard_options)]
    
# Train for one epoch of 200 steps; every step draws one batch from
# batch_generator(), and progress is logged through the TensorBoard callback.
# NOTE(review): newer Keras releases deprecate fit_generator in favour of
# fit; confirm against the installed version before upgrading.
model.fit_generator(
    generator=batch_generator(),
    steps_per_epoch=200,
    epochs=1,
    callbacks= get_callbacks()
)

# model.fit(np.asarray([np.asarray(word_seqs), np.asarray(chars_seqs)]), np.asarray([0, 1]),
#           steps_per_epoch=200,
#           epochs=1
#          )

Epoch 1/1


<keras.callbacks.History at 0x14fcbe0f0>