# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Data-Preprocessing" data-toc-modified-id="Data-Preprocessing-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Data Preprocessing</a></div><div class="lev2 toc-item"><a href="#Participle" data-toc-modified-id="Participle-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Participle</a></div><div class="lev2 toc-item"><a href="#Word-to-Label" data-toc-modified-id="Word-to-Label-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Word to Label</a></div><div class="lev2 toc-item"><a href="#Word-to-Vector" data-toc-modified-id="Word-to-Vector-13"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Word to Vector</a></div><div class="lev3 toc-item"><a href="#Train" data-toc-modified-id="Train-131"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Train</a></div><div class="lev3 toc-item"><a href="#Test" data-toc-modified-id="Test-132"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Test</a></div><div class="lev1 toc-item"><a href="#Model" data-toc-modified-id="Model-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Model</a></div><div class="lev2 toc-item"><a href="#Set-Hyperparameters" data-toc-modified-id="Set-Hyperparameters-21"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Set Hyperparameters</a></div><div class="lev2 toc-item"><a href="#Custom-Metrics" data-toc-modified-id="Custom-Metrics-22"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Custom Metrics</a></div><div class="lev2 toc-item"><a href="#Builde-Graph" data-toc-modified-id="Builde-Graph-23"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Builde Graph</a></div><div class="lev2 toc-item"><a href="#Data-Generator" data-toc-modified-id="Data-Generator-24"><span class="toc-item-num">2.4&nbsp;&nbsp;</span>Data Generator</a></div><div class="lev2 toc-item"><a href="#Split-Data" data-toc-modified-id="Split-Data-25"><span class="toc-item-num">2.5&nbsp;&nbsp;</span>Split Data</a></div><div class="lev2 toc-item"><a href="#Train" data-toc-modified-id="Train-26"><span 
class="toc-item-num">2.6&nbsp;&nbsp;</span>Train</a></div><div class="lev2 toc-item"><a href="#Predict" data-toc-modified-id="Predict-27"><span class="toc-item-num">2.7&nbsp;&nbsp;</span>Predict</a></div>

# Data Preprocessing

In [1]:
import numpy as np
import pickle
import jieba
import re

In [2]:
# Load the raw corpus and its annotations.
# `content` is indexed in parallel with `core_eneity` (see the previews below).
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles that come from a trusted source.
with open('content.pkl', 'rb') as fp:
    content = pickle.load(fp)
# NOTE(review): `core_eneity` is a misspelling of "core_entity", but later cells
# reference this exact name, so it must stay as is unless they are all renamed.
with open('core-entity.pkl', 'rb') as fp:
    core_eneity = pickle.load(fp)

In [3]:
# Preview a few raw sentences.
content[6:12]

['《醉后一夜》把一对青年男女的相识相爱当悬疑故事来讲是个很好玩的事情，可是导演故事没讲话，玩花哨也没给影片带来多少特别讨喜的内容，只能算是平庸之作。',
 '《醉后决定爱上你》是由台湾梦田文创出品，是《命中注定我爱你》三部曲中的第二部，由陈铭章执导，杨丞琳、张孝全、王传一、  许玮甯等主演。',
 '非常漂亮的一个江南园林，乘坐地铁9号线直接到醉白池。',
 '醉白池乍一听很像最白痴，是江南园林的代表之一。',
 '《罪恶城市》和《Original  Gangstaz》一样是一款黑帮主题的角色扮演类游戏，虽然主题相同，但《罪恶城市》充分认识到了《Original  Gangstaz》的不足之处，改变其简单枯燥的游戏方式，加入法姆维尔风格地图及更精致的人物造型等等。',
 '专辑主打歌‘最熟悉的陌生人’是一首带有台式复古风味的抒情歌曲，MTV选在台中一家别具古味的餐厅拍摄，整个餐厅里头就是一条30年代的台湾老街，很符合主打歌既熟悉又陌生的感觉，MTV的剧情安排王介安经由时光隧道回到过去，看着街上的男男女女，彼此都有似曾相识的感觉，唱片公司老板林秋离更是首次粉墨登场，扮一个马夫酒鬼，负责在一旁抽烟喝酒。']

In [4]:
# Preview the core entities annotated for the same sentences (a list may be empty).
core_eneity[6:12]

[['醉后一夜'],
 ['醉后决定爱上你'],
 [],
 ['醉白池'],
 ['罪恶城市', 'Original  Gangstaz'],
 ['最熟悉的陌生人']]

In [5]:
# Report the mean sentence length in characters, rounded to the nearest integer
# (the printed label reads "average comment length is").
print('评论平均长度为：%d' % (int(round(np.mean([len(i) for i in content])))))

评论平均长度为：54


In [6]:
# Number of tokens kept per sentence; longer inputs are truncated and shorter
# ones zero-padded by gen_x / gen_y.
MAXLEN = 80
# Dimensionality of the word2vec embeddings fed to the network.
EMBEDDING_SIZE = 128

## Participle

In [7]:
# Raw strings are used so that regex escapes such as `\d`, `\.` and `\(` reach
# the `re` module untouched instead of triggering Python's "invalid escape
# sequence" warning.

# Spans that must not be split by the tokenizer:
#   1. runs of ASCII digits / letters / spaces / dots,
#   2. anything between 《 and 》 (non-greedy),
#   3. 1 to 10 characters between “ and ”.
not_cuts = re.compile(r'([\da-zA-Z \.]+)|《(.*?)》|“(.{1,10})”')
# Every character outside this whitelist (CJK ideographs, ASCII alphanumerics,
# book-title marks, parentheses, curly quotes, middle dot, full stop) is noise.
re_replace = re.compile(r'[^\u4e00-\u9fa50-9a-zA-Z《》\(\)（）“”·\.]')

In [8]:
def newcut(s):
    """
    Tokenize a sentence with jieba while protecting certain spans.

    Rules:
    1. Runs of ASCII letters / digits (with spaces and dots) stay as one token.
    2. Text inside 《》 is kept as one token, with the two marks as separate tokens.
    3. Text of at most ten characters inside “” is handled the same way.
    4. Characters outside the allowed set are replaced with a space first.
    5. Everything else is cut by jieba with new-word discovery (HMM) disabled.

    :param s: raw sentence (str)
    :return: list of tokens (str)
    """
    cleaned = re_replace.sub(' ', s)
    tokens = []
    cursor = 0  # end of the previously protected span
    for match in not_cuts.finditer(cleaned):
        begin, end = match.span()
        # Ordinary text between two protected spans goes through jieba.
        tokens += jieba.lcut(cleaned[cursor:begin], HMM=False)
        chunk = cleaned[begin:end]
        if chunk[0] in (u'《', u'“'):
            # opening mark, protected content, closing mark
            tokens += [chunk[0], chunk[1:-1], chunk[-1]]
        else:
            tokens.append(chunk)
        cursor = end
    # Trailing text after the last protected span.
    tokens += jieba.lcut(cleaned[cursor:], HMM=False)
    return tokens

In [9]:
# Tokenize every sentence; words[k] is the token list for content[k].
words = [newcut(s) for s in content]

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/kz/hqjl_dfx3g3_2vxylxlj1s940000gn/T/jieba.cache
Loading model cost 1.312 seconds.
Prefix dict has been built succesfully.


## Word to Label

In [13]:
def word2label(n, word):
    """
    Convert the tokens of sentence ``n`` into a 5-tag label sequence.

    Tags:
    0: not part of a core entity
    1: single-token core entity
    2: first token of a multi-token core entity
    3: inner token of a multi-token core entity
    4: last token of a multi-token core entity

    A token counts as "entity" when it is a substring of any core entity of the
    sentence (read from the module-level ``core_eneity[n]``). Consecutive entity
    tokens are merged into one span.

    :param n: sentence index
    :param word: list of token lists; ``word[n]`` is the sentence to label
    :return: list of int tags, one per token
    """
    tokens = word[n]
    entities = core_eneity[n]
    # Empty tokens (e.g. the content of "《》") are skipped explicitly: '' is a
    # substring of every string and would otherwise always be tagged as entity.
    flags = ''.join(
        '1' if tok and any(tok in ent for ent in entities) else '0'
        for tok in tokens
    )
    tags = [0] * len(tokens)
    # Each run of '1's in the flag string is one entity span.
    for run in re.finditer('1+', flags):
        first, last = run.start(), run.end() - 1
        if first == last:
            tags[first] = 1
        else:
            tags[first] = 2
            tags[last] = 4
            tags[first + 1:last] = [3] * (last - first - 1)
    return tags

In [14]:
# Build the tag sequence for every tokenized sentence; labels[k] aligns with words[k].
labels = [word2label(i, words) for i in range(len(words))]

In [15]:
#随机打乱数据
from sklearn.model_selection import train_test_split
words, _, labels, _ = train_test_split(words, labels, test_size=0., random_state=42)
len(words), len(labels)

(12445, 12445)

## Word to Vector

### Train

In [16]:
import gensim
import logging

Using TensorFlow backend.


In [17]:
# Show gensim's progress messages while training.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)
# Train skip-gram embeddings (sg=1) on the tokenized corpus.
# `words` is passed as a plain list of token lists: wrapping the ragged lists
# in np.array() is unnecessary and raises on recent NumPy versions.
# min_count=1 keeps every token so that gen_x can look up any training word.
word2vec = gensim.models.Word2Vec(words,
                                  min_count=1,
                                  size=EMBEDDING_SIZE,
                                  workers=20,
                                  iter=20,
                                  window=8,
                                  negative=8,
                                  sg=1)
word2vec.save('word2vec.model')
# L2-normalise the vectors in place up front (done after saving, so the file
# on disk keeps the raw vectors).
word2vec.init_sims(replace=True)

2017-07-31 19:57:37,187 : INFO : collecting all words and their counts
2017-07-31 19:57:37,189 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-07-31 19:57:37,275 : INFO : PROGRESS: at sentence #10000, processed 314194 words, keeping 37037 word types
2017-07-31 19:57:37,294 : INFO : collected 42029 word types from a corpus of 390502 raw words and 12445 sentences
2017-07-31 19:57:37,295 : INFO : Loading a fresh vocabulary
2017-07-31 19:57:37,395 : INFO : min_count=1 retains 42029 unique words (100% of original 42029, drops 0)
2017-07-31 19:57:37,395 : INFO : min_count=1 leaves 390502 word corpus (100% of original 390502, drops 0)
2017-07-31 19:57:37,513 : INFO : deleting the raw counts dictionary of 42029 items
2017-07-31 19:57:37,515 : INFO : sample=0.001 downsamples 23 most-common words
2017-07-31 19:57:37,516 : INFO : downsampling leaves estimated 303299 word corpus (77.7% of prior 390502)
2017-07-31 19:57:37,517 : INFO : estimated required memory for 4

### Test 

In [18]:
# Reload the saved embeddings from disk as a sanity check.
# NOTE(review): the name `model` is rebound to the Keras network in the
# graph-building cell further down.
model = gensim.models.Word2Vec.load('word2vec.model')

2017-07-31 19:58:14,532 : INFO : loading Word2Vec object from word2vec.model
2017-07-31 19:58:15,078 : INFO : loading wv recursively from word2vec.model.wv.* with mmap=None
2017-07-31 19:58:15,079 : INFO : setting ignored attribute syn0norm to None
2017-07-31 19:58:15,080 : INFO : setting ignored attribute cum_table to None
2017-07-31 19:58:15,082 : INFO : loaded word2vec.model


In [19]:
# Smoke test: nearest neighbours of the word for "subway".
model.most_similar('地铁')

2017-07-31 19:58:15,192 : INFO : precomputing L2-norms of word weight vectors


[('开通', 0.7537972927093506),
 ('第二条', 0.750749945640564),
 ('公交', 0.7351782321929932),
 ('打车', 0.7187823057174683),
 ('地下铁路', 0.7162506580352783),
 ('白楼', 0.7160401344299316),
 ('坐地铁', 0.7159051895141602),
 ('B C', 0.7105467319488525),
 ('线', 0.7093311548233032),
 ('直达', 0.7091985940933228)]

# Model

##  Set Hyperparameters

In [24]:
NUM_EPOCHS = 256
BATCH_SIZE = 1024
L1_FACTOR = 0.01 # L1 regularisation strength, intended to make the outputs sparser

## Custom Metrics

In [25]:
import tensorflow as tf
import keras.backend as K

In [49]:
def micro_precision(y_true, y_pred):
    """
    Micro-averaged precision.

    Both tensors are clipped to [0, 1] and rounded, then counts are pooled
    over every element: correct positives / predicted positives.
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    flagged = K.round(K.clip(y_pred, 0, 1))
    return K.sum(hits) / (K.sum(flagged) + K.epsilon())

def macro_precision(y_true, y_pred):
    """
    Macro-averaged precision.

    Counts are reduced over axes 0 and 1, leaving one value per class; the
    per-class precisions are summed and divided by the fixed class count 5.
    """
    per_class_hits = tf.reduce_sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=[0, 1])
    per_class_flagged = tf.reduce_sum(K.round(K.clip(y_pred, 0, 1)), axis=[0, 1])
    per_class_precision = per_class_hits / (per_class_flagged + K.epsilon())
    return K.sum(per_class_precision) / 5

def micro_recall(y_true, y_pred):
    """
    Micro-averaged recall.

    Both tensors are clipped to [0, 1] and rounded, then counts are pooled
    over every element: correct positives / actual positives.
    K.epsilon() in the denominator avoids division by zero.
    """
    hits = K.round(K.clip(y_true * y_pred, 0, 1))
    actual = K.round(K.clip(y_true, 0, 1))
    return K.sum(hits) / (K.sum(actual) + K.epsilon())

def macro_recall(y_true, y_pred):
    """
    Macro-averaged recall.

    Counts are reduced over axes 0 and 1, leaving one value per class; the
    per-class recalls are summed and divided by the fixed class count 5.
    """
    per_class_hits = tf.reduce_sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=[0, 1])
    per_class_actual = tf.reduce_sum(K.round(K.clip(y_true, 0, 1)), axis=[0, 1])
    per_class_recall = per_class_hits / (per_class_actual + K.epsilon())
    return K.sum(per_class_recall) / 5

def macro_f1_score(y_true, y_pred):
    """
    Macro-averaged F1 score, usable as a Keras metric.

    Counts are reduced over axes 0 and 1, leaving one value per class; F1 is
    computed per class and averaged over the size of the last axis.

    No special case is needed for batches without positives: the true-positive
    count is then zero as well, and the K.epsilon() terms make every ratio
    evaluate to 0 instead of NaN. The result is always a tensor.

    NOTE(review): a later cell defines a NumPy/sklearn function with the same
    name; once that cell has run, this name refers to the other function.
    """
    c1 = tf.reduce_sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=[0, 1])  # true positives
    c2 = tf.reduce_sum(K.round(K.clip(y_pred, 0, 1)), axis=[0, 1])           # predicted positives
    c3 = tf.reduce_sum(K.round(K.clip(y_true, 0, 1)), axis=[0, 1])           # actual positives
    n_classes = K.int_shape(c1)[-1]

    precision = c1 / (c2 + K.epsilon())
    recall = c1 / (c3 + K.epsilon())
    per_class_f1 = 2 * precision * recall / (precision + recall + K.epsilon())
    # Divide the sum (a float tensor) by the class count; this avoids the
    # integer division `2 / c` that evaluates to 0 on Python 2.
    return K.sum(per_class_f1) / n_classes

## Builde Graph

In [50]:
from keras.layers import Dense, LSTM, TimeDistributed, Input, Masking, Bidirectional 
from keras.models import Model
from keras.utils import np_utils
from keras.regularizers import l1 

In [51]:
# Input: one sentence as MAXLEN embedding vectors of size EMBEDDING_SIZE.
sequence = Input(shape=(MAXLEN, EMBEDDING_SIZE))
# All-zero timesteps are the padding added by gen_x; mask them out.
mask = Masking(mask_value=0.)(sequence)
# Two stacked bidirectional LSTMs; forward and backward outputs are summed.
blstm = Bidirectional(LSTM(64, return_sequences=True), merge_mode='sum')(mask)
blstm = Bidirectional(LSTM(32, return_sequences=True), merge_mode='sum')(blstm)
# Per-timestep softmax over the 5 tags. The L1 activity penalty now comes from
# the L1_FACTOR hyperparameter instead of a duplicated literal.
output = TimeDistributed(Dense(5, activation='softmax', activity_regularizer=l1(L1_FACTOR)))(blstm)
model = Model(inputs=sequence, outputs=output)

In [52]:
# Compile with per-timestep categorical cross-entropy and the macro-F1 metric.
# sample_weight_mode='temporal' lets fit() take a 2-D (samples, timesteps)
# sample_weight array.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=[macro_f1_score], sample_weight_mode='temporal')

## Data Generator

In [30]:
def gen_x(z):
    """
    Turn one tokenized sentence into a fixed-size embedding matrix.

    Tokens beyond MAXLEN are dropped; shorter sentences are padded with
    all-zero rows. An empty sentence yields an all-zero matrix instead of
    failing inside the embedding lookup.

    :param z: list of tokens, each of which must be known to ``word2vec``
    :return: float array of shape (MAXLEN, EMBEDDING_SIZE)
    """
    tokens = z[:MAXLEN]
    gen = np.zeros((MAXLEN, EMBEDDING_SIZE))
    if len(tokens) > 0:
        # One embedding row per token; the remaining rows stay zero (padding).
        gen[:len(tokens)] = word2vec[tokens]
    return gen

In [31]:
def gen_y(z):
    """
    Turn one tag sequence into a one-hot matrix.

    Tags beyond MAXLEN are dropped; shorter sequences are padded with tag 0.

    :param z: list of int tags in 0..4
    :return: one-hot array of shape (MAXLEN, 5)
    """
    clipped = z[:MAXLEN]
    pad_len = MAXLEN - len(clipped)
    tag_ids = np.array(clipped + [0] * pad_len)
    return np_utils.to_categorical(tag_ids, 5)

In [32]:
def data_generator(data, targets, batch_size):
    """
    Endlessly yield (x, y) batches, built lazily to save memory.

    The generator walks through ``data`` in order and restarts from the
    beginning after the last sample. The final batch of a pass contains only
    the remaining samples; it is not padded with all-zero fake samples.

    :param data: list of tokenized sentences (inputs for gen_x)
    :param targets: list of tag sequences (inputs for gen_y), aligned with data
    :param batch_size: maximum number of samples per batch
    :return: generator of (x, y), with x of shape (b, MAXLEN, EMBEDDING_SIZE)
             and y of shape (b, MAXLEN, 5), where b <= batch_size
    :raises ValueError: if ``data`` is empty (raised on the first next())
    """
    total = len(data)
    if total == 0:
        # Without this check the loop would emit empty batches forever.
        raise ValueError('data_generator needs at least one sample')
    start = 0
    while True:
        if start >= total:
            start = 0
        stop = min(start + batch_size, total)
        x = np.zeros((stop - start, MAXLEN, EMBEDDING_SIZE))
        y = np.zeros((stop - start, MAXLEN, 5))
        for row, idx in enumerate(range(start, stop)):
            x[row, :, :] = gen_x(data[idx])
            y[row, :, :] = gen_y(targets[idx])
        start += batch_size
        yield (x, y)

## Split Data

In [33]:
# 70 / 15 / 15 split: hold out 30% first, then cut the held-out part in half
# into validation and test sets.
Xtrain, Xval, Ytrain, Yval = train_test_split(words, labels, test_size=0.3, random_state=42)
Xval, Xtest, Yval, Ytest = train_test_split(Xval, Yval, test_size=0.5, random_state=42)
len(Xtrain), len(Xval), len(Xtest)

(8711, 1867, 1867)

In [34]:
# Materialise every split as dense arrays:
# inputs are (n, MAXLEN, EMBEDDING_SIZE), targets are (n, MAXLEN, 5).
xt = np.array([gen_x(i) for i in Xtrain])
yt = np.array([gen_y(i) for i in Ytrain])
xv = np.array([gen_x(i) for i in Xval])
yv = np.array([gen_y(i) for i in Yval])
xte = np.array([gen_x(i) for i in Xtest])
yte = np.array([gen_y(i) for i in Ytest])

In [35]:
# Sanity check: expect (number of training sentences, MAXLEN, 5).
yt.shape

(8711, 80, 5)

In [36]:
# Re-split into 85% train / 15% validation and rebuild the dense arrays.
# NOTE(review): this overwrites Xtrain/Xval/xt/yt/xv/yv from the 70/15/15 split
# above, while Xtest/xte/yte keep their earlier values. Both splits draw from
# the same `words`, so the old test set presumably overlaps with this training
# set. Confirm before reporting test scores.
Xtrain, Xval, Ytrain, Yval = train_test_split(words, labels, test_size=0.15, random_state=42)
xt = np.array([gen_x(i) for i in Xtrain])
yt = np.array([gen_y(i) for i in Ytrain])
xv = np.array([gen_x(i) for i in Xval])
yv = np.array([gen_y(i) for i in Yval])

In [37]:
import h5py
# Persist the train / validation arrays. The context manager closes the file
# even if one of the writes raises.
with h5py.File('test_ner_data.h5', 'w') as fh:
    fh.create_dataset('train_ner_x', data=xt)
    fh.create_dataset('train_ner_y', data=yt)
    fh.create_dataset('val_ner_x', data=xv)
    fh.create_dataset('val_ner_y', data=yv)

## Train

In [38]:
import numpy as np
from sklearn.metrics import f1_score

In [39]:
def macro_f1_score(x, y_true):
    """
    Macro-averaged F1 of the current ``model`` on tokenized data (via sklearn).

    Sentences and tag sequences are converted with gen_x / gen_y, the model
    predicts a tag distribution per timestep, and both sides are reduced to
    flat tag-id vectors with argmax before scoring. All MAXLEN positions of
    every sentence are scored, including the padded ones.

    NOTE(review): this reuses the name of the Keras metric defined earlier in
    the notebook and replaces it in the namespace from this cell onwards.

    :param x: list of tokenized sentences
    :param y_true: list of tag sequences aligned with ``x``
    :return: macro-averaged F1 as returned by sklearn's f1_score
    """
    features = np.array([gen_x(tokens) for tokens in x])
    onehot_true = np.array([gen_y(tags) for tags in y_true])
    probabilities = model.predict(features)
    flat_true = np.argmax(onehot_true, -1).reshape(-1)
    flat_pred = np.argmax(probabilities, -1).reshape(-1)
    return f1_score(flat_true, flat_pred, average='macro')

In [38]:
#gen_train = data_generator(Xtrain, Ytrain, BATCH_SIZE)
#gen_val = data_generator(Xval, Yval, BATCH_SIZE)

In [36]:
#from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

In [37]:
#tensorboard = TensorBoard(log_dir='tb_logs/1', histogram_freq=1, write_graph=True, write_images=True)
#filepath = 'cp_logs/weights.{epoch:03d}-{val_macro_f1_score:.12f}.hdf5'
#checkpoint = ModelCheckpoint(filepath, monitor='val_macro_f1_score', verbose=0, save_best_only=True)
#earlystopping = EarlyStopping(monitor='val_loss', verbose=1, patience=2)
#reducelr = ReduceLROnPlateau(monitor='val_loss', verbose=1, factor=0.8, patience=3, min_lr=0.0001)

In [41]:
#h = model.fit_generator(gen_train, 
#                        steps_per_epoch=len(Xtrain)//BATCH_SIZE, 
#                        epochs=10, 
#                        validation_data=gen_val, 
#                        validation_steps=len(Xval)//BATCH_SIZE,
#                        verbose=1,
#                        callbacks=[tensorboard, checkpoint])

In [75]:
# One row of per-timestep weights: 80 for the first position down to 1 for the last.
sample_weight = np.arange(80, 0, -1)

In [76]:
# Build the (n_train, MAXLEN) temporal weight matrix in one step: every training
# sentence gets the same descending weights MAXLEN..1. Sizing it from `xt`
# keeps it in sync with the training set, and re-running the cell no longer
# keeps stacking extra rows.
sample_weight = np.tile(np.arange(MAXLEN, 0, -1), (len(xt), 1))

In [77]:
# Sanity check: one weight row per training sentence, one column per timestep.
sample_weight.shape

(10578, 80)

In [78]:
# Train one epoch at a time so the sklearn-based macro F1 on the validation
# set can be recorded after every epoch.
F1_score = []
for e in range(NUM_EPOCHS):
    print('EPOCHS', e+1)
    h = model.fit(x=xt, y=yt, batch_size=BATCH_SIZE, epochs=1, validation_data=(xv, yv), sample_weight=sample_weight)
    # Score once and reuse the value; each call runs a full prediction pass.
    val_f1 = macro_f1_score(Xval, Yval)
    F1_score.append(val_f1)
    print('macro-averaged f1-score = %.6f' % val_f1)

EPOCHS 1
Train on 10578 samples, validate on 1867 samples
Epoch 1/1

KeyboardInterrupt: 

In [106]:
# Alternative run: train for all NUM_EPOCHS in a single call, without
# per-timestep sample weights.
model.fit(x=xt, y=yt, batch_size=BATCH_SIZE, epochs=NUM_EPOCHS, validation_data=(xv, yv))

Train on 8711 samples, validate on 1867 samples
Epoch 1/256
Epoch 2/256


KeyboardInterrupt: 

In [42]:
#F1_score = []
#for e in range(NUM_EPOCHS):    
#    print('EPOCHS', e+1)
#    h = model.fit_generator(gen_train, 
#                            steps_per_epoch=len(Xtrain)//BATCH_SIZE, 
#                            epochs=1, 
#                            validation_data=gen_val, 
#                            validation_steps=len(Xval)//BATCH_SIZE,
#                            verbose=1)
#    F1_score.append(macro_f1_score(Xval, Yval))
#    print('macro-averaged f1-score = %.6f' % (macro_f1_score(Xval, Yval)))  

In [43]:
#model.save_weights('train_1.model')

## Predict

In [44]:
#def predict_data(data, batch_size):
#    """
#    输出预测结果（原始数据，未整理）
#    """
#    data = np.array(data)
#    batches = [range(batch_size*i, min(len(data), batch_size*(i+1))) for i in range(len(data)//batch_size)]
#    p = model.predict(np.array(list(map(gen_x, data[batches[0]]))), verbose=1)
#    for i in batches[1:]:
#        print(min(i), 'done.')
#        p = np.vstack((p, model.predict(np.array(list(map(gen_x, data[i]))), verbose=1)))
#    return p

In [45]:
# predict = predict_data(Xtest, BATCH_SIZE)