In [1]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
from collections import Counter
import os

In [2]:
os.environ['CUDA_VISIBLE_DEVICES'] = '3'

###  语料处理

In [3]:
# 没有找到人民日报2014语料
# 使用msr语料

# 使用bmes标记

char_counter = Counter()
texts = []
tags = []

stops = u'，。！？；、：,\.!\?;:\n'
for line in tqdm(open("/home/xueyou/tmp/icwb2-data/training/msr_training.utf8")):
    txt = [i.strip() for i in re.split('['+stops+']',line) if i.strip()]
    for t in txt:
        texts.append("")
        tags.append('')
        for w in re.split(" +",t):
            texts[-1] += w
            char_counter.update(list(w))
            if len(w) == 1:
                tags[-1] += 's'
            else:
                tags[-1] += 'b' + 'm'*(len(w)-2) + 'e'    

86924it [00:19, 4398.16it/s]


In [4]:
texts[-1]

'响起在农村大地上的钟声－－－看电视纪录片《村民的选择》'

In [5]:
tags[-1]

'sssbebessbessssbebmesbesbes'

In [6]:
len(texts)

314148

In [7]:
word2id = {'<unk>':0,'<pad>':1}
min_count = 2
for w,cnt in char_counter.most_common():
    if cnt >= min_count:
        word2id[w] = len(word2id)
    else:
        break

In [8]:
len(word2id)

4728

In [38]:
target_word2id = {"b":0,"m":1,"e":2,"s":3}

In [10]:
from scipy import stats
stats.describe([len(s) for s in texts])

DescribeResult(nobs=314148, minmax=(1, 581), mean=11.908431694615278, variance=93.847848128829185, skewness=20.03770976920095, kurtosis=1081.4968389673104)

In [97]:
from sklearn.utils import shuffle

def pad(s,max_len,pad_char):
        s = s[:max_len]
        s = s + [pad_char] * (max_len-len(s))
        return s
    
def data_iter(batch_size,max_len):
    global texts,tags
    x = []
    y = []
    
    ltexts,ltags = shuffle(texts,tags)
    
    max_batch = -1
    for text,tag in zip(ltexts,ltags):
        if len(x) == batch_size:
            
            max_len = min(max_len,max_batch)
            max_batch = -1
            tmp_x,tmp_y=[],[]
            for text,tag in zip(x,y):
                text = pad(text,max_len,1)
                tag = pad(tag,max_len,3)
                tmp_x.append(text)
                tmp_y.append(tag)
            yield tmp_x,tmp_y
            
            x=[]
            y=[]

        text = [word2id.get(t,0) for t in text]
        tag = [target_word2id.get(t,3) for t in tag]
        max_batch = max(max_batch,len(text))
            
        x.append(text)
        y.append(tag)
        

### 定义和训练模型

In [12]:
import tensorflow as tf

In [13]:
tf.__version__

'1.3.0'

In [179]:
class Config():
    embedding_size = 128
    keep_prob = 0.5
    max_len = 80
    batch_size = 1024
    src_vocab_size = len(word2id)
    tgt_vocab_size = len(target_word2id)
    train = True

In [197]:
class CNN_Model(object):
    def __init__(self,config):
        self.config = config
        self.build_model()
    
    def build_model(self):
        embedding_size = self.config.embedding_size
        src_vocab_size = self.config.src_vocab_size
        tgt_vocab_size = self.config.tgt_vocab_size
        train = self.config.train
        
        keep_prob_p = tf.placeholder(tf.float32)
        x = tf.placeholder(tf.int32,[None,None])
        batch_size = tf.shape(x)[0]
        input_embedding = tf.Variable(tf.random_uniform([src_vocab_size,embedding_size],-1,1),name="embedding")
        inputs_e = tf.nn.embedding_lookup(input_embedding,x)
        inputs_drop_out = tf.nn.dropout(inputs_e,keep_prob_p)

        conv1 = tf.layers.conv1d(inputs_drop_out,filters=embedding_size,kernel_size=3,padding="SAME",activation=tf.nn.relu)
        conv1 = tf.nn.dropout(conv1,keep_prob_p)
        conv2 = tf.layers.conv1d(conv1,filters=embedding_size/2,kernel_size=3,padding="SAME",activation=tf.nn.relu)
        conv2 = tf.nn.dropout(conv2,keep_prob_p)
        conv3 = tf.layers.conv1d(conv1,filters=tgt_vocab_size,kernel_size=3,padding="SAME")

        self.x = x
        self.logits = tf.nn.softmax(conv3)
        self.keep_prob = keep_prob_p
        
        if train:
            y = tf.placeholder(tf.int32, shape=[None,None])
            crossent = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=conv3)
            losses = tf.reduce_sum(
                crossent) / tf.to_float(batch_size)
            self.train_step = tf.train.AdamOptimizer().minimize(losses)
            correct_prediction = tf.equal(tf.argmax(conv3, 2), tf.cast(y,dtype=tf.int64))
            self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            self.y = y
            self.losses = losses
    
    def train_one_step(self,sess,x,y,keep_prob):
        _,loss = sess.run([self.train_step,self.losses],feed_dict={self.x:x,self.y:y,self.keep_prob:keep_prob})
        return loss
        
    def get_accuracy(self,sess,x,y):
        acc = sess.run(self.accuracy,feed_dict={self.x:x,self.y:y,self.keep_prob:1.0})
        return acc
    
    def predict(self,sess,x):
        logits = sess.run(self.logits,feed_dict={self.x:x,self.keep_prob:1.0})
        return logits

In [198]:
# Train
tf.reset_default_graph()

with tf.Session() as sess:
    config = Config()
    model = CNN_Model(config)
    sess.run(tf.global_variables_initializer())
    batches = int(len(texts)/config.batch_size)
    saver = tf.train.Saver()

    for i in range(5):
        accs = []
        losses = []
        cnt = 0
        for xx,yy in data_iter(config.batch_size,config.max_len):
            xx,yy = np.asarray(xx),np.asarray(yy)
            acc = model.get_accuracy(sess,xx,yy)
            accs.append(acc)
            loss = model.train_one_step(sess,xx,yy,config.keep_prob)
            losses.append(loss)
            print('\rEpcho %s, Accuracy: %s, Loss: %s, batches: %s/%s'%(i+1, acc, loss, cnt, batches),end='')
            cnt += 1

        print("")
        print('Epcho %s Mean Accuracy: %s, Mean Loss: %s'%(i+1, np.mean(accs),np.mean(losses)))
        saver.save(sess, '/home/xueyou/tmp/cnn_cuts/cnn_cut.ckpt')


Epcho 1, Accuracy: 0.948003, Loss: 8.67332, batches: 305/306
Epcho 1 Mean Accuracy: 0.920798, Mean Loss: 12.5983
Epcho 2, Accuracy: 0.955172, Loss: 7.77253, batches: 305/306
Epcho 2 Mean Accuracy: 0.954257, Mean Loss: 8.04857
Epcho 3, Accuracy: 0.973523, Loss: 7.03106, batches: 305/306
Epcho 3 Mean Accuracy: 0.973355, Mean Loss: 7.11379
Epcho 4, Accuracy: 0.965275, Loss: 6.26814, batches: 305/306
Epcho 4 Mean Accuracy: 0.962058, Mean Loss: 6.59512
Epcho 5, Accuracy: 0.95981, Loss: 5.91445, batches: 305/306
Epcho 5 Mean Accuracy: 0.95878, Mean Loss: 6.22576


### 解码

In [199]:
states = ['b','m','e','s']

# use trans prob from jieba
trans_proba={'b': {'e': -0.510825623765990, 'm': -0.916290731874155},
             'e': {'b': -0.5897149736854513, 's': -0.8085250474669937},
             'm': {'e': -0.33344856811948514, 'm': -1.2603623820268226},
             's': {'b': -0.7211965654669841, 's': -0.6658631448798212}}

start_proba={'b': -0.26268660809250016,
             'e': -3.14e+100,
             'm': -3.14e+100,
             's': -1.4652633398537678}

PrevStatus = {
    'b': 'es',
    'm': 'mb',
    's': 'se',
    'e': 'bm'
}

def veterbi(probs):
    V = [{}]
    path = {}
    for i,y in enumerate(states):
        V[0][y] = np.log(probs[0][i]) + start_proba[y]
        path[y] = [y]
    for t in range(1,len(probs)):
        V.append({})
        new_path = {}
        for j,y in enumerate(states):
            emit = np.log(probs[t][j])
            prob,state = max((V[t-1][y0] + trans_proba[y0][y] + emit,y0)  for y0 in PrevStatus[y])
            V[t][y] = prob
            new_path[y] = path[state] + [y]
        path = new_path
    prob, state = max((V[len(probs) - 1][y], y) for y in 'es')
    return prob,path[state]
   
def cut(sentence,path):
    words = [sentence[0]]
    for i in range(1,len(sentence)):
        if path[i] in ['s','b']:
            words.append(sentence[i])
        else:
            words[-1] += sentence[i]
    return words

In [200]:
# Inference
tf.reset_default_graph()

with tf.Session() as sess:

    config = Config()
    config.train=False
    model = CNN_Model(config)
    saver = tf.train.Saver()

    saver.restore(sess,tf.train.latest_checkpoint('/home/xueyou/tmp/cnn_cuts/'))
    
    raw_text = '我今天不开心。'
    text = [word2id.get(t,0) for t in raw_text]
    text = pad(text,len(text),1)
    x = np.reshape(np.asarray(text),[1,-1])
    p = model.predict(sess,x)[0]
    _,path = veterbi(p)
    words = cut(raw_text,path)
    print(" ".join(words))

INFO:tensorflow:Restoring parameters from /home/xueyou/tmp/cnn_cuts/cnn_cut.ckpt
我 今天 不 开心 。


In [206]:
# Inference
tf.reset_default_graph()

with tf.Session() as sess:

    config = Config()
    config.train=False
    model = CNN_Model(config)
    saver = tf.train.Saver()

    saver.restore(sess,tf.train.latest_checkpoint('/home/xueyou/tmp/cnn_cuts/'))
    
    test_texts = open("/home/xueyou/tmp/icwb2-data/testing/pku_test.utf8").read().split("\n")[:-1]
    with open("/home/xueyou/tmp/icwb2-data/testing/pku_cnn_cut.txt",'w') as f:    
        for raw_text in test_texts:
            if raw_text:
                text = [word2id.get(t,0) for t in raw_text]
                text = pad(text,len(text),1)
                x = np.reshape(np.asarray(text),[1,-1])
                p = model.predict(sess,x)[0]
                _,path = veterbi(p)
                words = cut(raw_text,path)
                f.write(" ".join(words) + "\n")
            

INFO:tensorflow:Restoring parameters from /home/xueyou/tmp/cnn_cuts/cnn_cut.ckpt


### 分析和思考
- 利用score代码测试准确率，只有82%，recall是83%
    - 现在这个模型没有经过充分的优化，不知道原作者为什么可以有93%的准确率
    - 我通过修改模型结构，如增加kernel size，提高了准确率。这个模型本身是不复杂的，因此可以提高模型复杂度来提高精度
    - 可以利用S2S+Attention模型来做这个事情，效果肯定会更好
- 文章还提到硬解码的问题，思路比较简单，只是在预测出来的标签概率上乘以对应的放大数值，以便在解码时提高单词的概率