### 基于深度学习的中文分词尝试
- 基于word2vec + 神经网络进行中文分词
    - 步骤1：使用sogou语料库训练初始的字向量。
    - 步骤2：读入有标注的训练语料库，处理成keras需要的数据格式。
    - 步骤3：根据训练数据建模，使用单隐层全连接神经网络（下文代码并未用到CNN卷积层）
    - 步骤4：读入无标注的检验语料库，用训练好的神经网络模型进行分词标注
    - 步骤5：检查最终的效果
- 参考资料:[中文分词资源](http://www.52nlp.cn/%E4%B8%AD%E6%96%87%E5%88%86%E8%AF%8D%E5%85%A5%E9%97%A8%E4%B9%8B%E8%B5%84%E6%BA%90) 
    [中文分词标注法](http://www.52nlp.cn/the-character-based-tagging-method-of-chinese-word-segmentation) [word2vec原理](http://suanfazu.com/t/word2vec-zhong-de-shu-xue-yuan-li-xiang-jie-duo-tu-wifixia-yue-du/178) [基于word2vec的中文分词](http://blog.csdn.net/itplus/article/details/17122431)

- 步骤1：先用sogou语料库生成中文的单字向量，以备后用

In [1]:
from os import path
import os
import re

In [2]:
# Corpus root; keep only the entries whose names start with 'C', as full paths.
rootdir = '/SogouC.reduced/Reduced'
dirs = [path.join(rootdir, name) for name in os.listdir(rootdir) if name.startswith('C')]

In [3]:
def load_txt(x):
    """Read the file at path ``x`` and return its content decoded from GBK.

    The file is opened in binary mode so the raw bytes are decoded exactly
    once.  This behaves the same on Python 2 and Python 3 (a text-mode handle
    yields ``str`` lines on Python 3, which have no ``decode``).  Bytes that
    are not valid GBK are dropped because of the ``'ignore'`` error handler.

    :param x: path of the text file to read.
    :return: the whole file content as one unicode string.
    """
    with open(x, 'rb') as f:
        return f.read().decode('gbk', 'ignore')

In [4]:
# Map the position of each corpus directory to the decoded text of every
# visible '*txt' file found directly inside it.
text_t = {}
for idx, folder in enumerate(dirs):
    txt_paths = [path.join(folder, name) for name in os.listdir(folder)
                 if name.endswith('txt') and not name.startswith('.')]
    text_t[idx] = [load_txt(p) for p in txt_paths]

In [12]:
# to dataframe
import pandas as pd
import numpy as np

In [6]:
# Number of documents loaded for each label, in the order of text_t.values().
flen = [len(t) for t in text_t.values()]

In [7]:
# Repeat every label key once per document of that label.
# Relies on keys() and values() iterating the unmodified dict in the same order.
labels = np.repeat(text_t.keys(),flen)

In [8]:
# Flatten the per-label lists of documents into one flat list.
import itertools
merged = list(itertools.chain.from_iterable(text_t.values()))

In [9]:
# One row per document: its label and its raw text.
df = pd.DataFrame({'label': labels, 'txt': merged})
df.head()

Unnamed: 0,label,txt
0,0,本报记者陈雪频实习记者唐翔发自上海\r\n 一家刚刚成立两年的网络支付公司，它的目标是...
1,0,证券通：百联股份未来5年有能力保持高速增长\r\n\r\n 深度报告 权威内参...
2,0,5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www....
3,0,5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www....
4,0,5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www....


In [10]:
# cut character
def cutchar(x):
    """Return the characters of ``x`` joined by single spaces."""
    return ' '.join(x)

In [14]:
# Store a space-separated, character-by-character version of every document.
df['seg_word'] = df.txt.map(cutchar)

In [7]:
from cPickle import dump,load
# Cache of the DataFrame on disk; uncomment the two lines below to (re)create it.
#with open('df.pickle', 'wb') as fh:
#    dump(df, fh)
# Load through a context manager so the file handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created yourself.
with open('df.pickle', 'rb') as fh:
    df = load(fh)

In [8]:
# Exploration: flatten all documents into one long list of single characters for nltk.
txt = df['seg_word'].values
txtnltk = [ch for doc in txt for ch in doc.split()]

In [13]:
# Character frequencies via nltk.
import nltk 
corpus = nltk.Text(txtnltk) 
from nltk.probability import FreqDist 
# Count how often each character occurs.
fdist = FreqDist(corpus) 
w = fdist.keys() 
v = fdist.values() 
freqdf = pd.DataFrame({'word':w,'freq':v}) 
# Most frequent character first.
# NOTE(review): DataFrame.sort is a legacy pandas API -- confirm it exists in the installed version.
freqdf.sort('freq',ascending =False, inplace=True)
# idx is the frequency rank (0 = most frequent); it is used below as the character's integer id.
freqdf['idx'] = np.arange(len(v))
freqdf.head()

Unnamed: 0,freq,word,idx
2,4327816,�,0
5981,899948,，,1
2658,709992,的,2
6288,417699,。,3
4272,227591,一,4


In [14]:
# Lookup tables in both directions between a character and its frequency-rank id.
word2idx = dict(zip(freqdf.word, freqdf.idx))
idx2word = dict(zip(freqdf.idx, freqdf.word))

In [23]:
# Keep every document as a list of single-character strings; this is the
# corpus used to train the character vectors.
all_news_wv = [news.split() for news in txt]

In [24]:
# word2vec
from gensim.models import word2vec
def trainW2V(corpus, epochs=50, num_features = 100,\
             min_word_count = 1, num_workers = 4,\
             context = 10, sample = 1e-5):
    """Train a gensim Word2Vec model on ``corpus`` and store it in the global ``w2v``.

    :param corpus: list of token lists; it is shuffled in place, so the
        caller's list order changes.
    :param epochs: number of training passes made over the corpus.
    :param num_features: passed to Word2Vec as ``size``.
    :param min_word_count: passed to Word2Vec as ``min_count``.
    :param num_workers: passed to Word2Vec as ``workers``.
    :param context: passed to Word2Vec as ``window``.
    :param sample: passed to Word2Vec as ``sample``.
    :return: None; the model is published through the module-level name
        ``w2v`` instead of being returned.
    """
    global w2v
    w2v = word2vec.Word2Vec(workers = num_workers,
                          sample = sample,
                          size = num_features,
                          min_count=min_word_count,
                          window = context)
    np.random.shuffle(corpus)
    w2v.build_vocab(corpus)  
    for epoch in range(epochs):
        # Progress indicator: epoch numbers printed on one line (Python 2 print statement).
        print epoch, 
        # Present the documents in a new random order on every pass.
        np.random.shuffle(corpus)
        w2v.train(corpus)
        # Manual learning-rate schedule: shrink alpha by 10% after each pass and pin
        # min_alpha to the same value (presumably to keep the rate constant within a
        # pass -- confirm against the installed gensim version).
        w2v.alpha *= 0.9  
        w2v.min_alpha = w2v.alpha  
    print "Done."

In [25]:
# Train the character vectors; the resulting model is stored in the global w2v.
trainW2V(all_news_wv)

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 Done.


In [26]:
# Embedding lookup matrix: row i holds the vector of the character whose id is i,
# so the rows follow the frequency ranking (w2v.index2word could be used directly instead).
init_weight_wv = [w2v[idx2word[i]] for i in range(freqdf.shape[0])]

In [15]:
from cPickle import dump,load
# Cache of the lookup matrix on disk; uncomment the two lines below to (re)create it.
#with open('init_weight.pickle', 'wb') as fh:
#    dump(init_weight_wv, fh)
# Load through a context manager so the file handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created yourself.
with open('init_weight.pickle', 'rb') as fh:
    init_weight_wv = load(fh)

- 步骤2：训练数据读取和转换

In [20]:
# Paths for converting the segmented training corpus into the four-tag format (S, B, M, E).
input_file = 'icwb2-data/training/msr_training.utf8'
output_file = 'icwb2-data/training/msr_training.tagging.utf8'

In [37]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: 52nlpcn@gmail.com
# Copyright 2014 @ YuZhen Technology
#
# 4 tags for character tagging: B(Begin), E(End), M(Middle), S(Single)

import codecs
import sys

def character_tagging(input_file, output_file):
    """Convert a space-segmented corpus into per-character B/M/E/S tags.

    Every whitespace-separated word of every input line is rewritten as
    ``char/TAG `` tokens: a one-character word gets ``S``; otherwise the first
    character gets ``B``, the last one ``E`` and every character in between
    ``M``.  One output line is written per input line.

    :param input_file: path of the UTF-8 segmented corpus to read.
    :param output_file: path of the UTF-8 tagged file to write (overwritten).
    """
    # Context managers close both files even when reading or writing raises.
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
            codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data.readlines():
            for word in line.strip().split():
                if len(word) == 1:
                    output_data.write(word + "/S ")
                else:
                    tagged = [word[0] + "/B "]
                    tagged.extend(w + "/M " for w in word[1:-1])
                    tagged.append(word[-1] + "/E ")
                    output_data.write(''.join(tagged))
            output_data.write("\n")

# Tag the corpus named by input_file and write the result to output_file.
character_tagging(input_file, output_file)
# if __name__ == '__main__':
#     if len(sys.argv) != 3:
#         print "Please use: python character_tagging.py input output"
#         sys.exit()
#     input_file = sys.argv[1]
#     output_file = sys.argv[2]
#     character_tagging(input_file, output_file)

In [16]:
# Reserve two extra ids after the trained characters: 'U' for unseen
# (out-of-vocabulary) characters and ' ' for padding both ends of a sentence,
# and append one vector for each of them.
# NOTE(review): if a literal 'U' already occurs in the vocabulary, its id is overwritten here -- confirm.
char_num = len(init_weight_wv)
idx2word[char_num] = u'U'
word2idx[u'U'] = char_num
idx2word[char_num+1] = u' '
word2idx[u' '] = char_num+1

# 'U' gets a random vector, the padding symbol an all-zero vector.
# NOTE(review): 100 presumably has to match the vector size used in trainW2V -- confirm.
init_weight_wv.append(np.random.randn(100,))
init_weight_wv.append(np.zeros(100,))

In [21]:
# Split the tagged file into characters (one list per line) and one flat list of tags.
# Tokens look like "<char>/<tag>", so index 0 is the character and index 2 the tag.
with open(output_file) as f:
    lines = f.readlines()
tagged = [line.decode('utf-8').split() for line in lines]
train_line = [[tok[0] for tok in toks] for toks in tagged]
train_label = [tok[2] for toks in tagged for tok in toks]

In [17]:
# 文档转数字list
import numpy as np
def sent2num(sentence, word2idx = word2idx, context = 7):
    """Turn a sentence into one window of character ids per character.

    :param sentence: iterable of single characters.
    :param word2idx: mapping from character to integer id; characters missing
        from it are replaced by the id of ``u'U'``.
    :param context: window length; each end of the sentence is padded with
        ``int((context - 1) * 0.5)`` copies of the id of ``u' '``.
    :return: a list with one entry per character, each the slice of
        ``context`` ids starting at that character's position in the padded
        sequence.
    """
    # Known characters map to their id, everything else to the 'U' id.
    ids = [word2idx[ch] if ch in word2idx else word2idx[u'U'] for ch in sentence]
    half = int((context - 1) * 0.5)
    padding = [word2idx[u' '] for _ in range(half)]
    padded = padding + ids + padding
    # One slice per original character, starting at that character's offset.
    return [padded[start:start + context] for start in range(len(ids))]

In [53]:
# Input: a list of characters; output: a list of id windows.
sent2num(train_line[0])

[[6988, 6988, 6988, 19, 11, 52, 223],
 [6988, 6988, 19, 11, 52, 223, 82],
 [6988, 19, 11, 52, 223, 82, 31],
 [19, 11, 52, 223, 82, 31, 275],
 [11, 52, 223, 82, 31, 275, 5],
 [52, 223, 82, 31, 275, 5, 4],
 [223, 82, 31, 275, 5, 4, 85],
 [82, 31, 275, 5, 4, 85, 192],
 [31, 275, 5, 4, 85, 192, 220],
 [275, 5, 4, 85, 192, 220, 410],
 [5, 4, 85, 192, 220, 410, 1],
 [4, 85, 192, 220, 410, 1, 79],
 [85, 192, 220, 410, 1, 79, 706],
 [192, 220, 410, 1, 79, 706, 107],
 [220, 410, 1, 79, 706, 107, 664],
 [410, 1, 79, 706, 107, 664, 2],
 [1, 79, 706, 107, 664, 2, 140],
 [79, 706, 107, 664, 2, 140, 430],
 [706, 107, 664, 2, 140, 430, 225],
 [107, 664, 2, 140, 430, 225, 5],
 [664, 2, 140, 430, 225, 5, 10],
 [2, 140, 430, 225, 5, 10, 56],
 [140, 430, 225, 5, 10, 56, 62],
 [430, 225, 5, 10, 56, 62, 86],
 [225, 5, 10, 56, 62, 86, 2],
 [5, 10, 56, 62, 86, 2, 192],
 [10, 56, 62, 86, 2, 192, 220],
 [56, 62, 86, 2, 192, 220, 410],
 [62, 86, 2, 192, 220, 410, 1],
 [86, 2, 192, 220, 410, 1, 324],
 [2, 192, 2

In [60]:
# Turn every training sentence into id windows and pool them into one flat list.
train_word_num = [window for line in train_line for window in sent2num(line)]

In [62]:
# Sanity check: there should be exactly one window per tag.
print len(train_word_num)
print len(train_label)

4050469
4050469


In [64]:
# Persist the windows; the context manager guarantees the file is flushed and closed.
with open('train_word_num.pickle', 'wb') as fh:
    dump(train_word_num, fh)
# To reload instead of recomputing:
#with open('train_word_num.pickle', 'rb') as fh:
#    train_word_num = load(fh)

In [22]:
# Number of distinct tags; used below as the number of output classes.
nb_classes = len(np.unique(train_label))

- 步骤3：训练模型

In [65]:
from __future__ import absolute_import
from __future__ import print_function

from keras.preprocessing import sequence
from keras.optimizers import SGD, RMSprop, Adagrad
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Reshape, Flatten ,Dropout
from keras.regularizers import l1,l2
from keras.layers.convolutional import Convolution2D, MaxPooling2D,MaxPooling1D

In [23]:
# Build the two lookup tables: tag -> class id and class id -> tag.
tags = np.unique(train_label)
label_dict = dict(zip(tags, range(4)))
num_dict = dict((idx, tag) for tag, idx in label_dict.items())
print(label_dict)
print(num_dict)
# Replace every tag in the target list by its class id.
train_label = [label_dict[tag] for tag in train_label]

{u'M': 2, u'S': 3, u'B': 0, u'E': 1}
{0: u'B', 1: u'E', 2: u'M', 3: u'S'}


In [70]:
# Split the windows and labels into a training part (train_size=0.9) and a test part, with a fixed seed.
# NOTE(review): sklearn.cross_validation is a legacy module path -- confirm it exists in the installed scikit-learn.
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(train_word_num, train_label , train_size=0.9, random_state=1)

In [71]:
# Convert the integer class ids into categorical targets with nb_classes columns.
Y_train = np_utils.to_categorical(train_y, nb_classes)
Y_test = np_utils.to_categorical(test_y, nb_classes)

In [72]:
# Report the size of both splits.
print(len(train_X), 'train sequences')
print(len(test_X), 'test sequences')

3645422 train sequences
405047 test sequences


In [74]:
# Wrap the initial character vectors as a one-element list holding a single 2-D array,
# the form passed to Embedding(weights=...) below.
init_weight = [np.array(init_weight_wv)]

In [79]:
# Mini-batch size used for fitting, evaluation and prediction below.
batch_size = 256

In [75]:
maxfeatures = init_weight[0].shape[0] # vocabulary size (number of rows of the lookup matrix)

In [87]:
# A plain single-hidden-layer network: 700 inputs, 100 hidden units, 4 outputs.
# Training updates the network weights and the character vectors together.
print('Build model...')
model = Sequential()
# Embedding initialised from the pre-trained vectors; input dim = vocabulary size, output dim = 100.
model.add(Embedding(maxfeatures, 100,weights=init_weight)) # the author notes that starting from pre-trained vectors improves accuracy
model.add(Flatten())
model.add(Dense(700, 100))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(100, nb_classes))
model.add(Activation('softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

Build model...


In [77]:
from keras.callbacks import Callback
class EarlyStopping(Callback):
    """Stop training once the validation loss stops improving.

    :param patience: number of consecutive non-improving epochs tolerated
        before training is stopped (0 stops at the first one).
    :param verbose: when greater than 0, print a message when stopping.
    """

    def __init__(self, patience=0, verbose=0):
        # Name this class (not Callback) so that Callback.__init__ really runs;
        # super(Callback, self) starts the lookup *after* Callback and skips it.
        super(EarlyStopping, self).__init__()

        self.patience = patience
        self.verbose = verbose
        self.best_val_loss = np.inf
        self.wait = 0

    def on_epoch_end(self, epoch, logs=None):
        """Track the best validation loss and request a stop when out of patience.

        :param epoch: index of the epoch that just finished.
        :param logs: metrics dict of that epoch; ``val_loss`` is read from it.
        """
        # Imported locally: the surrounding notebook never imports warnings,
        # so the original warn() call raised NameError instead of warning.
        import warnings

        logs = logs or {}
        if not self.params['do_validation']:
            warnings.warn("Early stopping requires validation data!", RuntimeWarning)

        cur_val_loss = logs.get('val_loss')
        if cur_val_loss is None:
            # No validation loss to compare; leave best_val_loss untouched
            # rather than overwriting it with None.
            return

        if cur_val_loss < self.best_val_loss:
            self.best_val_loss = cur_val_loss
            self.wait = 0
        else:
            if self.wait >= self.patience:
                if self.verbose > 0:
                    print("Epoch %05d: early stopping" % (epoch))
                self.model.stop_training = True
            self.wait += 1

In [144]:
# Fit on train_X / Y_train for one epoch; validation_split=0.1 holds back part of
# that data as validation, which the early-stopping callback monitors.
print("Train...")
earlystop = EarlyStopping(patience=0, verbose=1)
result = model.fit(train_X, Y_train, batch_size=batch_size, nb_epoch=1, 
          validation_split=0.1, show_accuracy=True,callbacks=[earlystop])

Train...
Train on 3280879 samples, validate on 364543 samples
Epoch 0


In [145]:
# Evaluate the trained model (reached through the callback) on the held-out test split.
score = earlystop.model.evaluate(test_X, Y_test, batch_size=batch_size)
print('Test score:', score)

Test score: 0.160891216656


In [146]:
# Accuracy on the held-out test split (about 0.94).
classes = earlystop.model.predict_classes(test_X, batch_size=batch_size)
acc = np_utils.accuracy(classes, test_y) # must use the labels from before the categorical conversion
print('Test accuracy:', acc)

Test accuracy: 0.948813347587


In [2]:
from cPickle import dump, load
# Cache of the trained model on disk; uncomment the two lines below to (re)create it.
#with open('model.pickle', 'wb') as fh:
#    dump(model, fh)
# Load through a context manager so the file handle is closed deterministically.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created yourself.
with open('model.pickle', 'rb') as fh:
    model = load(fh)

- 步骤4：用test文本进行预测，评估效果

In [25]:
# A sample sentence, turned into a list of single characters.
temp_txt = u'国家食药监总局发布通知称，酮康唑口服制剂因存在严重肝毒性不良反应，即日起停止生产销售使用。'
temp_txt = list(temp_txt)

In [18]:
# Id windows for the sample sentence; display the first five.
temp_num = sent2num(temp_txt)
temp_num[:5]

[[6988, 6988, 6988, 16, 63, 609, 364],
 [6988, 6988, 16, 63, 609, 364, 731],
 [6988, 16, 63, 609, 364, 731, 254],
 [16, 63, 609, 364, 731, 254, 451],
 [63, 609, 364, 731, 254, 451, 50]]

In [24]:
# Infer the tag sequence of one sentence from its id windows.
def predict_num(input_num,input_txt, \
                model = model,\
                label_dict=label_dict,\
                num_dict=num_dict):
    """Tag every character of ``input_txt`` and return the tagged line.

    The model's class probabilities are corrected greedily from left to
    right: tags that cannot follow the previous character's tag get their
    probability set to 0 and the label is re-picked by ``argmax``.

    :param input_num: list of id windows, one per character of ``input_txt``.
    :param input_txt: the characters of the sentence, in the same order.
    :param model: object providing ``predict_proba`` and ``predict_classes``.
    :param label_dict: mapping from tag (``u'B'``, ``u'M'``, ``u'E'``,
        ``u'S'``) to class id.
    :param num_dict: mapping from class id back to tag.
    :return: ``"char/TAG char/TAG ...\\n"``; just ``"\\n"`` for empty input.
    """
    # An empty sentence (e.g. a blank line) has no windows to feed the model.
    if len(input_num) == 0:
        return '\n'

    input_num = np.array(input_num)
    predict_prob = model.predict_proba(input_num)
    predict_lable = model.predict_classes(input_num)

    begin, middle = label_dict[u'B'], label_dict[u'M']
    end, single = label_dict[u'E'], label_dict[u'S']
    # Tags that may NOT follow a given tag: after B or M the word is still
    # open (no B/S); after E or S a new word must start (no M/E).
    forbidden_next = {
        begin: (begin, single),
        middle: (begin, single),
        end: (middle, end),
        single: (middle, end),
    }

    # The first character cannot be E or M.  Re-pick its label after masking;
    # the original masked the probabilities but never updated label 0, and
    # skipped the mask entirely for one-character input.
    predict_prob[0, end] = 0
    predict_prob[0, middle] = 0
    predict_lable[0] = predict_prob[0].argmax()

    for i in range(len(predict_lable) - 1):
        # Uses the already-corrected label of position i.
        for tag in forbidden_next.get(int(predict_lable[i]), ()):
            predict_prob[i+1, tag] = 0
        predict_lable[i+1] = predict_prob[i+1].argmax()

    predict_lable_new = [num_dict[x]  for x in predict_lable]
    result =  [w+'/' +l  for w, l in zip(input_txt,predict_lable_new)]
    return ' '.join(result) + '\n'

In [26]:
# Tag the sample sentence and print the tagged line.
temp = predict_num(temp_num,temp_txt)
print(temp)

国/B 家/M 食/M 药/M 监/M 总/M 局/E 发/B 布/E 通/B 知/E 称/S ，/S 酮/S 康/S 唑/B 口/E 服/B 制/M 剂/E 因/S 存/B 在/E 严/B 重/E 肝/B 毒/E 性/S 不/B 良/E 反/B 应/E ，/S 即/B 日/E 起/S 停/B 止/E 生/B 产/E 销/B 售/E 使/B 用/E 。/S



In [27]:
# Load the test corpus as one list of characters per line.
test_file = 'icwb2-data/testing/msr_test.utf8'
with open(test_file,'r') as f:
    lines = f.readlines()
test_texts = []
for raw in lines:
    test_texts.append(list(raw.decode('utf-8').strip()))

In [None]:
# Tag every test sentence and keep the UTF-8 encoded result lines.
test_output = [predict_num(sent2num(line), input_txt=line).encode('utf-8')
               for line in test_texts]



In [31]:
# Write the tagged test lines to disk (each entry already ends with a newline).
with open('icwb2-data/testing/msr_test_output.utf8','w') as f:
    f.writelines(test_output)

In [32]:
# Paths for converting the tagged test output back into space-separated words.
input_file = 'icwb2-data/testing/msr_test_output.utf8'
output_file = 'icwb2-data/testing/msr_test.split.tag2word.utf8'

In [33]:
import codecs
import sys

def character_2_word(input_file, output_file):
    """Rebuild space-separated words from a ``char/TAG`` tagged file.

    Tags: B(Begin), E(End), M(Middle), S(Single).  A ``B`` character is
    written with a leading space, an ``E`` character with a trailing space,
    an ``M`` character as is, and any other tag (``S``) with a space on both
    sides.  One output line is written per input line.

    :param input_file: path of the UTF-8 tagged file to read.
    :param output_file: path of the UTF-8 word file to write (overwritten).
    :raises IndexError: if a token contains no ``/`` separator.
    """
    # Context managers close both files even when reading or writing raises.
    with codecs.open(input_file, 'r', 'utf-8') as input_data, \
            codecs.open(output_file, 'w', 'utf-8') as output_data:
        for line in input_data.readlines():
            for char_tag in line.strip().split():
                # Split on the LAST slash only, so a literal '/' character
                # (token "//S") keeps its text instead of being dropped.
                char_tag_pair = char_tag.rsplit('/', 1)
                char = char_tag_pair[0]
                tag = char_tag_pair[1]
                if tag == 'B':
                    output_data.write(' ' + char)
                elif tag == 'M':
                    output_data.write(char)
                elif tag == 'E':
                    output_data.write(char + ' ')
                else: # tag == 'S'
                    output_data.write(' ' + char + ' ')
            output_data.write("\n")

# Rebuild words from the file named by input_file and write them to output_file.
character_2_word(input_file, output_file)

- 最终使用perl脚本检验的F值为0.913

In [35]:
# Run the icwb2-data score script on the segmented test file and save its report to deep.score.
# NOTE(review): the segmented file is referenced relative to the working directory here, while
# output_file above was written under icwb2-data/testing/ -- confirm the path.
! ./icwb2-data/scripts/score ./icwb2-data/gold/msr_training_words.utf8 ./icwb2-data/gold/msr_test_gold.utf8 msr_test.split.tag2word.utf8 > deep.score