# 基于平均感知器的分词器

## 原理

&emsp;&emsp;我们知道对于一句话进行分词，我们可以通过给每个字进行标注状态（B：分词词首；M：分词词中；E：分词词尾；S：单个词分词）来分词，然后去计算出每个字到底是什么状态，从而完成分词。

&emsp;&emsp;那使用感知器，无非也就是用感知器来预测出每个字的4种状态的概率分别是多少，然后利用维特比算法找出整个句子最大概率的状态路径，从而完成分词。和hmm分词差不多，选取的是hmm里的特征，每个特征进行权重训练，而不像hmm直接计算得来。

In [None]:
from collections import defaultdict
import pickle
import random


class AveragedPerceptron(object):

    '''An averaged perceptron, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        return pickle.dump(dict(self.weights), open(path, 'w'))

    def load(self, path):
        '''Load the pickled model weights.'''
        self.weights = pickle.load(open(path))
        return None


def train(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples`` for
    ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            scores = model.predict(features)
            guess, score = max(scores.items(), key=lambda i: i[1])
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model

## 数据处理
1、对每个句子中的每个字，取7种特征【unigrams+bigrams】

> $C_{i-1}$ #前一个词 \
> $C_i$ #当前词 \
> $C_{i+1}$ #后一个词 \
> $C_{i-2}/C_{i-1}$ #前两个词与前一个词 \
> $C_{i-1}/C_i$ #前一个词与当前词 \
> $C_i/C_{i+1}$  #当前词与后一个词 \
> $C_{i+1}/C_{i+2}$  #后一个词与后两个词

注意：还需要添加第8个特征即前一个字的类标

In [26]:
import numpy as np
all_feature = {'BL=B':0, 'BL=M':1, 'BL=E':2, 'BL=S':3, 'BL=_BL_':4}  # 特征取值集合
sentence_feature_dict = {}  # 记录每个句子中每个字的特征，key是句子，value是特征list
sentence_tag_dict = {}  # 每个字的类标，，key是句子，value是类标list
all_sentences = []  # 所有句子
START_CHAR = '\1'
END_CHAR = '\2'

count = 0

def handle_feature(feature, char_feature, all_feature):
    feature_id = all_feature[feature] if feature in all_feature else len(all_feature)
    all_feature[feature] = feature_id
    char_feature.append(feature_id)

# weights = np.ones(())
data_f = open('./data/RenMinData.txt', 'r', encoding='utf-8')
for line in data_f.readlines():
    count += 1
    line = line.strip()
    # 打标签
    words = line.split(' ')
    all_sentences.append(''.join(words))
    tag_list = []
    for word in words:
        if len(word) == 1:
            tag_list.append(all_feature['BL=S'])
        else:
            tag_list.append(all_feature['BL=B'])
            for w in word[1:len(word)-1]: # 中间字
                tag_list.append(all_feature['BL=M'])
            tag_list.append(all_feature['BL=E'])
    sentence_tag_dict[''.join(words)] = tag_list
    
    # 获取特征    
    sentence = line.replace(' ','')
    sentence_feature = []
    for i, char in enumerate(sentence):
        char_feature = []
        # 前2
        pre2 = sentence[i-2] if i>=2 else START_CHAR
        # 前1
        pre1 = sentence[i-1] if i>=1 else START_CHAR
        # 当前
        cur = char
        # 后1
        next1 = sentence[i+1] if i < len(sentence)-1 else END_CHAR
        # 后2
        next2 = sentence[i+2] if i < len(sentence)-2 else END_CHAR
        
        # unigrams
        one = pre1+'1'
        handle_feature(one, char_feature, all_feature)
        
        two = cur+'2'
        handle_feature(two, char_feature, all_feature)
        
        three = next1+'3'
        handle_feature(three, char_feature, all_feature)
        
        # bigrams
        four = pre2+'/'+pre1+'4'
        handle_feature(four, char_feature, all_feature)
        
        five = pre1+'/'+cur+'5'
        handle_feature(five, char_feature, all_feature)
        
        six = cur+'/'+next1+'6'
        handle_feature(six, char_feature, all_feature)
        
        seven = next1+'/'+next2+'7'
        handle_feature(seven, char_feature, all_feature)
        
        sentence_feature.append(char_feature)
        #print('char_feature',char_feature)
    sentence_feature_dict[''.join(words)] = sentence_feature     
print('数据集处理完成！')   

数据集处理完成！


## 训练

In [27]:
import random
import time
import numpy as np
tag_list = [0,1,2,3]  # 标签id
hidden_node_num = len(tag_list)  # 隐状态个数（标签个数）
# 参数
global W
# W = np.random.randn(hidden_node_num, len(all_feature))
W = np.zeros((hidden_node_num, len(all_feature)))
             
def viterbi_decode(sentence_feature, W):
    path_prob = []  # 路径和该路径的概率
    pre_X = []
    best_path = []  # 最好的路径，也就是最好的tag
    for i, char_feature in enumerate(sentence_feature):
        
        if i == 0:
            char_feature.append(4)  # 前一个字的标签
            Z = [sum(W[tag][char_feature]) for tag in tag_list]   # 每个tag的概率【因为是第一个字所以只用到发射概率和初始概率】
            # 第一个tag是当前字的标签，第二个tag是上一个字的标签
            path_prob.append({tag:(tag, prob) for tag, prob in enumerate(Z)})
        else:
            Z = [sum(W[tag][char_feature]) for tag in tag_list]   # 每个tag的概率
            res = {}
            for j, prob in enumerate(Z):
                # 加上上一个字的标签分别为0，1，2，3时的概率，然后求最大【发射概率、转移概率、初始概率】
                max_prob, pre_tag =  max([(prob + W[j][pre_tag] + value[1], pre_tag) for pre_tag, value in path_prob[i-1].items()])
                res[j] = (pre_tag, max_prob)
            path_prob.append(res)
    #print('path_prob:',path_prob)
    # 确定最后一个字的tag，然后回溯，确定路径（确定每个字的tag）
    last_tag = -1
    max_prob = -100000
    for tag in tag_list:
        prob = path_prob[-1][tag][1]
        if max_prob < prob:
            last_tag = tag
    #last_tag = sorted(, key=lambda x: x[1][1], reverse = True)[0][0]
    #print('last_tag:', last_tag)
    # last_tag = np.argmax([pp[1][1] for pp in path_prob[-1]])
    best_path.append(last_tag)
    for pp in reversed(path_prob):
        best_path.append(pp[last_tag][0])
        last_tag = pp[last_tag][0]
    best_path.pop()
    best_path.reverse()
    #print('best_path:', best_path)
    return best_path
                
def precision_and_recall(predict_sentence_tags):
    correct_count = 0
    for sentence, predict_sentence_tag in predict_sentence_tags:
        if predict_sentence_tag == sentence_tag_dict[sentence]:
            correct_count += 1
    return correct_count / len(predict_sentence_tags)
    
    
    
def precision_and_recall2(predict_sentence_tags):
    correct_count = 0
    total_count = 0
    for sentence, predict_sentence_tag in predict_sentence_tags:
        for i, tag in enumerate(predict_sentence_tag):
            total_count +=1
            if tag == sentence_tag_dict[sentence][i]:
                correct_count += 1
    return correct_count / total_count
    

# 训练
def train(all_sentences, maxIteration, ratio = 0.7):
    global W
    end = int(np.floor(ratio*len(all_sentences)))
    print(end)
    x_train = all_sentences[:end]
    x_val = all_sentences[end:]
    for iter in range(maxIteration):
        random.shuffle(x_train)  # 打乱
        start = time.time()
        for i, sentence in enumerate(x_train):
            sentence_feature = sentence_feature_dict[sentence]
#             print('i--', i)
            if i % 10000 == 0:
#                 print('10000次耗时：', time.time()-start)
                start = time.time()
                print('i--', i)
#             start = time.time()
            predict_tag = viterbi_decode(sentence_feature, W)
#             print('viterbi_decode耗时：', time.time()-start)
            actual_tag = sentence_tag_dict[sentence]
#             print('predict_tag:', predict_tag)
#             print('actual_tag:', actual_tag)
            if predict_tag == actual_tag:
                #print('true')
                continue
            else:
                # 更新权重：每个特征上的实际标签权重增加(+1)，其他标签权重减少(-1)
#                 start = time.time()
                for j, char_feature in enumerate(sentence_feature):
#                     print('predict_tag:', predict_tag)
#                     print('actual_tag:', actual_tag)
                    if predict_tag[j] == actual_tag[j]:
                        continue
#                     print('char_feature:',char_feature)
                    for tag in tag_list:
                        if tag == actual_tag[j]:
                            #print('W[tag][char_feature] == ', W[tag][char_feature])
                            W[tag][char_feature] += 1
                            if j == 0:  # 第一个字
                                W[tag][4] += 1
                            else:
                                W[tag][actual_tag[j-1]] += 1
                            #print('W[tag][char_feature] ++1 == ', W[tag][char_feature])
                        elif tag == predict_tag[j]:
                            W[tag][char_feature] -= 1
                            if j == 0:  # 第一个字
                                W[tag][4] -= 1
                            else:
                                W[tag][predict_tag[j-1]] -= 1
#                 print('W耗时：', time.time()-start)

        # 本轮次训练集和验证集上的结果
        train_predict_sentence_tags = [(sentence, viterbi_decode(sentence_feature_dict[sentence], W)) for sentence in x_train[:10000]]
        print('训练集准确率：', precision_and_recall2(train_predict_sentence_tags))
        val_predict_sentence_tags = [(sentence, viterbi_decode(sentence_feature_dict[sentence], W)) for sentence in x_val]
        print('验证集准确率：', precision_and_recall2(val_predict_sentence_tags))

print('总共句子数：', len(all_sentences))
train(all_sentences, 2, ratio = 0.8)

总共句子数： 297959
238367
i-- 0
i-- 10000
i-- 20000
i-- 30000
i-- 40000
i-- 50000
i-- 60000
i-- 70000
i-- 80000
i-- 90000
i-- 100000
i-- 110000
i-- 120000
i-- 130000
i-- 140000
i-- 150000
i-- 160000
i-- 170000
i-- 180000
i-- 190000
i-- 200000
i-- 210000
i-- 220000
i-- 230000
训练集准确率： 0.9675375423865655
验证集准确率： 0.959462607158562
i-- 0
i-- 10000
i-- 20000
i-- 30000
i-- 40000
i-- 50000
i-- 60000
i-- 70000
i-- 80000
i-- 90000
i-- 100000
i-- 110000
i-- 120000
i-- 130000
i-- 140000
i-- 150000
i-- 160000
i-- 170000
i-- 180000
i-- 190000
i-- 200000
i-- 210000
i-- 220000
i-- 230000
训练集准确率： 0.9746846766258163
验证集准确率： 0.9648003988105345


## 预测

&emsp;&emsp;对于没有出现在训练集里的特征如何处理

In [28]:
# 获取句子特征
def get_sentence_features(sentence):
    print(sentence)
    sentence_features = []
    for i, char in enumerate(sentence):
        char_feature = []
        # 前2
        pre2 = sentence[i-2] if i>=2 else START_CHAR
        # 前1
        pre1 = sentence[i-1] if i>=1 else START_CHAR
        # 当前
        cur = char
        # 后1
        next1 = sentence[i+1] if i < len(sentence)-1 else END_CHAR
        # 后2
        next2 = sentence[i+2] if i < len(sentence)-2 else END_CHAR

        # unigrams
        one = pre1+'1'
        if one in all_feature:
            char_feature.append(all_feature[one])
        else:
            print('%s--特征没有在模型的特征中，权重设为0'% one)

        two = cur+'2'
        if two in all_feature:
            char_feature.append(all_feature[two])
        else:
            print('%s--特征没有在模型的特征中，权重设为0'% two)

        three = next1+'3'
        if three in all_feature:
            char_feature.append(all_feature[three])
        else:
            print('%s--特征没有在模型的特征中，权重设为0'% three)

        # bigrams
        four = pre2+'/'+pre1+'4'
        if four in all_feature:
            char_feature.append(all_feature[four])
        else:
            print('%s--特征没有在模型的特征中，权重设为0'% four)

        five = pre1+'/'+cur+'5'
        if five in all_feature:
            char_feature.append(all_feature[five])
        else:
            print('%s--特征没有在模型的特征中，权重设为0'% five)

        six = cur+'/'+next1+'6'
        if six in all_feature:
            char_feature.append(all_feature[six])
        else:
            print('%s--特征没有在模型的特征中，权重设为0'% six)

        seven = next1+'/'+next2+'7'
        if seven in all_feature:
            char_feature.append(all_feature[seven])
        else:
            print('%s--特征没有在模型的特征中，权重设为0'% seven)
        sentence_features.append(char_feature)
    return sentence_features

# 获取句子分词结果
def get_sentence_seg_result(sentence, predict_tag):
    seg_list = []
    word = ''
    for i, tag in enumerate(predict_tag):
        if tag == 'B':
            seg_list.append(word)
            word = ''
        
# 预测
sentence = '此外 还要 定期 到 全国 各省 去 访问 。  '.replace(' ','')
sentence_feature = get_sentence_features(sentence)
predict_tag = viterbi_decode(sentence_feature, W)
print(sentence)
print(predict_tag)
print(sentence_tag_dict[sentence])

此外还要定期到全国各省去访问。
此外还要定期到全国各省去访问。
[1, 2, 0, 2, 0, 2, 3, 0, 2, 0, 2, 3, 0, 2, 3]
[0, 2, 0, 2, 0, 2, 3, 0, 2, 0, 2, 3, 0, 2, 3]


## 训练速度提升