# 基于平均感知器的分词器

## 原理

&emsp;&emsp;我们知道对于一句话进行分词，我们可以通过给每个字进行标注状态（B：分词词首；M：分词词中；E：分词词尾；S：单个词分词）来分词，然后去计算出每个字到底是什么状态，从而完成分词。

&emsp;&emsp;那使用感知器，无非也就是用感知器来预测出每个字的4种状态的概率分别是多少，然后利用维特比算法找出整个句子最大概率的状态路径，从而完成分词。和hmm分词差不多，只是将hmm预测隐状态概率换成了使用感知器的多分类来预测隐状态概率。

In [None]:
from collections import defaultdict
import pickle
import random


class AveragedPerceptron(object):

    '''An averaged perceptron, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        return pickle.dump(dict(self.weights), open(path, 'w'))

    def load(self, path):
        '''Load the pickled model weights.'''
        self.weights = pickle.load(open(path))
        return None


def train(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples`` for
    ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            scores = model.predict(features)
            guess, score = max(scores.items(), key=lambda i: i[1])
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model

## 数据处理
1、对每个句子中的每个字，取7种特征【unigrams+bigrams】

> $C_{i-1}$ #前一个词 \
> $C_i$ #当前词 \
> $C_{i+1}$ #后一个词 \
> $C_{i-2}/C_{i-1}$ #前两个词与前一个词 \
> $C_{i-1}/C_i$ #前一个词与当前词 \
> $C_i/C_{i+1}$  #当前词与后一个词 \
> $C_{i+1}/C_{i+2}$  #后一个词与后两个词


In [3]:
import numpy as np
all_feature = {'BL=B':0, 'BL=M':1, 'BL=E':2, 'BL=S':3, 'BL=_BL_':4}  # 特征取值集合
sentence_feature_list = []  # 记录每个句子中每个字的特征
sentence_tag_list = []  # 每个字的类标
START_CHAR = '\1'
END_CHAR = '\2'

count = 0

def handle_feature(feature, char_feature, all_feature):
    feature_id = all_feature[feature] if feature in all_feature else len(all_feature) + 1
    all_feature[feature] = feature_id
    char_feature.append(feature_id)

# weights = np.ones(())
data_f = open('./data/RenMinData.txt', 'r', encoding='utf-8')
for line in data_f.readlines():
    count += 1
    if count > 3: break
    line = line.strip()
    # 打标签
    words = line.split(' ')
    
    for word in words:
        tag_list = []
        if len(word) == 1:
            tag_list.append(all_feature['BL=S'])
        else:
            tag_list.append(all_feature['BL=S'])
            for w in word[1:len(word)-1]: # 中间字
                tag_list.append(all_feature['BL=M'])
            tag_list.append(all_feature['BL=E'])
    sentence_tag_list.append(tag_list)
    
    # 获取特征    
    sentence = line.replace(' ','')
    sentence_feature = []
    for i, char in enumerate(sentence):
        char_feature = []
        # 前2
        pre2 = sentence[i-2] if i>=2 else START_CHAR
        # 前1
        pre1 = sentence[i-1] if i>=1 else START_CHAR
        # 当前
        cur = char
        # 后1
        next1 = sentence[i+1] if i < len(sentence)-1 else END_CHAR
        # 后2
        next2 = sentence[i+2] if i < len(sentence)-2 else END_CHAR
        
        # unigrams
        one = pre1+'1'
        handle_feature(one, char_feature, all_feature)
        
        two = cur+'2'
        handle_feature(two, char_feature, all_feature)
        
        three = cur+'2'
        handle_feature(three, char_feature, all_feature)
        
        # bigrams
        four = pre2+'/'+pre1+'4'
        handle_feature(four, char_feature, all_feature)
        
        five = pre1+'/'+cur+'5'
        handle_feature(five, char_feature, all_feature)
        
        six = cur+'/'+next1+'6'
        handle_feature(six, char_feature, all_feature)
        
        seven = next1+'/'+next2+'7'
        handle_feature(seven, char_feature, all_feature)
        
        sentence_feature.append(char_feature)
        print('char_feature',char_feature)
    sentence_feature_list.append(sentence_feature)     
print(sentence_feature_list[:3])   

char_feature [6, 7, 7, 8, 9, 10, 11]
char_feature [12, 13, 13, 14, 15, 16, 17]
char_feature [18, 19, 19, 20, 21, 22, 23]
char_feature [24, 25, 25, 26, 27, 28, 29]
char_feature [30, 31, 31, 32, 33, 34, 35]
char_feature [36, 37, 37, 38, 39, 40, 41]
char_feature [6, 42, 42, 8, 43, 44, 45]
char_feature [46, 47, 47, 48, 49, 50, 51]
char_feature [52, 53, 53, 54, 55, 56, 57]
char_feature [58, 59, 59, 60, 61, 62, 63]
char_feature [64, 65, 65, 66, 67, 68, 69]
char_feature [70, 71, 71, 72, 73, 74, 75]
char_feature [76, 77, 77, 78, 79, 80, 81]
char_feature [82, 83, 83, 84, 85, 86, 87]
char_feature [88, 89, 89, 90, 91, 92, 93]
char_feature [94, 95, 95, 96, 97, 98, 99]
char_feature [100, 101, 101, 102, 103, 104, 105]
char_feature [106, 107, 107, 108, 109, 110, 111]
char_feature [112, 113, 113, 114, 115, 116, 41]
char_feature [6, 117, 117, 8, 118, 119, 120]
char_feature [121, 122, 122, 123, 124, 125, 126]
char_feature [127, 95, 95, 128, 129, 130, 131]
char_feature [100, 132, 132, 133, 134, 135, 29]
