# 基于平均感知器的分词器

## 原理

&emsp;&emsp;我们知道对于一句话进行分词，我们可以通过给每个字进行标注状态（B：分词词首；M：分词词中；E：分词词尾；S：单个词分词）来分词，然后去计算出每个字到底是什么状态，从而完成分词。

&emsp;&emsp;那使用感知器，无非也就是用感知器来预测出每个字的4种状态的概率分别是多少，然后利用维特比算法找出整个句子最大概率的状态路径，从而完成分词。和hmm分词差不多，只是将hmm预测隐状态概率换成了使用感知器的多分类来预测隐状态概率。

In [None]:
from collections import defaultdict
import pickle
import random


class AveragedPerceptron(object):

    '''An averaged perceptron, as implemented by Matthew Honnibal.
    See more implementation details here:
        http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        '''Save the pickled model weights.'''
        return pickle.dump(dict(self.weights), open(path, 'w'))

    def load(self, path):
        '''Load the pickled model weights.'''
        self.weights = pickle.load(open(path))
        return None


def train(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples`` for
    ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            scores = model.predict(features)
            guess, score = max(scores.items(), key=lambda i: i[1])
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model

## 数据处理
1、对每个句子中的每个字，取7种特征【unigrams+bigrams】

> $C_{i-1}$ #前一个词 \
> $C_i$ #当前词 \
> $C_{i+1}$ #后一个词 \
> $C_{i-2}/C_{i-1}$ #前两个词与前一个词 \
> $C_{i-1}/C_i$ #前一个词与当前词 \
> $C_i/C_{i+1}$  #当前词与后一个词 \
> $C_{i+1}/C_{i+2}$  #后一个词与后两个词


In [112]:
import numpy as np
all_feature = {'BL=B':0, 'BL=M':1, 'BL=E':2, 'BL=S':3, 'BL=_BL_':4}  # 特征取值集合
sentence_feature_list = []  # 记录每个句子中每个字的特征
sentence_tag_list = []  # 每个字的类标
START_CHAR = '\1'
END_CHAR = '\2'

count = 0

def handle_feature(feature, char_feature, all_feature):
    feature_id = all_feature[feature] if feature in all_feature else len(all_feature)
    all_feature[feature] = feature_id
    char_feature.append(feature_id)

# weights = np.ones(())
data_f = open('./data/RenMinData.txt', 'r', encoding='utf-8')
for line in data_f.readlines():
    count += 1
    line = line.strip()
    # 打标签
    words = line.split(' ')
    tag_list = []
    for word in words:
        if len(word) == 1:
            tag_list.append(all_feature['BL=S'])
        else:
            tag_list.append(all_feature['BL=S'])
            for w in word[1:len(word)-1]: # 中间字
                tag_list.append(all_feature['BL=M'])
            tag_list.append(all_feature['BL=E'])
    sentence_tag_list.append(tag_list)
    
    # 获取特征    
    sentence = line.replace(' ','')
    sentence_feature = []
    for i, char in enumerate(sentence):
        char_feature = []
        # 前2
        pre2 = sentence[i-2] if i>=2 else START_CHAR
        # 前1
        pre1 = sentence[i-1] if i>=1 else START_CHAR
        # 当前
        cur = char
        # 后1
        next1 = sentence[i+1] if i < len(sentence)-1 else END_CHAR
        # 后2
        next2 = sentence[i+2] if i < len(sentence)-2 else END_CHAR
        
        # unigrams
        one = pre1+'1'
        handle_feature(one, char_feature, all_feature)
        
        two = cur+'2'
        handle_feature(two, char_feature, all_feature)
        
        three = next1+'3'
        handle_feature(three, char_feature, all_feature)
        
        # bigrams
        four = pre2+'/'+pre1+'4'
        handle_feature(four, char_feature, all_feature)
        
        five = pre1+'/'+cur+'5'
        handle_feature(five, char_feature, all_feature)
        
        six = cur+'/'+next1+'6'
        handle_feature(six, char_feature, all_feature)
        
        seven = next1+'/'+next2+'7'
        handle_feature(seven, char_feature, all_feature)
        
        sentence_feature.append(char_feature)
        #print('char_feature',char_feature)
    sentence_feature_list.append(sentence_feature)     
print(sentence_tag_list[:3])   

[[3, 1, 1, 1, 2, 3], [3, 2, 3, 2, 3, 2, 3, 2, 3, 3, 3, 3, 3], [3, 2, 3, 3, 3, 3]]


In [None]:
a=np.array([[0,[1,2,3]],[1,[1,2,3]]])
b = [0,1]
print(sum([a[0][b] for ]))

In [81]:
a = [1,2]
b = [1,2]
c = [3,3]
print(a==b)

False


In [85]:
print(max([('a',2),('b',4),('c',2)]))

('c', 2)


In [114]:
import numpy as np
tag_list = [0,1,2,3]  # 标签id
hidden_node_num = len(tag_list)  # 隐状态个数（标签个数）
# 参数
global W
# W = np.random.randn(hidden_node_num, len(all_feature))
W = np.zeros((hidden_node_num, len(all_feature)))
             
def viterbi_decode(sentence_feature, W):
    path_prob = []  # 路径和该路径的概率
    pre_X = []
    best_path = []  # 最好的路径，也就是最好的tag
    for i, char_feature in enumerate(sentence_feature):
        
        if i == 0:
            char_feature.append(4)  # 前一个字的标签
            Z = [sum(W[tag][char_feature]) for tag in tag_list]   # 每个tag的概率
            # 第一个tag是当前字的标签，第二个tag是上一个字的标签
            path_prob.append({tag:(tag, prob) for tag, prob in enumerate(Z)})
        else:
            Z = [sum(W[tag][char_feature]) for tag in tag_list]   # 每个tag的概率
            res = {}
            for j, prob in enumerate(Z):
                # 加上上一个字的标签分别为0，1，2，3时的概率，然后求最大
                max_prob, pre_tag =  max([(prob + W[j][pre_tag] + value[1], pre_tag) for pre_tag, value in path_prob[i-1].items()])
                res[j] = (pre_tag, max_prob)
            path_prob.append(res)
    #print('path_prob:',path_prob)
    # 确定最后一个字的tag，然后回溯，确定路径（确定每个字的tag）
    last_tag = -1
    max_prob = -100000
    for tag in tag_list:
        prob = path_prob[-1][tag][1]
        if max_prob < prob:
            last_tag = tag
    #last_tag = sorted(, key=lambda x: x[1][1], reverse = True)[0][0]
    #print('last_tag:', last_tag)
    # last_tag = np.argmax([pp[1][1] for pp in path_prob[-1]])
    best_path.append(last_tag)
    for pp in reversed(path_prob):
        best_path.append(pp[last_tag][0])
        last_tag = pp[last_tag][0]
    best_path.pop()
    best_path.reverse()
    #print('best_path:', best_path)
    return best_path
                

def precision_and_recall(predict_sentence_tags, actual_sentence_tags):
    correct_count = 0
    for i in range(len(actual_sentence_tags)):
        #print('predict_sentence_tags[i]:', predict_sentence_tags[i])
        #print('actual_sentence_tags[i]:', actual_sentence_tags[i])
        if predict_sentence_tags[i] == actual_sentence_tags[i]:
            correct_count += 1
    return correct_count / len(actual_sentence_tags)
    

import time
# 训练
def train(sentence_feature_list, sentence_tag_list, maxIteration, ratio = 0.7):
    global W
    print(len(sentence_feature_list))
    end = int(np.floor(ratio*len(sentence_feature_list)))
    print(end)
    x_train = sentence_feature_list[:end]
    x_val = sentence_feature_list[end:]
    y_train = sentence_tag_list[:end]
    y_val = sentence_tag_list[end:]
    for iter in range(maxIteration):
        start = time.time()
        for i, sentence_feature in enumerate(x_train):
#             print('i--', i)
            if i % 10000 == 0:
#                 print('10000次耗时：', time.time()-start)
                start = time.time()
                print('i--', i)
#             start = time.time()
            predict_tag = viterbi_decode(sentence_feature, W)
#             print('viterbi_decode耗时：', time.time()-start)
            actual_tag = y_train[i]
#             print('predict_tag:', predict_tag)
#             print('actual_tag:', actual_tag)
            if predict_tag == actual_tag:
                #print('true')
                continue
            else:
                # 更新权重：每个特征上的实际标签权重增加(+1)，其他标签权重减少(-1)
#                 start = time.time()
                for j, char_feature in enumerate(sentence_feature):
#                     print('predict_tag:', predict_tag)
#                     print('actual_tag:', actual_tag)
                    if predict_tag[j] == actual_tag[j]:
                        continue
#                     print('char_feature:',char_feature)
                    for tag in tag_list:
                        if tag == actual_tag[j]:
                            #print('W[tag][char_feature] == ', W[tag][char_feature])
                            W[tag][char_feature] += 1
                            if j == 0:  # 第一个字
                                W[tag][4] += 1
                            else:
                                W[tag][actual_tag[j-1]] += 1
                            #print('W[tag][char_feature] ++1 == ', W[tag][char_feature])
                        elif tag == predict_tag[j]:
                            W[tag][char_feature] -= 1
                            if j == 0:  # 第一个字
                                W[tag][4] -= 1
                            else:
                                W[tag][predict_tag[j-1]] -= 1
#                 print('W耗时：', time.time()-start)

        # 本轮次训练集和验证集上的结果
        train_predict_sentence_tags = [viterbi_decode(sentence_feature, W) for sentence_feature in x_train[:100]]
        print('训练集准确率：', precision_and_recall(train_predict_sentence_tags[:100], y_train[:100]))
        val_predict_sentence_tags = [viterbi_decode(sentence_feature, W) for sentence_feature in x_val]
        print('验证集准确率：', precision_and_recall(val_predict_sentence_tags, y_val))

print(len(sentence_feature_list))  
result_dict = dict(zip(sentence_feature_list, sentence_tag_list))
import random
random.shuffle(result_dict)
sentence_feature_list_shuffle = result_dict.keys()
sentence_tag_list_shuffle = result_dict.values()
sentence_feature_list2 = sentence_feature_list_shuffle[:40000]
sentence_tag_list2 = sentence_tag_list_shuffle[:40000]

train(sentence_feature_list_shuffle, sentence_tag_list_shuffle, 3, ratio = 0.8)

297959


TypeError: unhashable type: 'list'

In [110]:
import random 
print(sentence_feature_list[10])
print(sentence_tag_list[10])
random.seed(1)
random.shuffle(sentence_feature_list)
random.shuffle(sentence_tag_list)
print(sentence_feature_list[10])
print(sentence_tag_list[10])

[[5, 193, 194, 8, 793, 197, 794, 4, 4, 4, 4, 4, 4], [199, 200, 629, 795, 203, 796, 797], [206, 635, 798, 208, 799, 800, 801], [640, 802, 803, 804, 805, 806, 807], [808, 809, 810, 811, 812, 813, 814], [815, 816, 276, 817, 818, 819, 820], [821, 282, 822, 823, 824, 825, 826], [286, 827, 364, 828, 829, 830, 831], [832, 370, 833, 834, 835, 836, 837], [376, 838, 124, 839, 840, 841, 128], [842, 130, 42, 843, 844, 133, 46]]
[3, 2, 3, 2, 3, 2, 3, 2, 3, 2, 3]
[[5, 1127, 13765, 8, 11196, 178760, 663255, 4, 4, 4], [1133, 13769, 1875, 11198, 178761, 663256, 782791], [13772, 1880, 25248, 178762, 663257, 782793, 25252], [1886, 25254, 16351, 663259, 782795, 25257, 132857], [25259, 16356, 35, 782796, 25261, 132859, 39], [16362, 41, 42, 25263, 132860, 45, 46]]
[3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, 2, 3, 2, 3]


In [41]:
print(max(all_feature.values()))

1709877


In [97]:
s = time.time()
print(W[1][1])
e = time.time()
print(e-s)

-1.0
0.0004749298095703125


In [86]:
s = time.time()
print(W.take([[1,1],[1,3],[1,5],[1,7],[1,9],[1,44],[1,445]]))
e = time.time()
print(e-s)

[[-1. -1.]
 [-1. -2.]
 [-1.  0.]
 [-1.  0.]
 [-1.  0.]
 [-1.  0.]
 [-1.  0.]]
0.0006079673767089844


In [96]:
arr = [[0 for x in range(len(all_feature))] for y in range(hidden_node_num)]
s = time.time()
print(arr[1][1])
e = time.time()
print(e-s)
print(np.array(arr).shape)

0
0.0002579689025878906
(4, 1709878)


In [102]:
import numpy as np
tag_list = [0,1,2,3]  # 标签id
hidden_node_num = len(tag_list)  # 隐状态个数（标签个数）
# 参数
global W
W = [0 for i in range(hidden_node_num*len(all_feature))]
             
def viterbi_decode(sentence_feature, W):
    path_prob = []  # 路径和该路径的概率
    pre_X = []
    best_path = []  # 最好的路径，也就是最好的tag
    for i, char_feature in enumerate(sentence_feature):
        
        if i == 0:
            char_feature.append(4)  # 前一个字的标签
            Z = [0 for i in range(4)]   # 每个tag的概率
            for tag in tag_list:
                for feature in char_feature:
                    Z[tag] += W[tag*feature]
            # 第一个tag是当前字的标签，第二个tag是上一个字的标签
            path_prob.append({tag:(tag, prob) for tag, prob in enumerate(Z)})
        else:
            Z = [0 for i in range(4)]   # 每个tag的概率
            for tag in tag_list:
                for feature in char_feature:
                    Z[tag] += W[tag*feature]
            res = {}
            for j, prob in enumerate(Z):
                # 加上上一个字的标签分别为0，1，2，3时的概率，然后求最大
                max_prob, pre_tag =  max([(prob + W[j*pre_tag] + value[1], pre_tag) for pre_tag, value in path_prob[i-1].items()])
                res[j] = (pre_tag, max_prob)
            path_prob.append(res)
    #print('path_prob:',path_prob)
    # 确定最后一个字的tag，然后回溯，确定路径（确定每个字的tag）
    last_tag = -1
    max_prob = -100000
    for tag in tag_list:
        prob = path_prob[-1][tag][1]
        if max_prob < prob:
            last_tag = tag
    #last_tag = sorted(, key=lambda x: x[1][1], reverse = True)[0][0]
    #print('last_tag:', last_tag)
    # last_tag = np.argmax([pp[1][1] for pp in path_prob[-1]])
    best_path.append(last_tag)
    for pp in reversed(path_prob):
        best_path.append(pp[last_tag][0])
        last_tag = pp[last_tag][0]
    best_path.pop()
    best_path.reverse()
    #print('best_path:', best_path)
    return best_path
                

def precision_and_recall(predict_sentence_tags, actual_sentence_tags):
    correct_count = 0
    for i in range(len(actual_sentence_tags)):
        #print('predict_sentence_tags[i]:', predict_sentence_tags[i])
        #print('actual_sentence_tags[i]:', actual_sentence_tags[i])
        if predict_sentence_tags[i] == actual_sentence_tags[i]:
            correct_count += 1
    return correct_count / len(actual_sentence_tags)
    

import time
# 训练
def train(sentence_feature_list, sentence_tag_list, maxIteration, ratio = 0.7):
    global W
    print(len(sentence_feature_list))
    end = int(np.floor(ratio*len(sentence_feature_list)))
    print(end)
    x_train = sentence_feature_list[:end]
    x_val = sentence_feature_list[end:]
    y_train = sentence_tag_list[:end]
    y_val = sentence_tag_list[end:]
    for iter in range(maxIteration):
        start = time.time()
        for i, sentence_feature in enumerate(x_train):
#             print('i--', i)
            if i % 10000 == 0:
                print('10000次耗时：', time.time()-start)
                start = time.time()
                print('i--', i)
#             start = time.time()
            predict_tag = viterbi_decode(sentence_feature, W)
#             print('viterbi_decode耗时：', time.time()-start)
            actual_tag = y_train[i]
#             print('predict_tag:', predict_tag)
#             print('actual_tag:', actual_tag)
            if predict_tag == actual_tag:
                #print('true')
                continue
            else:
                # 更新权重：每个特征上的实际标签权重增加(+1)，其他标签权重减少(-1)
#                 start = time.time()
                for j, char_feature in enumerate(sentence_feature):
#                     print('predict_tag:', predict_tag)
#                     print('actual_tag:', actual_tag)
                    if predict_tag[j] == actual_tag[j]:
                        continue
#                     print('char_feature:',char_feature)
                    for tag in tag_list:
                        if tag == actual_tag[j]:
                            #print('W[tag][char_feature] == ', W[tag][char_feature])
                            for feature in char_feature:
                                W[tag*feature] += 1
                            if j == 0:
                                W[tag*4] += 1
                            else:
                                W[tag*predict_tag[j-1]] += 1
                            #print('W[tag][char_feature] ++1 == ', W[tag][char_feature])
                        elif tag == predict_tag[j]:
                            for feature in char_feature:
                                W[tag*feature] -= 1
                            if j == 0:
                                W[tag*4] -= 1
                            else:
                                W[tag*predict_tag[j-1]] -= 1
#                 print('W耗时：', time.time()-start)

        # 本轮次训练集和验证集上的结果
        train_predict_sentence_tags = [viterbi_decode(sentence_feature, W) for sentence_feature in x_train[:100]]
        print('训练集准确率：', precision_and_recall(train_predict_sentence_tags[:100], y_train[:100]))
        val_predict_sentence_tags = [viterbi_decode(sentence_feature, W) for sentence_feature in x_val]
        print('验证集准确率：', precision_and_recall(val_predict_sentence_tags, y_val))

print(len(sentence_feature_list))  
sentence_feature_list2 = sentence_feature_list[:40000]
sentence_tag_list2 = sentence_tag_list[:40000]
train(sentence_feature_list, sentence_tag_list, 5, ratio = 0.7)

297959
297959
208571
10000次耗时： 4.76837158203125e-06
i-- 0
10000次耗时： 2.3740899562835693
i-- 10000
10000次耗时： 2.0779740810394287
i-- 20000
10000次耗时： 1.9559738636016846
i-- 30000
10000次耗时： 1.9666359424591064
i-- 40000
10000次耗时： 1.8721880912780762
i-- 50000
10000次耗时： 1.891402006149292
i-- 60000
10000次耗时： 1.8912060260772705
i-- 70000
10000次耗时： 2.078833818435669
i-- 80000
10000次耗时： 1.8802359104156494
i-- 90000
10000次耗时： 1.9765539169311523
i-- 100000
10000次耗时： 1.9043879508972168
i-- 110000
10000次耗时： 1.8886809349060059
i-- 120000
10000次耗时： 1.8028919696807861
i-- 130000
10000次耗时： 1.9215140342712402
i-- 140000
10000次耗时： 2.02199125289917
i-- 150000
10000次耗时： 1.937804937362671
i-- 160000
10000次耗时： 1.860177993774414
i-- 170000
10000次耗时： 1.8545746803283691
i-- 180000
10000次耗时： 1.8961849212646484
i-- 190000
10000次耗时： 1.8013789653778076
i-- 200000
训练集准确率： 0.85
验证集准确率： 0.7147044345997225
10000次耗时： 1.5020370483398438e-05
i-- 0
10000次耗时： 1.9403977394104004
i-- 10000
10000次耗时： 1.963986873626709
i-- 20000
1

In [98]:
a = []
a[10][1] = 10
print(a)

IndexError: list index out of range

In [4]:
import os
with open('/Users/a1/Downloads/intro.txt','r',encoding='utf-8') as f:
    i=0
    for line in f.readlines():
        i+=1
        line = line.strip()
        with open('/Users/a1/Downloads/intro/'+str(i)+'.ann','a+',encoding='utf-8') as wf:
            wf.write('')
