# 将分词器bug修复后,继续将词表压缩到262144

In [1]:
import os
import re
import time
import json
import numpy as np
from tqdm import tqdm
import concurrent.futures
from collections import Counter,defaultdict

In [2]:
%%time
# Load the candidate vocabulary; only the frequency column is used.
# Each line is expected to have the form "<token>\t<count>\n".
pattern = re.compile(r'^[\u4e00-\u9fff]+$')
words_count = defaultdict(int)
with open('vocabs/vocab_b_338724.txt','r',encoding='utf-8') as f:
    for word in f:
        # A line starting with a tab cannot be split into exactly two fields.
        # Skip it entirely so the previous line's (k, v) is not counted again.
        if word[0] == '\t':
            continue
        k, v = word.split('\t')
        # int() ignores surrounding whitespace, so this also works when the
        # last line of the file has no trailing newline.
        v = int(v)
        # Keep single-character tokens and tokens made only of characters in
        # the U+4E00..U+9FFF range.
        if len(k) == 1 or pattern.match(k):
            words_count[k] += v
print('候选词表大小:',len(words_count))

候选词表大小: 338723
CPU times: user 450 ms, sys: 20.1 ms, total: 471 ms
Wall time: 468 ms


In [3]:
%%time
N = 7  # longest n-gram length (in characters) a vocabulary word may have
words = list(words_count)
# count_sum[n - 1] accumulates the total frequency of all words of length n.
count_sum = [0] * N
for word, freq in words_count.items():
    count_sum[len(word) - 1] += freq

CPU times: user 83.9 ms, sys: 1.9 ms, total: 85.8 ms
Wall time: 85.5 ms


In [4]:
# Geometric shrink schedule: T steps from vocab_init down to vocab_target,
# each step dividing the vocabulary size by the same factor `gap`.
T = 12
vocab_init = 5690450
vocab_target = 262144
gap = (vocab_init / vocab_target) ** (1 / T)
remain_list = [int(vocab_init / gap ** step + 0.5) for step in range(1, T + 1)]
thr = 0.0
# Sort all word frequencies in ascending order so that the r-th largest
# frequency can be read off as v_list[-r].
v_list = sorted(words_count.values())
# Use the first scheduled size that is smaller than the current vocabulary.
for r in remain_list:
    if r >= len(words_count):
        continue
    thr = v_list[-r]
    print("过滤阈值：",thr,"预计词表大小：",r)
    break

过滤阈值： 3877 预计词表大小： 262144


In [5]:
%%time
import ahocorasick as ah

# Build the matching automaton over the UTF-8 bytes of every word whose
# frequency is strictly above the threshold `thr`.
aca = ah.Automaton()
for word in words:
    freq = words_count[word]
    if freq <= thr:
        continue
    key = word.encode()
    # Stored value: (length of the word in bytes, log of the word's frequency
    # relative to the total frequency of all words with the same character length).
    score = np.log(freq / count_sum[len(word) - 1])
    aca.add_word(key, (len(key), score))
aca.make_automaton()

CPU times: user 651 ms, sys: 19 ms, total: 670 ms
Wall time: 672 ms


In [6]:
def tokenizer(encode_text,alpha=1.0):
    """Segment a byte string into tokens using the module-level automaton ``aca``.

    Every match reported by ``aca.iter`` is treated as a candidate word ending
    at some byte position. For each end position one incoming word is kept
    together with the accumulated score of the path leading to it; the final
    segmentation is read off by walking those records backwards from the last
    byte. Positions that never received a record become single-byte tokens.

    Args:
        encode_text: the text to segment, as bytes.
        alpha: if ``alpha >= 5.0`` the highest-scoring candidate is kept
            deterministically; otherwise candidates ending at the same position
            are sampled with weights ``exp(alpha * (score - base))``, where
            ``base`` is the score of the first candidate seen for that position.

    Returns:
        A list of byte-string tokens in REVERSE order (last token first).
        The list is deliberately not reversed, to save time.
    """
    LOT = len(encode_text)
    BOW = 0   # index into a route entry: start position of the chosen word
    VOW = 1   # index into a route entry: accumulated score of the chosen path
    VOID = 5  # sentinel score meaning "no word recorded for this end position"
    # NOTE(review): VOID is only safe if real accumulated scores can never equal 5;
    # presumably the per-word scores are non-positive log frequencies - confirm
    # against how `aca` is built.
    # routes[e] = (start, score) of the word chosen to end at byte e. The extra
    # trailing entry (-1, 0.0) is what routes[-1] resolves to, i.e. the
    # "before the first byte" state with score 0.
    routes = [(i,VOID) for i in range(LOT)] + [(-1,0.0)]
    tokens = []  # segmentation result
    # Visit every matched word.
    #   eow: end index of the match, low: length of the word, vow: value (score) of the word
    for eow, (low,vow) in aca.iter(encode_text):
        # start index of the match = end index - (length - 1)
        bow = eow - low + 1
        # Scores are negative; the closer to zero the better. The candidate's
        # score is the score recorded just before its start plus its own score.
        # If nothing is recorded just before the start, walk further back
        # (i counts how many positions had to be skipped).
        i = 0
        while routes[bow - 1 - i][VOW] == VOID:
            i += 1
        v = routes[bow - 1 -i][VOW] + vow
        # alpha of 5.0 or more: use the deterministic rule.
        if alpha >= 5.0:
            # Update when this is the first candidate for eow, or when it
            # scores higher and directly follows a recorded position (i == 0).
            if (v > routes[eow][VOW] and i == 0) or routes[eow][VOW] == VOID:
                routes[eow] = bow,v  # record start position and accumulated score
        else:
            # Randomised rule.
            # NOTE(review): base/denominator are reset only when an end position
            # is seen for the first time, so this relies on aca.iter yielding all
            # matches that share an end position consecutively - confirm.
            if routes[eow][VOW] == VOID:
                base = v
                temp = 1.0
                denominator = 1.0
                routes[eow] = bow,v
            else:
                temp = np.exp(alpha * (v - base))
                denominator += temp
                # The random draw is made before the i == 0 check, so one random
                # number is consumed for every additional candidate.
                if np.random.rand() < temp/denominator and i == 0:
                    routes[eow] = bow,v  # record start position and accumulated score
    # Walk backwards from the last byte to recover the split points. An index
    # walk is used rather than repeatedly re-slicing the remaining prefix, which
    # copied the text once per token.
    eow = LOT - 1
    while eow >= 0:
        bow = routes[eow][BOW]  # start position of the word ending at eow
        tokens.append(encode_text[bow:eow+1])
        eow = bow - 1  # continue with the word that ends just before this one
    return tokens

In [7]:
#对多个字符串计算字节数与token数量
def get_words_counter(i):
    """Tokenize every byte string in batch ``str_lists[i]`` and count the tokens.

    Args:
        i: index of the batch inside the module-level ``str_lists``.

    Returns:
        A ``Counter`` mapping each token (bytes) to the number of times it was
        produced by ``tokenizer`` (called with alpha=1.0) over the whole batch.
    """
    token_counts = Counter()
    for raw_line in str_lists[i]:
        token_counts.update(tokenizer(raw_line, 1.0))
    return token_counts

In [8]:
#获取数据集中的全部文件
# Stream the corpus, tokenize it in parallel and accumulate token frequencies.
batch = 10000            # number of lines handed to one worker task
new_counter = Counter()  # token (bytes) -> frequency over the whole corpus
str_list = []            # encoded lines buffered for the current chunk
str_len = 0              # number of characters buffered in str_list
cnt = 0                  # 1-based chunk number, shown as the progress-bar label


def _count_chunk(lines, label):
    """Tokenize `lines` in worker processes and merge the counts into new_counter.

    The lines are split into batches of `batch` items and published through the
    module-level `str_lists`, which get_words_counter indexes by batch number.
    `str_lists` is assigned before the pool is created.
    """
    global str_lists
    str_lists = [lines[start:start + batch] for start in range(0, len(lines), batch)]
    with concurrent.futures.ProcessPoolExecutor(max_workers=15) as executor:
        futures = [executor.submit(get_words_counter, i) for i in range(len(str_lists))]
        with tqdm(desc=str(label), total=len(futures)) as pbar:
            for future in concurrent.futures.as_completed(futures):
                new_counter.update(future.result())
                pbar.update(1)


with open('high_data.txt','r',encoding='utf-8') as f:
    for item in f:
        str_list.append(item.encode())
        str_len += len(item)
        # Process a chunk once more than 1e9 characters have been buffered.
        if str_len > 1e9:
            chunk, str_list, str_len = str_list, [], 0
            cnt += 1
            _count_chunk(chunk, cnt)

# Wrap-up: the remaining lines may not fill a whole chunk.
chunk, str_list, str_len = str_list, [], 0
cnt += 1
_count_chunk(chunk, cnt)

# Assemble the vocabulary to write.
dict_to_write = dict()
for k, v in new_counter.items():
    try:
        k = k.decode()
    except UnicodeDecodeError:
        # Token is not valid UTF-8 on its own (e.g. part of a multi-byte
        # character); leave it out of the text vocabulary.
        continue
    # Bare CR / LF tokens are dropped; presumably because they would break the
    # one-entry-per-line file format.
    if k != '\r' and k != '\n':
        dict_to_write[k] = v
# Write the vocabulary as "<token>\t<count>" lines; the file name carries the
# vocabulary size.
with open("vocabs/vocab_b_"+str(len(dict_to_write))+".txt","w",encoding='utf-8') as f:
    for k, v in dict_to_write.items():
        f.write(f"{k}\t{v}\n")

1: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [06:29<00:00,  6.71s/it]
2: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [06:45<00:00,  7.00s/it]
3: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [06:39<00:00,  6.88s/it]
4: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [06:31<00:00,  6.76s/it]
5: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [06:28<00:00,  6.69s/it]
6: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [06:32<00:00,  6.76s/it]
7: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 58/58 [06