#### 分词

##### 导包

In [4]:
import glob
import os
import time
from hmmlearn import hmm
import jieba
import jieba.posseg as pseg
from collections import Counter
import numpy as np
import pandas as pd

##### 设置停用词

In [5]:
# -*- coding: utf-8 -*-
stop_dir = 'stopwords.txt' # 停用词
stop = ['“' , '”' , '、' , '(' , ')'  , '，' , '|' ,
        '（' , '）' , '年' , '月' , '日' , '公告' , '采购' , '项目' ,
        '中心', '招标', '货物' , '通知' , '学类' , '研究' , '类' , '工程' ,
        '竞争性']
with open(stop_dir , 'r' , encoding='utf8') as f:
    # 设置停用词
    for i in f.readlines():
        s = i[:len(i) - 1]
        stop.append(s)
    f.close()

##### 读取文件

In [6]:
def get_content(dic_dir):
    content = ''
    # 读文件
    with open(dic_dir , 'r' , encoding='utf8') as f:
        for line in f:
            line = line.strip()
            content += line
        f.close()
    return content

dic_dir = 'total_context.txt' # 文本
files = glob.glob(dic_dir)
text = [get_content(x) for x in files][0]
for j in stop:
    text = text.replace(j , '')
corpus = text.split('。') # 需要拿句号进行分句子

with open('sort_context.txt' , 'w' , encoding='utf8') as f:
    for i in corpus:
        f.write(i + '\n')
    f.close()

##### jieba分词

In [7]:
def jieba_spilt(corpus):
    split_words = []
    for i in range(len(corpus)):
        for x in jieba.cut(corpus[i]): # 分词
            split_words.append(x)

    point_split_words = []
    for i in range(len(corpus)):
        for x in pseg.cut(corpus[i]): # 词性标注
            point_split_words.append(x)

    return split_words , point_split_words

#### 统计词频

In [8]:
split_words , _ = jieba_spilt(corpus)
d = Counter(split_words)
# 从大到小一排序
d = sorted(d.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)
print(d)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\xht\AppData\Local\Temp\jieba.cache
Loading model cost 0.679 seconds.
Prefix dict has been built successfully.


[('中国民航', 30609), ('大学', 30462), ('学报', 7214), ('机场', 6719), ('服务', 6506), ('公开', 5117), ('分享', 5052), ('大', 4213), ('中航', 4051), ('分析', 3616), ('天津', 3265), ('影响', 3256), ('学校', 3251), ('基', 3117), ('建设', 3109), ('平台', 3075), ('航空', 3015), ('系统', 2922), ('应力', 2840), ('混凝土', 2816), ('道', 2752), ('校友', 2714), ('面板', 2676), ('中', 2607), ('更', 2579), ('理', 2511), ('冻土', 2508), ('创新', 2502), ('发展', 2425), ('模型', 2412), ('推荐', 2369), ('天津市', 2330), ('道面', 2308), ('民航', 2273), ('中国', 2271), ('试验', 2248), ('学生', 2225), ('学院', 2171), ('模拟', 2143), ('文', 2107), ('教育', 2046), ('科技', 2012), ('校区', 2011), ('微博', 1994), ('磋商', 1984), ('信息', 1961), ('飞机', 1951), ('技术', 1894), ('文章', 1878), ('工作', 1861), ('剪切', 1804), ('号', 1758), ('企业', 1721), ('更正', 1712), ('优化', 1700), ('场', 1678), ('温度', 1644), ('数值', 1644), ('波速', 1636), ('区', 1564), ('分号', 1548), ('路面', 1524), ('关', 1494), ('微信', 1484), ('活动', 1480), ('国际', 1478), ('全国', 1473), ('碳', 1456), ('科学', 1451), ('评价', 1446), ('相关', 1439), ('实验室', 141

In [9]:
# 统计前50
if 'dict.txt' in os.listdir('D:\pro_of_program\Python\\NLP\experiment1'):
    os.remove('D:\pro_of_program\Python\\NLP\experiment1\dict.txt')
for i in range(0 , 50 , 1):
    with open('dict.txt' , 'a' , encoding='utf8') as f:
        f.write(d[i][0] + ',' + str(d[i][1]) + '\n')
        f.close()

##### 正向最大匹配(MM)

In [10]:
class MM:
    def __init__(self, dic_path):
        self.dictionary = set() # 字典
        self.maximum = 0 # 字典中的最大长度
        with open(dic_path, 'r', encoding='utf8') as f:
            for line in f.readlines():
                s = line.split(',')[0]
                self.dictionary.add(s)
                self.maximum = max(self.maximum, len(s))

    def cut(self, text):
        result = []
        index = 0
        length = len(text)

        while index < length - 1:
            # 每次分词前需要设置一个flag，如果本轮分词失败就跳过一个索引
            word = None
            for size in range(self.maximum , 0 , -1): # 字典中的最大长度往下枚举
                if index + size > length:
                    continue
                piece = text[index: index + size] # 从头截取size长度
                if piece in self.dictionary: # 在里面加入即可
                    word = piece
                    result.append(word)
                    index += size
                    break
            if word is None: # 字典中没有直接结束
                result.append(text[index])
                index += 1

        return result

##### 获取MM分词结果

In [11]:
MMMM = MM('dict.txt')
res = []
for i in corpus:
    temp = MMMM.cut(i)
    for j in temp:
        res.append(j)
d1 = Counter(res)
d1 = sorted(d1.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)
print(d1)

[('大学', 33383), ('中国民航', 31164), ('学', 12448), ('基', 11066), ('大', 10410), ('化', 10003), ('文', 9909), ('度', 9083), ('数', 8873), ('教', 8069), ('国', 7883), ('期', 7750), ('理', 7746), ('会', 7667), ('学报', 7462), ('科', 7327), ('机场', 7231), ('新', 7035), ('服务', 6965), ('业', 6885), ('校', 6816), ('中', 6753), ('关', 6147), ('高', 6066), ('实', 5963), ('动', 5705), ('开', 5575), ('路', 5358), ('更', 5344), ('机', 5209), ('合', 5156), ('力', 5142), ('公开', 5124), ('工', 5057), ('建', 5055), ('分享', 5052), ('性', 4975), ('生', 4971), ('导', 4857), ('成', 4824), ('部', 4810), ('道面', 4792), ('网', 4784), ('发', 4690), ('航', 4651), ('号', 4590), ('进', 4541), ('道', 4535), ('水', 4436), ('场', 4410), ('速', 4403), ('园', 4381), ('公', 4341), ('温', 4312), ('分', 4311), ('土', 4244), ('时', 4236), ('中航', 4151), ('维', 4143), ('面', 4140), ('图', 4118), ('计', 4117), ('区', 4112), ('分析', 4040), ('方', 4035), ('电', 3943), ('体', 3937), ('验', 3924), ('研', 3882), ('科技', 3875), ('行', 3854), ('航空', 3805), ('设', 3787), ('空', 3775), ('法', 3747), ('版'

##### 反向最大匹配

In [12]:
class RMM:
    def __init__(self, dic_path):
        self.dictionary = set() # 字典
        self.maximum = 0 # 字典中的最大长度
        with open(dic_path, 'r', encoding='utf8') as f:
            for line in f.readlines():
                s = line.split(',')[0]
                self.dictionary.add(s)
                self.maximum = max(self.maximum, len(s))

    def cut(self, text):
        result = []

        while len(text) >= 1:
            # 每次分词前需要设置一个flag，如果本轮分词失败就跳过一个索引
            word = None
            for size in range(self.maximum , 0 , -1): # 字典中的最大长度往下枚举
                piece = text[-size:]
                # print(piece , end = ' ')
                if piece in self.dictionary: # 在里面加入即可
                    word = piece
                    result.append(word)
                    text = text[:-size]
                    break
            if word is None: # 字典中没有直接结束
                result.append(text[-1:])
                text = text[:-1]

        return result

##### 获取分词结果

In [13]:
RMMMM = RMM('dict.txt')
res = []
for i in corpus:
    temp = RMMMM.cut(i)
    for j in temp:
        res.append(j)
d1 = Counter(res)
d1 = sorted(d1.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)
print(d1)

[('大学', 32030), ('中国民航', 30538), ('学', 12452), ('大', 11865), ('基', 11160), ('化', 10166), ('文', 9909), ('度', 9094), ('数', 8873), ('教', 8069), ('国', 7883), ('理', 7842), ('会', 7777), ('期', 7762), ('学报', 7462), ('科', 7327), ('机场', 7231), ('新', 7035), ('服务', 6965), ('业', 6934), ('中', 6800), ('道', 6715), ('校', 6424), ('关', 6147), ('高', 6066), ('动', 6010), ('实', 5963), ('开', 5688), ('路', 5358), ('更', 5344), ('力', 5234), ('机', 5209), ('合', 5156), ('公开', 5124), ('性', 5090), ('工', 5057), ('建', 5055), ('分享', 5052), ('导', 4857), ('成', 4824), ('部', 4817), ('网', 4784), ('发', 4734), ('航', 4663), ('航空', 4628), ('号', 4610), ('进', 4541), ('水', 4436), ('场', 4410), ('速', 4403), ('园', 4403), ('公', 4341), ('温', 4312), ('分', 4311), ('土', 4256), ('时', 4236), ('中航', 4151), ('维', 4143), ('面', 4140), ('图', 4122), ('区', 4120), ('计', 4117), ('分析', 4040), ('方', 4035), ('生', 4034), ('验', 3968), ('电', 3943), ('体', 3937), ('研', 3918), ('行', 3906), ('科技', 3875), ('设', 3787), ('值', 3786), ('法', 3755), ('版', 3735), ('通',

In [8]:
import pickle
import os

class HMM:
    def __init__(self):
        self.model_file = './data/hmm_model.pkl'  # 保存训练结果的路径
        self.state_list = ['B', 'M', 'E', 'S']  # 隐藏状态列表
        self.load_para = False  # 是否需要重新加载model_file

        self.A_dic = {}  # 状态转移概率(隐藏状态->隐藏状态)
        self.B_dic = {}  # 发射概率(隐藏状态->词语/观察状态)
        self.Pi_dic = {}  # 隐藏状态的初始概率

    def try_load_model(self, trained):
        print(trained)
        if trained:
            # 二进制模式的文件IO不指定编码方式
            with open(self.model_file, 'rb') as f:
                self.A_dic = pickle.load(f)  # 若之前有dump多次，可以load多次
                self.B_dic = pickle.load(f)  # load操作不影响f
                self.Pi_dic = pickle.load(f)
                self.load_para = True

    def train(self, path):
        # self.try_load_model(False)  # 重置概率矩阵，因为要重新训练
        count_dic = {}  # 统计状态出现的次数，用于求P(o)

        def init_parameter():
            for state in self.state_list:
                # 从状态state出发，到各个状态s的转移概率
                self.A_dic[state] = {s: 0.0 for s in self.state_list}
                # 从状态state发射到输出符号的概率，由于输出的词未定，所以空字典
                self.B_dic[state] = {}
                # 初始概率向量。字典中的元素为数，而不是字典。与上面两个矩阵不同
                self.Pi_dic[state] = 0.0

                count_dic[state] = 0

        def make_label(word):
            label = []
            if len(word) == 1:
                label.append('S')
            else:
                label += ['B'] + ['M'] * (len(word)-2) + ['E']
            return label

        init_parameter()  # 初始化A,B,Pi以及count
        line_num = 0  # 用于计算概率时作为分母，书中为0是错误的
        last_label = None
        with open(path, encoding='utf8') as f:
            for line in f:
                line_num += 1
                line = line.strip()
                if not line:
                    continue

                # 把每一个字加入到字的集合中，在计算发射概率时有用
                word_list = [i for i in line if i != ' ']
                # 词典中的词的列表，是待标记的观察序列
                line_list = line.split()
                # 记录每一行标记结果(隐藏状态序列)
                line_state = []
                for w in line_list:
                    line_state.extend(make_label(w))

                # 确保所有字都做上标记
                assert len(line_state) == len(word_list)

                # 更新每一行的最后一个标记，为之后的计算转移概率做准备
                last_label = line_state[-1]

                for k, v in enumerate(line_state):
                    if k < (len(line_state) - 1):
                        count_dic[v] += 1.0
                    # 记录每一个隐藏状态的出现次数
                    # 由于初始化时不知道有什么词，所以发射概率矩阵并未对
                    # 每一个词所在项初始化为0，因此不能简单+=，需要手动+1
                    self.B_dic[v][word_list[k]] =\
                        self.B_dic[v].get(word_list[k], 0) + 1.0
                    if k == 0:  # 初始状态
                        self.Pi_dic[v] += 1
                    else:
                        # 由于初始化时有把转移概率矩阵的每一项初始化为0，所以+=
                        self.A_dic[line_state[k-1]][v] += 1.0

        # 根据频度计算概率
        self.Pi_dic = {k: v*1.0 / line_num for k, v in self.Pi_dic.items()}

        # 用于计算发射概率。记录每个隐藏状态发射到输出符号(词)的总数
        sum_of_B_dic = {}
        for k, v in self.B_dic.items():
            sum_of_B_dic[k] = sum(v1 for k1, v1 in v.items())

        temp = {k: {k1: v1 / (count_dic[k] + 0.001) for k1, v1 in v.items()}
                for k, v in self.A_dic.items() if k != last_label}
        for k, v in temp.items():
            self.A_dic[k] = v
        # 对于隐藏结尾状态，应该将分母减一，否则概率和不为1
        self.A_dic[last_label] = {k: v / (count_dic[last_label] - 1.0)
                                  for k, v in self.A_dic[last_label].items()
                                  if len(self.A_dic[last_label].items()) > 0}

        # 拉普拉斯平滑，分子加一，分母加 分子个数*1
        # 书中分母不对，应该是每个隐藏状态有自己的分母，分母是由该状态发射出去的
        # 总数
        self.B_dic = {k: {k1: (v1 + 1) / (sum_of_B_dic[k] + len(v.items()))
                          for k1, v1 in v.items()}
                      for k, v in self.B_dic.items()}

        # print(self.Pi_dic, self.A_dic, self.B_dic)
        with open(self.model_file, 'wb') as f:
            pickle.dump(self.A_dic, f)
            pickle.dump(self.B_dic, f)
            pickle.dump(self.Pi_dic, f)

        return self

    def cut(self , text):
        state = self.viterbi(text , ['S', 'B', 'M', 'E'] , self.Pi_dic , self.B_dic , self.A_dic , 1e-6)
        cut_res = []
        begin = 0
        for i, ch in enumerate(text):
            if state[i] == 'B':
                begin = i
            elif state[i] == 'E':
                cut_res.append(text[begin:i + 1])
            elif state[i] == 'S':
                cut_res.append(text[i])
        return cut_res

    def viterbi(self , text , state_list , start_p , emit_p , trans_p ,smooth):
        """Viterbi 算法
        Args:
            text (string): 句子
        Returns:
            list: 最优标注序列
        """
        text = list(text)
        dp = pd.DataFrame(index = state_list)
        # 初始化 dp 矩阵 (prop，last_state)
        dp[0] = [(start_p[s] * emit_p[s].get(text[0], smooth),
                  '_start_') for s in state_list]
        # 动态规划地更新 dp 矩阵
        for i, ch in enumerate(text[1:]):  # 遍历句子中的每个字符 ch
            dp_ch = []
            for s in state_list:  # 遍历当前字符的所有可能状态
                emit = emit_p[s].get(ch, smooth)
                # 遍历上一个字符的所有可能状态，寻找经过当前状态的最优路径
                (prob, last_state) = max([
                    (dp.loc[ls, i][0] * trans_p[ls].get(s, smooth) *
                     emit, ls) for ls in state_list
                ])
                dp_ch.append((prob, last_state))

            # frames = pd.DataFrame(pd.Series(dp_ch) , columns = i + 1)
            # dp = pd.concat([dp , frames], axis=1)
            dp[i + 1] = dp_ch.copy()
        # 回溯最优路径
        path = []
        end = list(dp[len(text) - 1])
        back_point = state_list[end.index(max(end))]
        path.append(back_point)
        for i in range(len(text) - 1, 0, -1):
            back_point = dp.loc[back_point, i][1]
            path.append(back_point)
        path.reverse()
        return path

##### 打印矩阵

In [9]:
HMMMM = HMM()
HMMMM.train('split_word.txt')
print('状态转移概率矩阵：' , HMMMM.A_dic)
print('发射概率矩阵：' , HMMMM.B_dic)
print('初始状态概率矩阵：' , HMMMM.Pi_dic)

状态转移概率矩阵： {'B': {'B': 0.0, 'M': 0.2021874953016417, 'E': 0.7978125031398688, 'S': 0.0}, 'M': {'B': 0.0, 'M': 0.3097289084054968, 'E': 0.6902710862737977, 'S': 0.0}, 'E': {'B': 0.8062246257665844, 'M': 0.0, 'E': 0.0, 'S': 0.1937769327275986}, 'S': {'B': 0.9409770239453815, 'M': 0.0, 'E': 0.0, 'S': 0.05902296848660107}}
发射概率矩阵： {'B': {'学': 0.039874995916678466, '前': 0.0005257762588685855, '成': 0.004374209585616753, '国': 0.00782753294268261, '委': 0.0011542188878121019, '航': 0.008421753448267816, '民': 0.005433539858662631, '更': 0.0042248766836895804, '中': 0.0636889271208772, '教': 0.013212850718431257, '批': 0.0003655544995092236, '发': 0.007922421557448834, '直': 0.0003017769059778272, '领': 0.0015726621234205323, '亲': 9.333306370448263e-05, '关': 0.003641545035536564, '建': 0.009753305157118435, '伊': 0.00010733302326015503, '毛': 0.00010422192113667227, '主': 0.0033724347018553058, '命': 0.00020844384227334455, '槐': 1.2444408493931018e-05, '校': 0.013111739899418069, '周': 0.0003577767442005168, '总'

##### viterbi获取最优标注序列

In [10]:
test = '学校前身成立中国民革命军事委员会民航空局二民航学校更名中国民航空学院教育部批准更名中国民航大学'
path = HMMMM.viterbi(test , HMMMM.state_list , HMMMM.Pi_dic , HMMMM.B_dic , HMMMM.A_dic , 1e-6)
print(path)

['B', 'E', 'B', 'E', 'B', 'E', 'B', 'M', 'M', 'E', 'B', 'E', 'S', 'B', 'E', 'B', 'M', 'E', 'B', 'E', 'B', 'M', 'E', 'B', 'E', 'B', 'E', 'B', 'M', 'M', 'E', 'S', 'B', 'E', 'B', 'M', 'E', 'B', 'E', 'B', 'E', 'B', 'M', 'M', 'E', 'B', 'E']


##### HMM分词

In [11]:
res = HMMMM.cut(test)
print(res)

['学校', '前身', '成立', '中国民革', '命军', '事', '委员', '会民航', '空局', '二民航', '学校', '更名', '中国民航', '空', '学院', '教育部', '批准', '更名', '中国民航', '大学']
