In [2]:
import pickle as pkl
import time

def build_train_data(file_path):   #file_path = 'data/train.txt'
    with open(file_path,'r',encoding='utf-8') as f:
        lines = f.readlines()
    phrase_expel = []
    for i in lines:   #把读到的文件中的 引号 回车 和 空格 删掉
        t1 = i.replace('“  ','')
        t2 = t1.replace('\n','')
        t3 = t2.replace('  ','')
        phrase_expel.append(t3)

    with open('data/generate_pkl/train_data.pkl', 'wb') as f: #把这个处理后的文件当作训练数据
        pkl.dump(phrase_expel, f)   #把文件写成pkl格式

In [3]:
def build_target(file_path):  #生成目标文件
    with open(file_path,'r',encoding='utf-8') as f:
        tmp = f.readlines()  #tmp为每一行的数据

    t = []
    for i in tmp: #删掉引号和回车，这里不删空格，空格是分词的标志
        t1 = i.replace('“  ','')
        t2 = t1.replace('\n','')
        t.append(t2)

    sum_list = []
    for i in t:
        sum_ = ''
        for j in i.split():  #以空格为分割，一个词一个词的提取
            if len(j) == 1: #如果词的长度为1 ，就标记为S -single
                sum_ += 'S'
                continue
            else:
                sum_ += 'B' #如果长度不为1，标记为一个词的开始 begin
                for k in range(1, len(j)):
                    if k == len(j) - 1: #如果是这个词的最后一个，就标记为end
                        sum_ += 'E'
                    else:
                        sum_ += 'M'  #其他情况就是middle
        sum_list.append(sum_)

    with open('data/generate_pkl/target.pkl', 'wb') as f:
        pkl.dump(sum_list, f)

In [4]:
def build_vocab_dict(file_path):  #'data/train_data.pkl'
    vocab_dic = {}
    with open(file_path, 'rb') as f:
        z = pkl.load(f)
        for line in z:
            for hang in line:  #统计词频，按照词多到少排列
                vocab_dic[hang] = vocab_dic.get(hang, 0) + 1
        vocab_dic_sorted = sorted(vocab_dic.items(), key=lambda x: x[1], reverse=True)

    vocab_dic2 = {word_count[0]: idx for idx, word_count in enumerate(vocab_dic_sorted)}
    with open('data/generate_pkl/vocab.pkl', 'wb') as f:
        pkl.dump(vocab_dic2, f)

In [5]:
build_train_data("data/sighan.txt")# 创建原始数据文件，没有进行分词

In [6]:
build_target('data/sighan.txt')# 这个文件数据进行分词了

In [7]:
build_vocab_dict('data/generate_pkl/train_data.pkl')# 统计词频

In [8]:
# pytorch实现双层BP神经网络实现功能
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle as pkl
from tqdm import tqdm

In [9]:
class Config(object):
    # 参数设置类，包含一些相关参数
    def __init__(self):
        self.vocab = pkl.load(open('data/generate_pkl/vocab.pkl', 'rb'))  # 读取词表
        self.train_data = pkl.load(open('data/generate_pkl/train_data.pkl', 'rb'))  # 读取训练数据
        self.target = pkl.load(open('data/generate_pkl/target.pkl', 'rb'))  # 读取标签

        self.learning_rate = 0.000015  # 学习率
        self.epoch = 1  # epoch次数

        self.output_size = 4
        self.embed_dim = 16
        self.hout1 = 32
        self.hout2 = 64

        self.num_layers = 2 # 测试双层LSTM神经网络

In [10]:
# BP神经网络
class Model(nn.Module):
    def __init__(self, output_size, vocab_size, embed_dim,hout1,hout2):
        super(Model, self).__init__()
        #把每一个字都表示为embed_dim维的字向量
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        #隐藏层为全连接层
        self.hid_layer1 = nn.Linear(embed_dim, hout1)
        self.hid_layer2 = nn.Linear(hout1, hout2)
        self.out_layer = nn.Linear(hout2, output_size)

    #self指的是上面的初始化模型参数，in_layer指的是待分词的句子的张量表示
    def forward(self, in_layer):
        #将in_layer张量中的每一个元素（字的序号）都变成一个embed_dim维的张量
        emd = self.embedding(in_layer)
        #将每个字从一个embed_dim维变为hout1维的向量，神经元的出现（w，b）
        h_out1 = self.hid_layer1(emd)
        #将每个字从一个hout1维变为hout2维的向量，神经元的增加（w‘，b’）
        h_out2 = self.hid_layer2(h_out1)
        #非线性变换
        out_ = F.relu(h_out2)
        #将hout2维变为output_size维
        out_ = self.out_layer(out_)
        #每一个字都会得到到一个 为BMES的概率，最大的即为所预测的
        out_ = F.softmax(out_, dim=1)
        return out_

In [11]:
def model_eval(model_out, true_label):
    confusion_matrix = torch.zeros([2, 2], dtype=torch.long)
    predict_label = torch.argmax(model_out, 1)
    accuracy = []
    precision = []
    recall = []
    f_1 = []
    for l in range(4):
        tp_num, fp_num, fn_num, tn_num = 0, 0, 0, 0
        for p, t in zip(predict_label, true_label):
            if p == t and t == l:
                tp_num += 1
            if p == l and t != l:
                fp_num += 1
            if p != l and p != t:
                fn_num += 1
            if p != l and p == t:
                tn_num += 1
        accuracy.append((tp_num + tn_num) / (tp_num + tn_num + fp_num + fn_num))
        try:
            prec = tp_num / (tp_num + fp_num)
        except:
            prec = 0.0
        try:
            rec = tp_num / (tp_num + fn_num)
        except:
            rec = 0
        precision.append(prec)
        recall.append(rec)
        if prec == 0 and rec == 0:
            f_1.append(0)
        else:
            f_1.append((2 * prec * rec) / (prec + rec))
    ave_acc = torch.tensor(accuracy, dtype=torch.float).mean()
    ave_prec = torch.tensor(precision, dtype=torch.float).mean()
    ave_rec = torch.tensor(recall, dtype=torch.float).mean()
    ave_f1 = torch.tensor(f_1, dtype=torch.float).mean()
    return ave_acc, ave_prec, ave_rec, ave_f1

In [12]:
#建议注释掉这一个函数，因为test_  有可能会和内置函数重名了，改个名字也行~
def test_Split(model_):
    text = '在一九九八年来临之际，我十分高兴地通过中央人民广播电台、中国国际广播电台和中央电视台，向全国各族人民，向香港特别行政区同胞、澳门和台湾同胞、海外侨胞，向世界各国的朋友们，致以诚挚的问候和良好的祝愿！'
    hang_ = []
    for wd in text:
        # print(wd) # test
        hang_.append(Config().vocab[wd])
    test_tensor = torch.tensor(hang_, dtype=torch.long)
    res = model_(test_tensor)
    res = res.detach().numpy()
    [print(np.argmax(r), end=",") for r in res]
    print("\n")
    print(res)

In [13]:
#设置参数的起点
torch.manual_seed(1)
config = Config()
voc_size = len(config.vocab)

In [14]:
train_data_list = []
for lin in config.train_data:
    hang = []
    for word in lin:
        hang.append(config.vocab[word])
    #将列表类型转变为张量类型
    train_data_list.append(torch.tensor(hang, dtype=torch.long))
print(train_data_list[:2])

[tensor([1272,  186,  669,  558,  575,  440,    1,   35,  185,  380,  194,  194,
           5,  487,  487,  479,   14,   35,   14,  553,  294,   64,  791,  332,
         362,   10,  247,   65]), tensor([  7, 217,   7, 342, 124, 258, 108,   3,   4,  34,  53, 426, 274, 567,
         30])]


In [15]:
target_dict = {'B': 0,
               'M': 1,
               'E': 2,
               'S': 3}

In [16]:
target_list = []
for lin in config.target:
    hang = []
    for tag in lin:
        hang.append(target_dict[tag])
    target_list.append(torch.tensor(hang, dtype=torch.long))

#现在 train_data_list target_list 里面的元素都是张量，它们是由张量构成的列表
print(target_list[:2])

[tensor([0, 2, 0, 2, 0, 2, 3, 3, 0, 2, 0, 2, 0, 1, 1, 1, 2, 0, 2, 0, 2, 3, 3, 0,
        2, 3, 3, 3]), tensor([0, 1, 1, 2, 0, 1, 2, 3, 0, 2, 0, 2, 3, 0, 2])]


In [17]:
print(len(train_data_list), len(target_list))

19056 19056


In [18]:
model = Model(config.output_size, voc_size, config.embed_dim,config.hout1,config.hout2)
#model __init__()函数已经生效了
losses = []
acc = []
rec = []
prec = []
f1 = []
optimizer = torch.optim.SGD(model.parameters(), lr=config.learning_rate)# 随机梯度下降优化器
#model.parameters()这里的参数就是__init__()函数里面的
loss_f = nn.CrossEntropyLoss()# 交叉熵损失函数

In [19]:
# 将代码上传到GPU进行计算
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
train_data_list = [i.to(device) for i in train_data_list]
target_list = [i.to(device) for i in target_list]

In [21]:
#实现进度条式输出
for i in tqdm(range(config.epoch)):
    for j, k in enumerate(train_data_list):
        optimizer.zero_grad()
        #开始forward的作用，预测句子k的标签
        out = model(k)
        #计算交叉熵损失
        loss = loss_f(out, target_list[j])
        #误差反向传播
        loss.backward()
        #参数向前更新
        optimizer.step()
        #模型评价
        acc_, prec_, rec_, f1_ = model_eval(out, target_list[j])
        acc.append(acc_.item())
        prec.append(prec_.item())
        rec.append(rec_.item())
        f1.append(f1_.item())
        print("\r训练数据进度：{:.2f}%\t".format((j + 1) / len(train_data_list) * 100), end='')
        print('acc: ' + str(acc_.item()) + '\tprec: ' + str(prec_.item()) +'\trec: ' + str(rec_.item()) + '\tf1: ' + str(f1_.item()), end='')
        losses.append(loss.item())
    #保存当前的模型参数
    torch.save(model, './model/cut_BP.bin')
    print('\nacc: ' + str(torch.tensor(acc).mean().item()) + '\tprec: ' + str(torch.tensor(prec).mean().item())
          +'\trec: ' + str(torch.tensor(rec).mean().item()) + '\tf1: ' + str(torch.tensor(f1).mean().item()))

  0%|          | 0/1 [00:00<?, ?it/s]

训练数据进度：29.64%	acc: 0.09090909361839294	prec: 0.032608695328235626	rec: 0.057692307978868484	f1: 0.04166666790843017

  0%|          | 0/1 [17:44<?, ?it/s]

训练数据进度：29.67%	acc: 0.4000000059604645	prec: 0.1666666716337204	rec: 0.125	f1: 0.1428571492433548.06839131563901901




ZeroDivisionError: division by zero

In [160]:
print(target_list[6])

tensor([0, 1, 1, 1, 2, 3, 3, 0, 2, 0, 2, 0, 2, 3, 0, 2, 0, 2, 3, 3, 3, 0, 2, 3,
        3, 3, 3, 0, 2, 0, 2, 0, 2, 0, 2, 3, 0, 2, 0, 2, 3, 0, 2, 3, 0, 2, 3, 0,
        2, 3, 0, 2, 0, 2, 0, 1, 1, 2, 0, 2, 0, 2, 0, 2, 3, 0, 2, 0, 2, 0, 2, 0,
        2, 3, 0, 2, 0, 2, 0, 2, 3, 3, 0, 2, 0, 1, 1, 2, 3, 3, 0, 1, 1, 2, 3, 3,
        0, 2, 0, 2, 3, 0, 2, 0, 2, 0, 2, 3, 0, 2, 0, 2, 3, 0, 2, 0, 1, 2, 0, 2,
        3, 0, 2, 3, 0, 1, 2, 3, 0, 2, 0, 1, 1, 2, 3, 0, 2, 0, 1, 1, 1, 2, 0, 2,
        0, 2, 3, 0, 2, 0, 2, 0, 2, 3, 0, 2, 3, 3, 0, 2, 3, 0, 2, 3, 0, 2, 3, 0,
        2, 0, 2, 3, 0, 2, 0, 2, 3], device='cuda:0')


In [170]:
# 将代码上传到CPU进行测试
device = torch.device("cpu")
model.to(device)
train_data_list = [i.to(device) for i in train_data_list]
target_list = [i.to(device) for i in target_list]
#对模型的测试，可以直观地看出分词效果
test_Split(model)

3,3,0,0,3,3,0,3,3,3,0,1,0,2,0,0,2,1,2,1,0,3,0,1,3,0,3,0,1,3,3,3,1,3,0,3,2,1,0,0,3,3,0,0,0,3,0,2,3,0,0,0,2,3,3,3,0,3,0,3,1,0,3,0,2,3,0,3,1,0,3,2,0,1,0,0,0,0,0,3,2,3,2,2,0,0,0,2,2,2,0,3,2,3,0,2,0,0,0,

[[0.23297426 0.23496735 0.26104796 0.2710104 ]
 [0.25520328 0.24752448 0.23167285 0.26559937]
 [0.2721794  0.20566845 0.26142666 0.2607255 ]
 [0.2721794  0.20566845 0.26142666 0.2607255 ]
 [0.24144779 0.24482967 0.25675753 0.25696504]
 [0.2530907  0.23891412 0.2426706  0.2653246 ]
 [0.2550116  0.25013378 0.24567948 0.2491752 ]
 [0.24758793 0.22256447 0.24551104 0.28433663]
 [0.25040168 0.21062246 0.24540725 0.29356858]
 [0.25307012 0.22387812 0.25282416 0.27022767]
 [0.27516353 0.25398266 0.22589725 0.24495658]
 [0.2471822  0.26378596 0.24364814 0.24538374]
 [0.34266287 0.23132461 0.253674   0.17233846]
 [0.26627287 0.26116478 0.26640537 0.20615698]
 [0.3164399  0.24557808 0.2449774  0.1930046 ]
 [0.27278227 0.2530443  0.24716082 0.22701263]
 [0.25496632 0.21964125 0.29120457 0.23418784]
 