In [1]:
max_len = 32

In [2]:
import pickle as pkl
import re
from tqdm import tqdm
import numpy as np
import time

def build_train_data(file_path):   #file_path = 'data/train.txt'
    with open(file_path,'r',encoding='utf-8') as f:
        lines = f.read()
    
    # 将读取到的文本按照标点符号和换行符进行切分，得到一个列表
    lines = re.split(r"[，。！？、（）—《》…；“”\n]",lines)

    # 去掉列表中的空字符串及空格
    lines = [line.strip() for line in lines if line.strip()]

    # 去掉每个句子中的空格
    lines = [line.replace(" ","") for line in lines]
    
    # 去掉所有长度大于max_len的句子
    lines = [line for line in lines if len(line)<=max_len]

    phrase_expel = lines

    print("phrase_expel:",phrase_expel[:10])
    print("phrase_expel len:",len(phrase_expel))

    with open('data/generate_pkl/train_data.pkl', 'wb') as f: #把这个处理后的文件当作训练数据
        pkl.dump(phrase_expel, f)   #把文件写成pkl格式

In [3]:
build_train_data("data/sighan.txt")# 创建原始数据文件，没有进行分词

phrase_expel: ['迈向充满希望的新世纪', '一九九八年新年讲话', '附图片１张', '中共中央总书记', '国家主席江泽民', '一九九七年十二月三十一日', '１２月３１日', '中共中央总书记', '国家主席江泽民发表１９９８年新年讲话', '迈向充满希望的新世纪']
phrase_expel len: 159671


In [4]:
def build_target(file_path):  #生成目标文件,生成字母文件
    with open(file_path,'r',encoding='utf-8') as f:
        tmp = f.read()
    
    # 将读取到的文本按照标点符号和换行符进行切分，得到一个列表
    tmp = re.split(r"[，。！？、（）—《》…；“”\n]",tmp)

    # 去掉列表中的空字符串
    tmp = [line.strip() for line in tmp if line.strip()]

    # 去掉每个句子首尾的空格
    tmp = list(map(lambda x:x.strip(),tmp))

    print("tmp:",tmp[:10]) # 打印前10个句子

    sum_list = []
    for i in tmp:
        sum_ = ''
        for j in i.split():  #以空格为分割，一个词一个词的提取
            if len(j) == 1: #如果词的长度为1 ，就标记为S -single
                sum_ += 'S'
                continue
            else:
                sum_ += 'B' #如果长度不为1，标记为一个词的开始 begin
                for k in range(1, len(j)):
                    if k == len(j) - 1: #如果是这个词的最后一个，就标记为end
                        sum_ += 'E'
                    else:
                        sum_ += 'M'  #其他情况就是middle
        if len(sum_) <= max_len:
            # 如果句子长度小于等于max_len，就用"N"填充
            sum_ += "N"*(max_len-len(sum_))
            sum_list.append(sum_)

    print("sum_list:",sum_list[:10]) # 打印前10个句子
    print("sum_list len:",len(sum_list))

    with open('data/generate_pkl/target.pkl', 'wb') as f:
        pkl.dump(sum_list, f)

In [5]:
build_target('data/sighan.txt')# 这个文件数据进行分词了

tmp: ['迈向  充满  希望  的  新  世纪', '一九九八年  新年  讲话', '附  图片  １  张', '中共中央  总书记', '国家  主席  江泽民', '一九九七年  十二月  三十一日', '１２月  ３１日', '中共中央  总书记', '国家  主席  江  泽民  发表  １９９８年  新年  讲话', '迈向  充满  希望  的  新  世纪']
sum_list: ['BEBEBESSBENNNNNNNNNNNNNNNNNNNNNN', 'BMMMEBEBENNNNNNNNNNNNNNNNNNNNNNN', 'SBESSNNNNNNNNNNNNNNNNNNNNNNNNNNN', 'BMMEBMENNNNNNNNNNNNNNNNNNNNNNNNN', 'BEBEBMENNNNNNNNNNNNNNNNNNNNNNNNN', 'BMMMEBMEBMMENNNNNNNNNNNNNNNNNNNN', 'BMEBMENNNNNNNNNNNNNNNNNNNNNNNNNN', 'BMMEBMENNNNNNNNNNNNNNNNNNNNNNNNN', 'BEBESBEBEBMMMEBEBENNNNNNNNNNNNNN', 'BEBEBESSBENNNNNNNNNNNNNNNNNNNNNN']
sum_list len: 159671


In [6]:
def build_vocab_dict(file_path):  #'data/train_data.pkl'
    vocab_dic = {}
    with open(file_path, 'rb') as f:
        z = pkl.load(f)
        for line in z:
            for hang in line:  #统计词频，按照词多到少排列
                vocab_dic[hang] = vocab_dic.get(hang, 0) + 1
        vocab_dic_sorted = sorted(vocab_dic.items(), key=lambda x: x[1], reverse=True)

    # 按照词频排序后，构建词典，词频越高，索引越小，并且下标从1开始
    vocab_dic2 = {}
    for i, j in enumerate(vocab_dic_sorted):
        vocab_dic2[j[0]] = i + 1

    # 展示前10个词
    print("vocab_dic2:",list(vocab_dic2.items())[:10])
    print("vocab_dic2 len:",len(vocab_dic2))

    with open('data/generate_pkl/vocab.pkl', 'wb') as f:
        pkl.dump(vocab_dic2, f)

In [7]:
build_vocab_dict('data/generate_pkl/train_data.pkl')# 统计词频

vocab_dic2: [('的', 1), ('一', 2), ('国', 3), ('在', 4), ('中', 5), ('人', 6), ('了', 7), ('是', 8), ('１', 9), ('和', 10)]
vocab_dic2 len: 4665


In [8]:
# pytorch实现双层BP神经网络实现功能
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle as pkl
from tqdm import tqdm

``` python
#BP神经网络

class Model(nn.Module):

    def __init__(self, output_size, vocab_size, embed_dim,hout1,hout2):
        super(Model, self).__init__()
        #把每一个字都表示为embed_dim维的字向量
        # self.input = nn.input(vocab_size, embed_dim)  # 输入层
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        #隐藏层为全连接层
        self.hid_layer1 = nn.Linear(embed_dim, hout1)
        self.hid_layer2 = nn.Linear(hout1, hout2)
        self.out_layer = nn.Linear(hout2, output_size)

    #self指的是上面的初始化模型参数，in_layer指的是待分词的句子的张量表示
    def forward(self, in_layer):
        #将in_layer张量中的每一个元素（字的序号）都变成一个embed_dim维的张量
        emd = self.embedding(in_layer)
        #将每个字从一个embed_dim维变为hout1维的向量，神经元的出现（w，b）
        h_out1 = self.hid_layer1(emd)
        #将每个字从一个hout1维变为hout2维的向量，神经元的增加（w‘，b’）
        h_out2 = self.hid_layer2(h_out1)
        #非线性变换
        out_ = F.relu(h_out2)
        #将hout2维变为output_size维
        out_ = self.out_layer(out_)
        #每一个字都会得到到一个 为BMES的概率，最大的即为所预测的
        out_ = F.softmax(out_, dim=1)
        return out_
```


In [9]:
class Config(object):
    # 参数设置类，包含一些相关参数
    def __init__(self):
        self.vocab = pkl.load(open('data/generate_pkl/vocab.pkl', 'rb'))  # 读取词表
        self.train_data = pkl.load(open('data/generate_pkl/train_data.pkl', 'rb'))  # 读取训练数据
        self.target = pkl.load(open('data/generate_pkl/target.pkl', 'rb'))  # 读取标签

        self.learning_rate = 0.0015  # 学习率
        self.epoch = 4  # epoch次数
        self.dropout = 0.6 # dropout
        self.max_len = 32

        self.output_size = 4
        self.embed_dim = 128
        self.hidden_dim = 64
        self.hout1 = 32
        self.hout2 = 64

        self.num_layers = 2 # 测试双层LSTM神经网络

        print("---------创建参数类完成---------")
        print("# 词表大小：", len(self.vocab))
        print("# 训练数据大小：", len(self.train_data))
        print("# 标签大小：", len(self.target))
        print("# 输出大小：", self.output_size)
        print("# 嵌入层大小：", self.embed_dim)
        print("# 隐藏层大小：", self.hidden_dim)
        print("# 第一层输出大小：", self.hout1)
        print("# 第二层输出大小：", self.hout2)
        print("# 学习率：", self.learning_rate)
        print("# epoch次数：", self.epoch)
        print("# dropout层：", self.dropout)
        print("# 每个LSTM中循环次数：", self.num_layers)
        print("-------------------------------")

In [10]:
class BiLSTM_Model(nn.Module):
    def __init__(self, vocab_size, embedding_size, hidden_size):
        super(BiLSTM_Model, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.bilstm1 = nn.LSTM(embedding_size, hidden_size, bidirectional=True, batch_first=True)
        self.dropout1 = nn.Dropout(0.6)
        self.bilstm2 = nn.LSTM(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)
        self.dropout2 = nn.Dropout(0.6)
        self.fc = nn.Linear(hidden_size * 2, 5)
        
    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.bilstm1(x)
        x = self.dropout1(x)
        x, _ = self.bilstm2(x)
        x = self.dropout2(x)
        x = self.fc(x)
        return x

```python
def model_eval(model_out, true_label):
    # confusion_matrix = torch.zeros([2, 2], dtype=torch.long)
    predict_label = torch.argmax(model_out, 1)
    accuracy = []
    precision = []
    recall = []
    f_1 = []
    for l in range(4):
        tp_num, fp_num, fn_num, tn_num = 0, 0, 0, 0
        for p, t in zip(predict_label, true_label):
            if p == t and t == l:
                tp_num += 1
            if p == l and t != l:
                fp_num += 1
            if p != l and p != t:
                fn_num += 1
            if p != l and p == t:
                tn_num += 1
        accuracy.append((tp_num + tn_num) / (tp_num + tn_num + fp_num + fn_num))
        try:
            prec = tp_num / (tp_num + fp_num)
        except:
            prec = 0.0
        try:
            rec = tp_num / (tp_num + fn_num)
        except:
            rec = 0
        precision.append(prec)
        recall.append(rec)
        if prec == 0 and rec == 0:
            f_1.append(0)
        else:
            f_1.append((2 * prec * rec) / (prec + rec))
    ave_acc = torch.tensor(accuracy, dtype=torch.float).mean()
    ave_prec = torch.tensor(precision, dtype=torch.float).mean()
    ave_rec = torch.tensor(recall, dtype=torch.float).mean()
    ave_f1 = torch.tensor(f_1, dtype=torch.float).mean()
    return ave_acc, ave_prec, ave_rec, ave_f1
```


#建议注释掉这一个函数，因为test_  有可能会和内置函数重名了，改个名字也行~
def test_Split(model_):
    text = '在一九九八年来临之际，我十分高兴地通过中央人民广播电台、中国国际广播电台和中央电视台，向全国各族人民，向香港特别行政区同胞、澳门和台湾同胞、海外侨胞，向世界各国的朋友们，致以诚挚的问候和良好的祝愿！'
    hang_ = []
    for wd in text:
        # print(wd) # test
        hang_.append(Config().vocab[wd])
    test_tensor = torch.tensor(hang_, dtype=torch.long)
    res = model_(test_tensor)
    res = res.detach().numpy()
    [print(np.argmax(r), end=",") for r in res]
    print("\n")
    print(res)

In [11]:
#设置参数的起点
torch.manual_seed(1)
config = Config()
voc_size = len(config.vocab)
print("voc_size:", voc_size)

---------创建参数类完成---------
# 词表大小： 4665
# 训练数据大小： 159671
# 标签大小： 159671
# 输出大小： 4
# 嵌入层大小： 128
# 隐藏层大小： 64
# 第一层输出大小： 32
# 第二层输出大小： 64
# 学习率： 0.0015
# epoch次数： 4
# dropout层： 0.6
# 每个LSTM中循环次数： 2
-------------------------------
voc_size: 4665


In [12]:
train_data_list = []
for lin in config.train_data:
    hang = []
    for word in lin:
        hang.append(config.vocab[word])
    # 如果句子长度小于max_len，则在句子后面补0，使得句子长度等于max_len
    if len(hang) < config.max_len:
        hang.extend([0] * (config.max_len - len(hang)))
    # 如果句子长度大于max_len，则截取句子，使得句子长度等于max_len
    else:
        hang = hang[:config.max_len]
    
    #将列表类型转变为张量类型
    train_data_list.append(torch.tensor(hang, dtype=torch.long))
print(train_data_list[:1])
print(len(train_data_list))

[tensor([1259,  181,  644,  537,  557,  430,    1,   33,  183,  378,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])]
159671


In [13]:
target_dict = {'B': 0, # begin
               'M': 1, # middle
               'E': 2, # end
               'S': 3, # single
               'N': 4} # null

In [14]:
# 将标签转换为one-hot编码
target_list = []
for lin in config.target:
    hang = []
    for word in lin:
        hang.append(target_dict[word])
    target_list.append(torch.tensor(hang, dtype=torch.long))
print(target_list[:1])
print(len(target_list))

[tensor([0, 2, 0, 2, 0, 2, 3, 3, 0, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4])]
159671


In [15]:
print(len(train_data_list), len(target_list))

159671 159671


In [16]:
print(train_data_list[0].shape, target_list[0].shape)

torch.Size([32]) torch.Size([32])


In [17]:
model = BiLSTM_Model(voc_size + 1, config.embed_dim, config.hidden_dim)
print(model)

# 定义损失函数和优化器
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

BiLSTM_Model(
  (embedding): Embedding(4666, 128)
  (bilstm1): LSTM(128, 64, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.6, inplace=False)
  (bilstm2): LSTM(128, 64, batch_first=True, bidirectional=True)
  (dropout2): Dropout(p=0.6, inplace=False)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)


In [20]:
# 将数据和模型上传到GPU进行计算
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)
train_data_list = [i.to(device) for i in train_data_list]
target_list = [i.to(device) for i in target_list]

cuda:0


In [19]:
# 开始训练
for epoch in range(config.epoch):
    print("Epoch:", epoch + 1)
    for i in tqdm(range(len(train_data_list))):
        model.zero_grad()
        input_data = train_data_list[i].view(1, -1)
        target = target_list[i].view(-1)
        output = model(input_data)
        loss = loss_function(output.view(-1, 5), target)
        loss.backward()
        optimizer.step()
    print("Loss:", loss.item())
    # # 打印这个epoch的准确率
    # with torch.no_grad():
    #     right_num = 0
    #     all_num = 0
    #     for i in tqdm(range(len(train_data_list))):
    #         input_data = train_data_list[i].view(1, -1)
    #         target = target_list[i].view(-1)
    #         output = model(input_data)
    #         output = torch.argmax(output.view(-1, 5), dim=1)
    #         for j in range(len(output)):
    #             if output[j] == target[j]:
    #                 right_num += 1
    #             all_num += 1
    #     print("Accuracy:", right_num / all_num)

Epoch: 1


100%|██████████| 159671/159671 [09:22<00:00, 283.75it/s]


Loss: 0.033838141709566116
Epoch: 2


100%|██████████| 159671/159671 [09:30<00:00, 279.99it/s]


Loss: 0.056796845048666
Epoch: 3


100%|██████████| 159671/159671 [09:45<00:00, 272.75it/s]


Loss: 0.02790956199169159
Epoch: 4


100%|██████████| 159671/159671 [10:43<00:00, 248.16it/s]

Loss: 0.053907278925180435





In [21]:
# 保存模型
torch.save(model.state_dict(), 'model/bilstm_model.pkl')

# 加载模型
model.load_state_dict(torch.load('model/bilstm_model.pkl'))
model.eval()

BiLSTM_Model(
  (embedding): Embedding(4666, 128)
  (bilstm1): LSTM(128, 64, batch_first=True, bidirectional=True)
  (dropout1): Dropout(p=0.6, inplace=False)
  (bilstm2): LSTM(128, 64, batch_first=True, bidirectional=True)
  (dropout2): Dropout(p=0.6, inplace=False)
  (fc): Linear(in_features=128, out_features=5, bias=True)
)

In [22]:
# 测试模型
test_data = ["我将坚持解放思想", "抓住机遇和挑战", "中国人民将满怀信心地开创新的业绩"]
test_data = list(test_data)
print(test_data)

['我将坚持解放思想', '抓住机遇和挑战', '中国人民将满怀信心地开创新的业绩']


In [23]:
# 将测试数据转换为张量
test_data_list = []
for lin in test_data:
    hang = []
    for word in lin:
        if word in config.vocab:
            hang.append(config.vocab[word])
        else:
            hang.append(0)
    if len(hang) < config.max_len:
        hang.extend([0] * (config.max_len - len(hang)))
    else:
        hang = hang[:config.max_len]
    test_data_list.append(torch.tensor(hang, dtype=torch.long))
print(test_data_list)

[tensor([ 45, 137, 401, 200, 208, 258, 350, 306,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0]), tensor([ 590,  410,  104,  931,   10, 1145,  240,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0]), tensor([   5,    3,    6,   28,  137,  537, 1009,  273,  140,   21,   60,  330,
          33,    1,   18,  869,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])]


In [24]:
# 将测试数据上传到GPU
test_data_list = [i.to(device) for i in test_data_list]

In [26]:
# 开始测试
for i in range(len(test_data_list)):
    input_data = test_data_list[i].view(1, -1)
    output = model(input_data)
    print(output)
    output = torch.argmax(output, dim=-1)
    print("".join(test_data[i]))
    print(output)
    print("".join([['B', 'M', 'E', 'S', 'N'][i] for i in output.tolist()[0]]))

tensor([[[-16.8229, -20.6023, -17.7427, -10.7415, -55.7657],
         [-14.9595, -16.3279, -13.3404,  -8.5430, -49.6099],
         [ -4.6078, -10.0039, -11.0393,  -8.2218, -64.9451],
         [ -9.3567,  -7.0702,  -2.3992,  -6.4332, -54.9583],
         [ -5.0555,  -9.6652, -10.4883,  -8.1819, -63.8043],
         [ -3.1552,  -0.4953,   0.6513,  -2.0261, -36.6928],
         [  0.8011,  -0.3517,  -2.3397,  -2.0062, -42.6504],
         [-14.6302, -13.3675,  -5.7910,  -9.0262, -54.6771],
         [ -3.2042,  -4.9487,  -1.8632,  -0.9067,  41.4675],
         [ -1.5748,  -4.6515,  -1.6616,   0.3782,  53.3997],
         [  0.3558,  -4.0154,  -0.6866,   1.8483,  58.4570],
         [  1.5319,  -3.6111,   0.1255,   2.6160,  61.3798],
         [  1.5740,  -3.7074,   0.3166,   2.5654,  61.5844],
         [  1.6607,  -3.5812,   0.4877,   2.6384,  61.9165],
         [  1.6827,  -3.5536,   0.5278,   2.6602,  62.0225],
         [  1.6902,  -3.5449,   0.5403,   2.6679,  62.0599],
         [  1.6937,  -3.