### 参考
https://pytorch.apachecn.org/docs/1.2/beginner/text_sentiment_ngrams_tutorial.html

In [4]:
import pandas as pd
import csv
import torch
import numpy as np
from transformers import BertTokenizer,BertForSequenceClassification,BertConfig,AdamW
from torch.utils.data import TensorDataset,DataLoader
from pyhanlp import *

In [5]:
bert_pre_model='./model/'                    #词表
# bert_config='C:/Users/Administrator/Downloads/chinese_L-12_H-768_A-12/bert_config.json'       #配置文件    
vocab_path = './model/vocab1.txt'

In [6]:
# 加载数据集
train_dataset = 'data/train.csv'
test_dataset = 'data/dev_id.csv'
        
def DataPrepare(file_name):
    data = pd.read_csv(file_name)
    data1 = pd.concat([data.question1,data.category],axis=1,keys=['question','label'])
    data2 = pd.concat([data.question2,data.category],axis=1,keys=['question','label'])
    data = pd.concat([data1,data2],axis=0,ignore_index=True)
    def labels_ch(x):
        if x == 'hypertension':
            x = 0
        if x == 'hepatitis':
            x = 1
        if x =='breast_cancer':
            x = 2
        if x=='aids':
            x = 3
        if x =='diabetes':
            x = 4
        return x
    data.label = [labels_ch(la) for la in data.label]
    return data

train_dataset = DataPrepare(train_dataset)
test_dataset = DataPrepare(test_dataset)

In [7]:
# 分词
def cut_sentents(sentents):
    HanLP.Config.ShowTermNature = False #不显示词性
    PerceptronSegmenter= JClass('com.hankcs.hanlp.model.perceptron.PerceptronSegmenter')
    segment = PerceptronSegmenter()
    # segment.enablePartOfSpeechTagging(True)   # 激活数字和英文识别
    for i,sen in enumerate(sentents):
        sentents[i] = segment.segment(sen)
        sentents[i] = list(sentents[i])
#         sentents[i].insert(0, '[CLS]')
#         sentents[i].append('[SEP]')
    return sentents

train_tokenized = cut_sentents(train_dataset.question)
test_tokenized = cut_sentents(test_dataset.question)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [8]:
train_tokenized[0]

['艾滋病', '窗口期', '会', '出现', '腹泻', '症状', '吗']

In [9]:
# 把当前数据中分好的词添加到 BERT词表中
sub_voc = []                      
for sen in train_tokenized:
    for voc in sen:
        if voc not in sub_voc: 
            sub_voc.append(voc)
# print(len(sub_voc))  # 7558               
for sen in test_tokenized:
    for voc in sen:
        if voc not in sub_voc: 
            sub_voc.append(voc)
# print(len(sub_voc))  # 9642
# sub_voc # 一维list

In [10]:
# 读取bert词表
vocab = pd.read_csv(vocab_path,sep='\t',header=None, quoting=csv.QUOTE_NONE)
vocab = vocab.values.tolist()

# 把当前数据中分好的词添加到 BERT词表中
for voc in sub_voc:
    if voc not in vocab: 
        vocab.insert(106, [voc])
# type(vocab) # pandas.core.frame.DataFrame
# vocab

In [11]:
tokenizer = BertTokenizer.from_pretrained(bert_pre_model,unk_token='[UNK]')
input_ids = tokenizer.encode(train_tokenized[0])  
input_ids

[9747, 9746, 10475, 9744, 9743, 9742, 11050]

In [12]:
vocab = pd.DataFrame(vocab)
vocab.to_csv("C:/Users/Administrator/Downloads/chinese_L-12_H-768_A-12/vocab.txt",index=False,header=False,sep="\t")

In [13]:
def data2feature(data):
    # Load pretrained model/tokenizer
    tokenizer = BertTokenizer.from_pretrained(bert_pre_model,unk_token='[UNK]')

    # Encode text
    input_ids = {}
    max_seq_length = 0
    for i,sen in enumerate(data):
        # Tokenized input,添加[CLS], [SEP]，并将 token 转为 vocabulary 索引
        input_ids[i] = tokenizer.encode(sen, add_special_tokens=True,max_length=20)  
        if max_seq_length < len(input_ids[i]): max_seq_length = len(input_ids[i])
    # unsqueeze(0)   
    print("max_seq_length:"+str(max_seq_length)+"(max_seq_length训练集最大值51),当前修改为20")
    max_seq_length = 20
    # Zero-pad up to the sequence length.
    def padding(input_ids,max_seq_length):
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            #  input_mask.append(0)
            #  segment_ids.append(0)
        return input_ids
    input = []
    for i,_ in enumerate(input_ids):
        input_ids[i] = padding(input_ids[i],max_seq_length)
        input.append(input_ids[i])
    input_ids = torch.LongTensor(input)
    return input_ids 

#训练集
train_inputs = data2feature(train_tokenized)[0:36]
train_labels = torch.tensor(train_dataset.label).reshape(-1,1)[0:36]  # Batch size 1

#测试集
test_inputs = data2feature(test_tokenized)[0:36]
test_labels = torch.tensor(test_dataset.label).reshape(-1,1)[0:36]  # Batch size 1

max_seq_length:20(max_seq_length训练集最大值51),当前修改为20
max_seq_length:20(max_seq_length训练集最大值51),当前修改为20


In [14]:
train_inputs[0]

tensor([  101,  9747,  9746, 10475,  9744,  9743,  9742, 11050,   102,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [15]:
#生成dataloader
batch_size = 5
train_data = TensorDataset(train_inputs, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_data = TensorDataset(test_inputs, test_labels)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
# print("train_inputs.shape()",train_inputs.shape)  # train_inputs.shape() torch.Size([3, 20])
# print("train_labels.shape()",train_labels.shape)  # train_labels.shape() torch.Size([3, 1])

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = BertForSequenceClassification.from_pretrained('bert-base-chinese',num_labels=5).to(device)
# modelConfig = BertConfig.from_pretrained('bert-base-uncased/bert_config.json')
model = BertForSequenceClassification.from_pretrained(bert_pre_model,num_labels=5).to(device)

In [17]:
# test
outputs = model(train_inputs, labels=train_labels)
loss, logits = outputs[:5] 

In [18]:
logits

tensor([[ 0.3311, -0.6933, -1.0277, -0.4949,  2.7394],
        [-0.0177, -0.9140, -1.0456, -0.4949,  2.9560],
        [ 0.1435, -0.8184, -0.8906, -0.6996,  2.9112],
        [ 0.0705, -0.7222, -1.1606, -0.7454,  3.0610],
        [ 0.3933, -0.3892, -0.9325, -0.5016,  2.5982],
        [ 0.1312, -0.6197, -1.2588, -0.5065,  2.7341],
        [ 0.1775, -0.4645, -1.2231, -0.5314,  2.5631],
        [-0.1035, -0.7987, -1.1863, -0.7085,  3.0671],
        [ 0.1237, -0.5449, -1.2022, -0.6488,  2.6694],
        [ 0.1547, -0.6624, -1.0702, -0.3292,  2.6155],
        [ 0.1051, -0.8972, -1.1437, -0.5451,  2.9720],
        [ 0.2926, -0.7395, -1.1611, -0.3686,  2.8957],
        [ 0.3396, -0.4224, -1.0556, -0.1200,  2.3630],
        [ 0.4811, -0.6804, -1.3525, -0.0648,  2.3938],
        [ 0.1913, -0.7646, -1.0654, -0.6881,  3.0653],
        [-0.0158, -0.7981, -1.1890, -0.6760,  3.2062],
        [ 0.2654, -0.5463, -1.0700, -0.5062,  2.6519],
        [ 0.1812, -0.8180, -1.1129, -0.5079,  2.9467],
        [ 

In [19]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(params=optimizer_grouped_parameters, 
                              lr=2e-5,correct_bias=False)

In [20]:
#定义一个计算准确率的函数
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [25]:
#训练开始
train_loss_set = []#可以将loss加入到列表中，后期画图使用
best_accuracy = 0
epochs = 2000
for epoch in range(epochs):
    if(epoch%100 == 0): print("Epoch: {}".format(epoch))
    #训练开始
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        optimizer.zero_grad()
        #取第一个位置，BertForSequenceClassification第一个位置是Loss，第二个位置是[CLS]的logits
        loss = model(b_input_ids, token_type_ids=None, labels=b_labels)[0]
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    if(epoch%100 == 0):  print("Train loss: {}".format(tr_loss / nb_tr_steps))
    #模型评估
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None)[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    accuracy = eval_accuracy / nb_eval_steps
    if(best_accuracy<accuracy):
        best_accuracy = accuracy
        model.save_pretrained('./directory/save/')  # 保存最高的accuracy
        # model = model_class.from_pretrained('./directory/save/')  # re-load
    if(epoch%100 == 0):  print("Test Accuracy: {}".format(accuracy))

Epoch: 0
Train loss: 1.2226341888308525
Test Accuracy: 0.175
Epoch: 100
Train loss: 0.47331657295580953
Test Accuracy: 0.45
Epoch: 200
Train loss: 0.5597378773745731
Test Accuracy: 0.45
Epoch: 300
Train loss: 0.2027156749754795
Test Accuracy: 0.625
Epoch: 400
Train loss: 0.3558836270676693
Test Accuracy: 0.45
Epoch: 500
Train loss: 0.21284987591207027
Test Accuracy: 0.625
Epoch: 600
Train loss: 0.08749916453234619
Test Accuracy: 0.675
Epoch: 700
Train loss: 0.08543706410273444
Test Accuracy: 0.675
Epoch: 800
Train loss: 0.08797759396838956
Test Accuracy: 0.675
Epoch: 900
Train loss: 0.08598874708695803
Test Accuracy: 0.625
Epoch: 1000
Train loss: 0.09064131304512557
Test Accuracy: 0.625
Epoch: 1100
Train loss: 0.07875517905631568
Test Accuracy: 0.675
Epoch: 1200
Train loss: 0.4921140979663505
Test Accuracy: 0.45
Epoch: 1300
Train loss: 0.48169442109792726
Test Accuracy: 0.45
Epoch: 1400
Train loss: 0.4672973936685594
Test Accuracy: 0.45
Epoch: 1500
Train loss: 0.5014086491255512
Test A

In [28]:
# 加载已保存的准确率最高的模型
model = BertForSequenceClassification.from_pretrained('./directory/save/')  # re-load
# logits = model(test_inputs)
#模型评估
model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0
for batch in test_dataloader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_labels = batch
    with torch.no_grad():
        logits = model(b_input_ids, token_type_ids=None)[0]
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()
    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
accuracy = eval_accuracy / nb_eval_steps
if(best_accuracy<accuracy):
    best_accuracy = accuracy
    model.save_pretrained('./directory/save/')  # 保存最高的accuracy
    # model = model_class.from_pretrained('./directory/save/')  # re-load
#if(epoch%100 == 0):  
print("Test Accuracy: {}".format(accuracy))

Test Accuracy: 0.7999999999999999
