In [22]:
import pandas as pd
# from pyhanlp import *
import torch
import numpy as np
from transformers import BertTokenizer,BertForSequenceClassification,AdamW
from torch.utils.data import TensorDataset,RandomSampler,DataLoader
from torch.utils.data.sampler import SequentialSampler

In [5]:
# 加载数据集
train_dataset = 'data/train.csv'
test_dataset = 'data/dev_id.csv'
        
def DataPrepare(file_name):
    data = pd.read_csv(file_name)
    data1 = pd.concat([data.question1,data.category],axis=1,keys=['question','label'])
    data2 = pd.concat([data.question2,data.category],axis=1,keys=['question','label'])
    data = pd.concat([data1,data2],axis=0,ignore_index=True)
    def labels_ch(x):
        if x == 'hypertension':
            x = 0
        if x == 'hepatitis':
            x = 1
        if x =='breast_cancer':
            x = 2
        if x=='aids':
            x = 3
        if x =='diabetes':
            x = 4
        return x
    data.label = [labels_ch(la) for la in data.label]
    return data

train_dataset = DataPrepare(train_dataset)
test_dataset = DataPrepare(test_dataset)

In [20]:
def data2feature(data):
    # Load pretrained model/tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',unk_token='[UNK]')
    # tokenizer = BertTokenizer.from_pretrained('./directory/save/')  # re-load

    # Encode text
    input_ids = {}
    max_seq_length = 0
    for i,sen in enumerate(data):
        # Tokenized input,添加[CLS], [SEP]，并将 token 转为 vocabulary 索引
        input_ids[i] = tokenizer.encode(sen, add_special_tokens=True,max_length=20)  
        if max_seq_length < len(input_ids[i]): max_seq_length = len(input_ids[i])
    # unsqueeze(0)   
    print("max_seq_length:"+str(max_seq_length)+"(max_seq_length训练集最大值75),当前修改为20")
    max_seq_length = 20
    # Zero-pad up to the sequence length.
    def padding(input_ids,max_seq_length):
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            #  input_mask.append(0)
            #  segment_ids.append(0)
        return input_ids
    input = []
    for i,_ in enumerate(input_ids):
        input_ids[i] = padding(input_ids[i],max_seq_length)
        input.append(input_ids[i])
    input_ids = torch.LongTensor(input)
    # tokenizer.save_pretrained('./directory/save/')  # save
    # tokenizer = BertTokenizer.from_pretrained('./directory/save/')  # re-load
    return input_ids 

#训练集
train_inputs = data2feature(train_dataset.question)[0:36]
train_labels = torch.tensor(train_dataset.label).reshape(-1,1)[0:36]  # Batch size 1

#测试集
test_inputs = data2feature(test_dataset.question)[0:36]
test_labels = torch.tensor(test_dataset.label).reshape(-1,1)[0:36]  # Batch size 1

max_seq_length:20(max_seq_length训练集最大值75),当前修改为20
max_seq_length:20(max_seq_length训练集最大值75),当前修改为20


In [7]:
# # 因为取了两条数据，数据中label未按顺序填充，修改label
# train_labels[0][0]=0
# train_labels[1][0]=1
# train_labels[2][0]=2

In [8]:
#生成dataloader
batch_size = 5
train_data = TensorDataset(train_inputs, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, batch_size=batch_size)
test_data = TensorDataset(test_inputs, test_labels)
testtest_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
# print("train_inputs.shape()",train_inputs.shape)  # train_inputs.shape() torch.Size([3, 20])
# print("train_labels.shape()",train_labels.shape)  # train_labels.shape() torch.Size([3, 1])

In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained('bert-base-chinese',num_labels=5).to(device)

In [10]:
outputs = model(train_inputs, labels=train_labels)
loss, logits = outputs[:5] 

In [11]:
logits

tensor([[ 0.2837, -0.3541,  0.1774,  0.8675,  0.2638],
        [ 0.0450, -0.4117,  0.5731,  0.7860,  0.4932],
        [ 0.3496, -0.1941,  0.3044,  0.8262, -0.0797],
        [ 0.1567, -0.3271,  0.2776,  0.8463,  0.2944],
        [ 0.1246, -0.4623,  0.5355,  0.6910,  0.3097],
        [ 0.2761, -0.5173,  0.2268,  0.4589,  0.4879],
        [ 0.2309, -0.3387,  0.2222,  0.5878,  0.2339],
        [ 0.1289, -0.2991,  0.2546,  0.6653,  0.2744],
        [ 0.2402, -0.4049,  0.3141,  0.7018,  0.3727],
        [-0.0131, -0.4350,  0.5031,  0.7122,  0.3124],
        [ 0.2771, -0.4721,  0.0620,  0.4655,  0.5370],
        [ 0.3839, -0.4130, -0.0105,  0.5837,  0.2695],
        [ 0.2063, -0.4219,  0.1917,  0.6721,  0.2647],
        [ 0.0145, -0.3661,  0.4515,  0.8103,  0.4419],
        [ 0.2349, -0.4967,  0.3600,  0.8605,  0.1619],
        [ 0.0855, -0.3589,  0.3730,  0.8709,  0.4646],
        [ 0.0685, -0.3638,  0.2832,  0.8250,  0.4209],
        [ 0.2662, -0.5054,  0.0727,  0.6761,  0.3440],
        [ 

In [23]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

optimizer = AdamW(params=optimizer_grouped_parameters, 
                              lr=2e-5,correct_bias=False)
# optimizer = BertAdam(optimizer_grouped_parameters,
#                      lr=2e-5,
#                      warmup=.1)



In [13]:
#定义一个计算准确率的函数
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [24]:
#训练开始
train_loss_set = []#可以将loss加入到列表中，后期画图使用
epochs = 10
for epoch in range(epochs):
    print("Epoch: {}".format(epoch))
    #训练开始
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    for step, batch in enumerate(train_dataloader):
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        optimizer.zero_grad()
        #取第一个位置，BertForSequenceClassification第一个位置是Loss，第二个位置是[CLS]的logits
        loss = model(b_input_ids, token_type_ids=None, labels=b_labels)[0]
        train_loss_set.append(loss.item())
        loss.backward()
        optimizer.step()

        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1
    print("Train loss: {}".format(tr_loss / nb_tr_steps))
    #模型评估
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for batch in test_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids, token_type_ids=None)[0]
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1
    print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps))

Epoch: 0
Train loss: 0.10502016427926719
Test Accuracy: 0.7999999999999999
Epoch: 1
Train loss: 0.1546789885032922
Test Accuracy: 0.95
Epoch: 2
Train loss: 0.07703826483339071
Test Accuracy: 0.9249999999999999
Epoch: 3
Train loss: 0.01590970327379182
Test Accuracy: 0.9249999999999999
Epoch: 4
Train loss: 0.007465500006219372
Test Accuracy: 0.8999999999999999
Epoch: 5
Train loss: 0.0039281596400542185
Test Accuracy: 0.8999999999999999
Epoch: 6
Train loss: 0.0024813308991724625
Test Accuracy: 0.8999999999999999
Epoch: 7
Train loss: 0.002139362259185873
Test Accuracy: 0.8999999999999999
Epoch: 8
Train loss: 0.001958146160177421
Test Accuracy: 0.8999999999999999
Epoch: 9
Train loss: 0.0017257512590731494
Test Accuracy: 0.8999999999999999


In [27]:
print("test_inputs[0]:",test_inputs[0].reshape(-1,20).shape) 
print("test_labels[0]:",test_labels[0:2])
logits = model(test_inputs[0:2])

test_inputs[0]: torch.Size([1, 20])
test_labels[0]: tensor([[2],
        [3]])


In [28]:
logits 

(tensor([[-1.6879, -1.6426,  4.6078, -1.7010, -0.7756],
         [-1.0576, -0.4844, -0.8272,  6.0423, -1.0487]],
        grad_fn=<AddmmBackward>),)

In [29]:
# model.save_pretrained('./directory/save/')  # save

#改在训练的时候，判断准确最高的保存 

# model = model_class.from_pretrained('./directory/save/')  # re-load
# tokenizer.save_pretrained('./directory/save/')  # save
# tokenizer = BertTokenizer.from_pretrained('./directory/save/')  # re-load

In [40]:
21%10

1