# 0.导包

In [1]:
import pandas as pd
import numpy as np
import json,time
from  tqdm import tqdm
from sklearn.metrics import accuracy_score,classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler
from transformers import BertModel,BertConfig,BertTokenizer,AdamW,get_cosine_schedule_with_warmup

#参数
bert_path = 'bert_model/'   #预训练模型的位置
tokenizer = BertTokenizer.from_pretrained(bert_path)   #初始化分词器
max_len = 30     #数据阻断长度
BATCH_SIZE = 64
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 2

# 1.预处理数据

In [2]:
#1.1处理数据成input_ids,token_type_ids,attention_mask,label
def dataSet(data_path):
    input_ids,token_type_ids,attention_mask = [],[],[]
    labels = []
    with open(data_path,encoding='utf-8') as f:
        for i,line in tqdm(enumerate(f)):
            title,y = line.strip().split('\t')   #删除所有的空格，用\t分割数据集和标签
            #调用tokenizer转换成bert需要的数据格式
            encode_dict = tokenizer.encode_plus(text=title,max_length=max_len,padding='max_length',truncation=True)
            #分别获取三个值  目前值的类型为list
            input_ids.append(encode_dict['input_ids'])
            token_type_ids.append(encode_dict['token_type_ids'])
            attention_mask.append(encode_dict['attention_mask'])
            labels.append(int(y))
    #list转化成tensor格式
    input_ids,token_type_ids,attention_mask = torch.tensor(input_ids),torch.tensor(token_type_ids),torch.tensor(attention_mask)
    return input_ids,token_type_ids,attention_mask,labels

#1.2 dataloder批量处理
def dataLoader(input_ids,token_type_ids,attention_mask,labels):
    #tensor数据整合
    labels = torch.tensor(labels)
    data = TensorDataset(input_ids,token_type_ids,attention_mask,labels)
    loader = DataLoader(data,batch_size=BATCH_SIZE,shuffle=True)    #shuffle打乱每行数据的顺序
    return loader

#1.3实例化函数
#训练集带label
input_ids_train,token_type_ids_train,attention_mask_train,labels_train = dataSet('data/train.txt')
train_loader = dataLoader(input_ids_train,token_type_ids_train,attention_mask_train,labels_train)
#验证集带label
input_ids_dev,token_type_ids_dev,attention_mask_dev,labels_dev = dataSet('data/dev.txt')
dev_loader = dataLoader(input_ids_dev,token_type_ids_dev,attention_mask_dev,labels_dev)
#测试集 没有的话label放到dataloader
# input_ids_test,token_type_ids_test,attention_mask_test,labels_test = dataSet('data/test.txt')
# data = TensorDataset(input_ids_test,token_type_ids_test,attention_mask_test)
# sample = RandomSampler(data) #随机采样
# test_loader = DataLoader(data,sampler=sample,batch_size=BATCH_SIZE)
#测试集
input_ids_test,token_type_ids_test,attention_mask_test,labels_test = dataSet('data/test.txt')
test_loader = dataLoader(input_ids_test,token_type_ids_test,attention_mask_test,labels_test)
#得到后续用的数据为train_loader,dev_loader,test_loader

180000it [01:58, 1524.94it/s]
10000it [00:06, 1549.68it/s]
10000it [00:06, 1548.40it/s]


# 2.定义bert模型

In [3]:
class Bert_Model(nn.Module):
    def __init__(self,bert_path,classes=10):
        super(Bert_Model,self).__init__()
        self.config = BertConfig.from_pretrained(bert_path)
        self.bert = BertModel.from_pretrained(bert_path)
        for param in self.bert.parameters():
            param.requires_grad=True
        self.fc = nn.Linear(self.config.hidden_size,classes)  #直接分类
    def forward(self,input_ids,token_type_ids,attention_mask):
        output = self.bert(input_ids,token_type_ids,attention_mask)[1]  #池化后的输出,是向量
        logit = self.fc(output)    #全连接层,概率矩阵
        return logit

#实例化bert模型
model = Bert_Model(bert_path).to(DEVICE)

# 3.定义优化器和线性学习率

In [4]:
#优化器
optimizer = AdamW(model.parameters(),lr=2e-5,weight_decay=1e-4)  #使用Adam优化器
#设置学习率
schedule = get_cosine_schedule_with_warmup(optimizer,num_warmup_steps=len(train_loader),num_training_steps=EPOCHS*len(test_loader))

# 4. 定义训练函数和验证测试函数

In [5]:
#在验证集上评估模型性能的函数
def evaluate(model,data_loader,device):
    model.eval()   #防止模型训练改变权值
    val_true,val_pred = [],[]
    with torch.no_grad():     #计算的结构在计算图中,可以进行梯度反转等操作
        for idx,(ids,tpe,att,y) in enumerate(data_loader): #得到的y要转换一下数据格式
            y_pred = model(ids.to(device),tpe.to(device),att.to(device))  #此时得到的是概率矩阵
            y_pred = torch.argmax(y_pred,dim=1).detach().cpu().numpy().tolist()  #将概率矩阵转换成标签并变成list类型
            val_pred.extend(y_pred)   #将标签值放入列表
            val_true.extend(y.squeeze().cpu().numpy().tolist())   #将真实标签转换成list放在列表中
    
    return accuracy_score(val_true,val_pred)
#如果是比赛没有labels_test，那么这个函数for里面没有y，输出没有test_true，处理数据的时候没有labels_test放到dataloader里
def predict(model,data_loader,device):
    model.eval()
    test_pred,test_true = [],[]
    with torch.no_grad():
        for idx,(ids,tpe,att,y) in enumerate(data_loader):
            y_pred = model(ids.to(device),tpe.to(device),att.to(device))   #得到概率矩阵
            y_pred = torch.argmax(y_pred,dim=1).detach().cpu().numpy().tolist()  #将概率矩阵转化成标签值
            test_pred.extend(y_pred)
            test_true.extend(y.squeeze().cpu().numpy().tolist())
    return test_pred,test_true
#训练函数
def train_and_eval(model,train_loader,valid_loader,optimizer,schedule,device,epoch):
    best_acc = 0.0
    patience = 0
    criterion = nn.CrossEntropyLoss()       #损失函数
    for i in range(epoch):
        start = time.time()
        model.train()   #开始训练
        print("***************我是狗Running training epoch{}************".format(i+1))
        train_loss_sum = 0.0
        for idx,(ids,tpe,att,y) in enumerate(train_loader):
            ids,tpe,att,y = ids.to(device),tpe.to(device),att.to(device),y.to(device)
            y_pred = model(ids,tpe,att)   #加载模型获得概率矩阵
            loss = criterion(y_pred,y)    #计算损失
            optimizer.zero_grad()         #梯度清零
            loss.backward()               #反向传播
            optimizer.step()              #更新优化参数
            schedule.step()               #更新学习率
            train_loss_sum += loss.item()
            #只打印五次结果
            if(idx+1)%(len(train_loader)//5)==0:
                print("Epoch {:04d} | Step {:04d}/{:04d} | Loss {:.4f} | Time {:.4f}".format(
                i+1,idx+1,len(train_loader),train_loss_sum/(idx+1),time.time()-start))
        #每一次epoch输出一个准确率
        model.eval()
        acc = evaluate(model,valid_loader,device)     #验证模型的性能
        if acc > best_acc :
            best_acc = acc
            torch.save(model.state_dict(),"best_bert_model.pth")    #保存最好的模型
        print("current acc is {:.4f},best acc is {:.4f}".format(acc,best_acc))
        print("time costed = {}s \n".format(round(time.time()-start,5)))

# 5.开始训练

In [6]:
train_and_eval(model,train_loader,dev_loader,optimizer,schedule,DEVICE,EPOCHS)

***************我是狗Running training epoch1************
Epoch 0001 | Step 0562/2813 | Loss 1.2805 | Time 87.2074
Epoch 0001 | Step 1124/2813 | Loss 0.8091 | Time 173.7297
Epoch 0001 | Step 1686/2813 | Loss 0.6249 | Time 260.2511
Epoch 0001 | Step 2248/2813 | Loss 0.5288 | Time 346.5547
Epoch 0001 | Step 2810/2813 | Loss 0.4663 | Time 444.3679
current acc is 0.9337,best acc is 0.9337
time costed = 454.10747s 

***************我是狗Running training epoch2************
Epoch 0002 | Step 0562/2813 | Loss 0.1625 | Time 86.6217
Epoch 0002 | Step 1124/2813 | Loss 0.1633 | Time 173.0160
Epoch 0002 | Step 1686/2813 | Loss 0.1606 | Time 259.7801
Epoch 0002 | Step 2248/2813 | Loss 0.1590 | Time 360.3851
Epoch 0002 | Step 2810/2813 | Loss 0.1581 | Time 459.5913
current acc is 0.9418,best acc is 0.9418
time costed = 469.2521s 



# 6.加载最优模型进行测试

In [7]:
model.load_state_dict(torch.load("best_bert_model.pth"))
#得到预测标签和真实标签
test_pred,test_true= predict(model,test_loader,DEVICE)
#输出测试机的准确率
print("\n Test Accuracy = {} \n ".format(accuracy_score(test_true,test_pred)))
#打印各项验证指标
print(classification_report(test_true,test_pred,digits=4))
print(test_pred[:10])
print('------------------')
print(test_true[:10])


 Test Accuracy = 0.9492 
 
              precision    recall  f1-score   support

           0     0.9358    0.9470    0.9414      1000
           1     0.9427    0.9710    0.9567      1000
           2     0.9294    0.8950    0.9119      1000
           3     0.9691    0.9710    0.9700      1000
           4     0.9215    0.9270    0.9242      1000
           5     0.9354    0.9560    0.9456      1000
           6     0.9315    0.9240    0.9277      1000
           7     0.9861    0.9910    0.9885      1000
           8     0.9825    0.9540    0.9680      1000
           9     0.9589    0.9560    0.9574      1000

    accuracy                         0.9492     10000
   macro avg     0.9493    0.9492    0.9491     10000
weighted avg     0.9493    0.9492    0.9491     10000

[1, 9, 4, 3, 3, 6, 3, 5, 7, 0]
------------------
[1, 9, 4, 3, 3, 6, 3, 5, 7, 0]
