In [16]:
import pandas as pd
import numpy as np
import json,time
from tqdm import tqdm
from sklearn.metrics import accuracy_score,classification_report
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset,DataLoader,RandomSampler,SequentialSampler
from transformers import BertModel,BertConfig,BertTokenizer,AdamW,get_cosine_schedule_with_warmup
import warnings
warnings.filterwarnings('ignore')
bert_path = 'D:/Code/bert-first-test/bert-model/'
tokenizer = BertTokenizer.from_pretrained(bert_path)

# Preprocess the dataset: every line of news.txt is "<integer label> <title>".
input_ids, input_masks, input_types = [], [], []
labels = []
maxlen = 30  # each title is padded / truncated to exactly this many tokens

with open('D:/Code/bert-first-test/news.txt', 'r', encoding='utf-8') as f:
    data = f.readlines()

# Wrapping the list itself (rather than enumerate(data)) lets tqdm read its
# length, so the progress bar shows a total and an ETA.
for line in tqdm(data):
    # A blank line (typically a trailing newline at the end of the file)
    # cannot be unpacked into (label, title); skip it instead of crashing.
    if not line.strip():
        continue

    # Split on the first space only: the title itself may contain spaces.
    y, title = line.split(sep=' ', maxsplit=1)

    encode_dict = tokenizer.encode_plus(
        text=title,
        add_special_tokens=True,
        max_length=maxlen,
        padding="max_length",
        truncation=True,
    )

    input_ids.append(encode_dict['input_ids'])         # vocabulary indices of the tokens
    input_masks.append(encode_dict['attention_mask'])  # which positions take part in self-attention
    input_types.append(encode_dict['token_type_ids'])  # sentence (segment) id, 0 or 1
    labels.append(int(y))

input_ids, input_types, input_masks = np.array(input_ids), np.array(input_types), np.array(input_masks)
labels = np.array(labels)
print(input_ids.shape, input_types.shape, input_masks.shape, labels.shape)

# Shuffle the sample indices with a fixed seed so the split is reproducible.
idxes = np.arange(input_ids.shape[0])  # one index per text
np.random.seed(2022)
np.random.shuffle(idxes)

# First 620000 shuffled samples -> train, next 80000 -> validation, rest -> test.
train_idx = idxes[:620000]
valid_idx = idxes[620000:700000]
test_idx = idxes[700000:]
_split_indices = (train_idx, valid_idx, test_idx)

input_ids_train, input_ids_valid, input_ids_test = (input_ids[part] for part in _split_indices)
input_masks_train, input_masks_valid, input_masks_test = (input_masks[part] for part in _split_indices)
input_type_train, input_type_valid, input_type_test = (input_types[part] for part in _split_indices)
y_train, y_valid, y_test = (labels[part] for part in _split_indices)

print(input_ids_train.shape, y_train.shape)
print(idxes[:10])


BATCH_SIZE = 64


def _as_long_dataset(*arrays):
    """Convert each array to a LongTensor and bundle them into one TensorDataset."""
    return TensorDataset(*(torch.LongTensor(arr) for arr in arrays))


# Training set: batches are drawn in random order.
train_data = _as_long_dataset(input_ids_train, input_masks_train, input_type_train, y_train)
train_sampler = RandomSampler(train_data)
train_loader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)

# Validation set: batches are drawn in sequential order.
valid_data = _as_long_dataset(input_ids_valid, input_masks_valid, input_type_valid, y_valid)
valid_sampler = SequentialSampler(valid_data)
valid_loader = DataLoader(valid_data, sampler=valid_sampler, batch_size=BATCH_SIZE)

# Test set: inputs only (no label tensor), sequential order.
test_data = _as_long_dataset(input_ids_test, input_masks_test, input_type_test)
test_sampler = SequentialSampler(test_data)
test_loader = DataLoader(test_data, sampler=test_sampler, batch_size=BATCH_SIZE)


#定义bert 模型
from turtle import forward
import torch
import torch.nn as nn


class Bert_Model(nn.Module):
    """Pretrained BERT encoder followed by one linear classification layer."""

    def __init__(self, bert_path, classes=10):
        """Load the checkpoint found at ``bert_path`` and add a ``classes``-way head."""
        super().__init__()
        # Hyper-parameters of the pretrained checkpoint (hidden size is read below).
        self.config = BertConfig.from_pretrained(bert_path)
        # Pretrained encoder weights.
        self.bert = BertModel.from_pretrained(bert_path)
        # Multi-class head: a single linear layer; more layers could be stacked here.
        self.fc = nn.Linear(self.config.hidden_size, classes)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return the classification logits, shape [batch_size, classes]."""
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Element 1 of the encoder output is the pooled output, [batch_size, hidden_size].
        pooled = encoder_outputs[1]
        return self.fc(pooled)

# Helper used when instantiating the BERT model: reports its parameter counts.
def get_parameter_number(model):
    """Return a summary string with the model's total and trainable parameter counts."""
    total_num = 0
    trainable_num = 0
    for param in model.parameters():
        size = param.numel()
        total_num += size
        # Only parameters that receive gradients count as trainable.
        if param.requires_grad:
            trainable_num += size
    return f'Total parameters:{total_num},Trainable parameters:{trainable_num}'

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
EPOCHS = 20
model = Bert_Model(bert_path).to(DEVICE)
print(get_parameter_number(model))

# Optimizer: PyTorch's AdamW replaces the deprecated transformers.AdamW.
# eps=1e-6 keeps the epsilon the transformers implementation used by default.
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4, eps=1e-6)

# The learning rate warms up linearly during the first epoch and then decays
# along a cosine curve. Warmup is required; without it training may not converge.
steps_per_epoch = len(train_loader)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=steps_per_epoch,
    num_training_steps=EPOCHS * steps_per_epoch,
)
                                            
#评估模型 在验证集上
from statistics import mode


def evaluate(model, data_loader, device):
    """Return the accuracy of ``model`` over every batch of ``data_loader``.

    Each batch must be a 4-tuple ``(input_ids, attention_mask,
    token_type_ids, labels)``. The model is switched to eval mode and
    gradients are disabled for the whole pass.
    """
    model.eval()
    val_true, val_pred = [], []
    with torch.no_grad():
        for ids, att, tpe, y in data_loader:
            logits = model(ids.to(device), att.to(device), tpe.to(device))
            # Predicted class = index of the largest logit in each row.
            val_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
            # reshape(-1) always yields a 1-D tensor. squeeze() would turn a
            # final batch of size 1 into a 0-d tensor whose tolist() is a bare
            # int, and list.extend() would then raise TypeError.
            val_true.extend(y.reshape(-1).cpu().tolist())

    return accuracy_score(val_true, val_pred)

# Test batches carry no labels, so predictions are produced for submission.

def predict(model, data_loader, device):
    """Return the predicted class index for every sample in ``data_loader``.

    Each batch must be a 3-tuple ``(input_ids, attention_mask,
    token_type_ids)``. The model is switched to eval mode and gradients are
    disabled for the whole pass.
    """
    model.eval()
    val_pred = []
    with torch.no_grad():
        # Wrap the loader itself: tqdm can then read its length and show a
        # total / ETA, which it cannot do for an enumerate() generator.
        for ids, att, tpe in tqdm(data_loader):
            logits = model(ids.to(device), att.to(device), tpe.to(device))
            # Predicted class = index of the largest logit in each row.
            val_pred.extend(torch.argmax(logits, dim=1).cpu().tolist())
    return val_pred


def train_and_eval(model, train_loader, valid_loader, optimizer, scheduler, device, epoch):
    """Train ``model`` for ``epoch`` epochs, validating after each one.

    After every epoch the accuracy on ``valid_loader`` is computed with
    ``evaluate``; whenever it beats the best value so far, the model's
    state dict is written to ``best_bert_model.pth``. ``scheduler`` is
    stepped once per batch, right after the optimizer.

    Training batches must be 4-tuples ``(input_ids, attention_mask,
    token_type_ids, labels)``.
    """
    best_acc = 0.0
    criterion = nn.CrossEntropyLoss()
    num_batches = len(train_loader)
    # Report progress about five times per epoch. max(1, ...) keeps the modulo
    # below valid when the loader has fewer than five batches (the integer
    # division would otherwise be 0 and raise ZeroDivisionError).
    log_every = max(1, num_batches // 5)

    for i in range(epoch):
        # ---- training pass ----
        start = time.time()
        model.train()
        print(" Running training epoch {}".format(i+1))
        train_loss_sum = 0.0
        for idx, (ids, att, tpe, y) in enumerate(train_loader):
            ids, att, tpe, y = ids.to(device), att.to(device), tpe.to(device), y.to(device)
            y_pred = model(ids, att, tpe)
            loss = criterion(y_pred, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()  # the learning-rate schedule advances per batch

            train_loss_sum += loss.item()
            if (idx + 1) % log_every == 0:
                # The loss shown is the running mean over the epoch so far.
                print("Epoch {:04d} | Step {:04d}/{:04d} | Loss{:.4f} | Time {:.4f}".format(
                    i+1, idx+1, num_batches, train_loss_sum/(idx+1), time.time()-start
                ))

        # ---- validation pass (evaluate() switches the model to eval mode) ----
        acc = evaluate(model, valid_loader, device)

        if acc > best_acc:
            best_acc = acc
            torch.save(model.state_dict(), "best_bert_model.pth")
        print("current acc is {:.4f},best acc is {:.4f}".format(acc, best_acc))
        print("time costed = {}s \n".format(round(time.time()-start, 5)))



# Start fine-tuning with the objects configured above.
train_and_eval(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    device=DEVICE,
    epoch=EPOCHS,
)


778955it [02:32, 5107.09it/s]


(778955, 30) (778955, 30) (778955, 30) (778955,)
(620000, 30) (620000,)
[628746 745702 605667 118057 179863  36038 769851 133383 510526  34092]


Some weights of the model checkpoint at D:/Code/bert-first-test/bert-model/ were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Total parameters:102275338,Trainable parameters:102275338
 Running training epoch 1
Epoch 0001 | Step 1937/9688 | Loss0.8135 | Time 586.6170
Epoch 0001 | Step 3874/9688 | Loss0.5141 | Time 1179.1316
Epoch 0001 | Step 5811/9688 | Loss0.4036 | Time 1767.4159
Epoch 0001 | Step 7748/9688 | Loss0.3445 | Time 2355.9335
Epoch 0001 | Step 9685/9688 | Loss0.3077 | Time 2925.7020
current acc is 0.9490,best acc is 0.9490
time costed = 3033.80965s 

 Running training epoch 2
Epoch 0002 | Step 1937/9688 | Loss0.1382 | Time 569.8791
Epoch 0002 | Step 3874/9688 | Loss0.1385 | Time 1139.6238
Epoch 0002 | Step 5811/9688 | Loss0.1370 | Time 1711.1569
Epoch 0002 | Step 7748/9688 | Loss0.1358 | Time 2280.8896
Epoch 0002 | Step 9685/9688 | Loss0.1346 | Time 2850.6463
current acc is 0.9577,best acc is 0.9577
time costed = 2958.86644s 

 Running training epoch 3
Epoch 0003 | Step 1937/9688 | Loss0.0981 | Time 569.7505
Epoch 0003 | Step 3874/9688 | Loss0.0997 | Time 1139.4834
Epoch 0003 | Step 5811/9688 | Los