# BERT实现网购评论的对象分类

In [1]:
import re
import json
import jieba
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib_inline import backend_inline
from sklearn.model_selection import train_test_split
import torch
from torch import nn
from torch.utils.data import Dataset
from transformers import BertTokenizer,BertModel
from torch.optim import Adam

## 数据读取与预处理

In [2]:
# Load the review CSV and keep only the first 60,000 rows; the trailing
# expression previews the first rows when run as a notebook cell.
df = pd.read_csv("online_shopping_10_cats.csv").iloc[:60000]
df.head()

Unnamed: 0,cat,label,review
0,书籍,1,﻿做父母一定要有刘墉这样的心态，不断地学习，不断地进步，不断地给自己补充新鲜血液，让自己保持...
1,书籍,1,作者真有英国人严谨的风格，提出观点、进行论述论证，尽管本人对物理学了解不深，但是仍然能感受到...
2,书籍,1,作者长篇大论借用详细报告数据处理工作和计算结果支持其新观点。为什么荷兰曾经县有欧洲最高的生产...
3,书籍,1,作者在战几时之前用了＂拥抱＂令人叫绝．日本如果没有战败，就有会有美军的占领，没胡官僚主义的延...
4,书籍,1,作者在少年时即喜阅读，能看出他精读了无数经典，因而他有一个庞大的内心世界。他的作品最难能可贵...


In [3]:
# Build the category <-> index lookup tables, then encode each row's
# category name as its integer index.
class2idx = {
    '书籍': 0,
    '平板': 1,
    '手机': 2,
    '水果': 3,
    '洗发水': 4,
    '热水器': 5,
    '蒙牛': 6,
    '衣服': 7,
    '计算机': 8,
    '酒店': 9,
}
# Reverse mapping (index -> category name).
idx2class = dict(zip(class2idx.values(), class2idx.keys()))
# One integer label per row of df['cat']; an unknown category raises KeyError.
class_idx = [class2idx[category] for category in df['cat'].values]
class2idx

{'书籍': 0,
 '平板': 1,
 '手机': 2,
 '水果': 3,
 '洗发水': 4,
 '热水器': 5,
 '蒙牛': 6,
 '衣服': 7,
 '计算机': 8,
 '酒店': 9}

## BERT微调实现网购评论的对象分类

In [4]:
# Dataset class (built on torch.utils.data.Dataset) that pairs each sentence
# with its label.
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(text, label)`` pairs.

    The base class is spelled out as ``torch.utils.data.Dataset`` because this
    class reuses the name ``Dataset``; inheriting from the bare name would make
    a re-executed notebook cell subclass its own previous definition.

    Args:
        x: Indexable, sized collection of texts.
        y: Indexable, sized collection of labels, one per entry of ``x``.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length.
    """

    def __init__(self, x, y):
        # Fail early: a length mismatch would otherwise surface later as an
        # IndexError mid-epoch, or silently ignore the surplus labels.
        if len(x) != len(y):
            raise ValueError(
                f'x and y must have the same length, got {len(x)} and {len(y)}'
            )
        self.dataset_x = x
        self.dataset_y = y

    def __len__(self):
        """Return the number of samples."""
        return len(self.dataset_x)

    def __getitem__(self, i):
        """Return the ``(text, label)`` pair stored at index ``i``."""
        return self.dataset_x[i], self.dataset_y[i]

In [5]:
# Hold out 20% of the reviews for validation (fixed random seed), then wrap
# each split's sentences and labels in a Dataset.
train_x, valid_x, train_y, valid_y = train_test_split(
    df['review'].values,
    class_idx,
    test_size=0.2,
    random_state=22,
)
train_set = Dataset(train_x, train_y)
valid_set = Dataset(valid_x, valid_y)
print(f'train_set长度为:{len(train_set)}')
print(f'valid_set长度为:{len(valid_set)}')

train_set长度为:48000
valid_set长度为:12000


In [6]:
# Load the vocabulary and tokenizer of the 'bert-base-chinese' checkpoint.
token = BertTokenizer.from_pretrained('bert-base-chinese')


# Collate function telling the data loaders how to encode sentences and labels.
def collate_fn(data):
    """Encode a batch of ``(text, label)`` samples into BERT input tensors.

    Args:
        data: Sequence of ``(text, label)`` pairs, as produced by indexing the
            dataset.

    Returns:
        Tuple ``(input_ids, attention_mask, token_type_ids, labels)``. The
        first three come from the tokenizer, padded/truncated to 200 tokens
        per text; ``labels`` is a ``torch.LongTensor`` built from the sample
        labels.
    """
    sents = [sample[0] for sample in data]
    labels = [sample[1] for sample in data]

    # Call the tokenizer directly: `batch_encode_plus` is deprecated in favour
    # of `__call__`. Every text is padded/truncated to exactly 200 tokens, so
    # all batches share one shape.
    encoded = token(sents,
                    truncation=True,
                    padding='max_length',
                    max_length=200,
                    return_tensors='pt')

    # input_ids: the token ids produced by the encoding.
    # attention_mask: 0 at padded positions, 1 everywhere else.
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    token_type_ids = encoded['token_type_ids']
    labels = torch.LongTensor(labels)

    return input_ids, attention_mask, token_type_ids, labels


# Wrap the training and validation sets in DataLoaders.
# Training batches are reshuffled every epoch and the last partial batch is
# dropped, so every optimisation step sees a full batch of 256 samples.
train_loader = torch.utils.data.DataLoader(dataset=train_set,
                                     batch_size=256,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)
# Validation must score every held-out sample: keep the last partial batch
# (drop_last=False) and skip shuffling, which has no effect on the metrics.
valid_loader = torch.utils.data.DataLoader(dataset=valid_set,
                                     batch_size=256,
                                     collate_fn=collate_fn,
                                     shuffle=False,
                                     drop_last=False)

In [7]:
# Load the pretrained BERT encoder.
pretrained = BertModel.from_pretrained('bert-base-chinese')

# Freeze every encoder weight: only the downstream fully connected layer is
# trained, so no gradients are needed for BERT itself.
pretrained.requires_grad_(False)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# Downstream model for the review-category classification task.
class Model(torch.nn.Module):
    """Linear classification head on top of the module-level ``pretrained`` encoder.

    The encoder is looked up as a global rather than stored on the module, so
    the only parameters owned by this class are those of ``fc``.
    """

    def __init__(self):
        super().__init__()
        # 768 input features (size of the encoder output fed to the head),
        # 10 outputs (one score per category).
        self.fc = torch.nn.Linear(768, 10)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one row of 10 class scores per input sequence."""
        encoder_inputs = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
        }
        # The encoder runs without gradient tracking.
        with torch.no_grad():
            encoded = pretrained(**encoder_inputs)

        # Position 0 of the last hidden state is the [CLS] token, used for classification.
        cls_vector = encoded.last_hidden_state[:, 0]
        return self.fc(cls_vector)


# Instantiate the downstream model.
model = Model()

In [9]:
# Plot configuration helpers.
def use_svg_display():
    """Select SVG as the figure format for inline matplotlib output."""
    figure_format = 'svg'
    backend_inline.set_matplotlib_formats(figure_format)
def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG output and set matplotlib's default figure size.

    Args:
        figsize: Value stored under the ``'figure.figsize'`` rcParam.
    """
    use_svg_display()
    plt.rcParams.update({'figure.figsize': figsize})

In [10]:
# Trainer class: runs training and validation, records per-epoch loss and
# accuracy for plotting, and saves the model with the best validation accuracy.
class Train:
    """Training/validation driver for a model called with BERT-style inputs.

    Each batch produced by the loaders must be a tuple
    ``(input_ids, attention_mask, token_type_ids, labels)``; the model is
    called with the first three as keyword arguments.

    Args:
        max_epochs: Number of passes over the training loader.
        loss_function: Callable ``loss_function(outputs, labels)`` returning a
            scalar tensor.
        optimizer: Optimizer that updates the model's trainable parameters.
        model: Model to train; it is moved to ``device``.
        device: Device on which batches and the model are placed.
    """

    def __init__(self, max_epochs, loss_function, optimizer, model, device='cpu'):
        self.max_epochs = max_epochs
        self.device = device
        self.loss_function = loss_function
        self.optimizer = optimizer
        self.model = model.to(device)
        # Histories exist from construction so show_loss_acc_value() can be
        # called even before (or after an interrupted) start_train().
        self.loss_train_list = []
        self.loss_valid_list = []
        self.accurary_rate_train = []
        self.accurary_rate_valid = []

    def _run_epoch(self, loader, training):
        """Run one pass over ``loader``; return ``(mean batch loss, accuracy)``.

        Parameters are updated only when ``training`` is true. Accuracy is the
        fraction of correctly predicted samples, rounded to 4 decimals.
        """
        sample_count = 0
        correct_count = 0
        loss_sum = 0.0
        batch_count = 0
        for input_ids, attention_mask, token_type_ids, labels in loader:
            input_ids = input_ids.to(self.device)
            attention_mask = attention_mask.to(self.device)
            token_type_ids = token_type_ids.to(self.device)
            labels = labels.to(self.device)
            t_hat = self.model(input_ids=input_ids,
                               attention_mask=attention_mask,
                               token_type_ids=token_type_ids)
            loss_ = self.loss_function(t_hat, labels)
            if training:
                self.optimizer.zero_grad()
                loss_.backward()
                self.optimizer.step()
            sample_count += input_ids.shape[0]
            correct_count += (t_hat.argmax(dim=1) == labels).sum().item()
            loss_sum += loss_.item()
            batch_count += 1
        if batch_count == 0:
            raise ValueError('data loader produced no batches')
        return loss_sum / batch_count, round(correct_count / sample_count, 4)

    def start_train(self, trainloader, validloader=None, val_idx=None,
                    save_path='Bert_best.pth'):
        """Train for ``max_epochs`` epochs, optionally validating along the way.

        Args:
            trainloader: Sized iterable of training batches.
            validloader: Iterable of validation batches, or ``None`` to skip
                validation.
            val_idx: Validate after every ``val_idx``-th epoch; ``None`` skips
                validation.
            save_path: File the whole model is saved to (``torch.save``) each
                time validation accuracy improves; ``None`` disables saving.

        Raises:
            ValueError: If ``val_idx`` is given but not positive, or a loader
                produces no batches.
        """
        if val_idx is not None and val_idx <= 0:
            raise ValueError('val_idx must be a positive integer or None')
        self.trainloader = trainloader
        self.validloader = validloader
        self.max_iter = len(trainloader)
        self.loss_train_list = []
        self.loss_valid_list = []
        self.accurary_rate_train = []
        self.accurary_rate_valid = []
        # Always set, so the epoch loop never reads a missing attribute.
        self.val_idx = val_idx
        validate = validloader is not None and val_idx is not None
        # Number of validation passes that will happen over the whole run.
        self.max_valid_num = self.max_epochs // val_idx if validate else 0
        # Tracked across ALL epochs: resetting it per epoch would overwrite the
        # checkpoint after every validation pass.
        best_valid_accuracy = 0
        print('Start Training!')
        for epoch in range(self.max_epochs):
            self.model.train()
            loss, accurary_rate = self._run_epoch(self.trainloader, training=True)
            self.loss_train_list.append(loss)
            self.accurary_rate_train.append(accurary_rate)
            print('Train_set Epoch [{}/{}] loss: {}, acc: {}'.format(epoch + 1, self.max_epochs, loss, accurary_rate))
            if not validate or (epoch + 1) % val_idx != 0:
                continue
            valid_num = (epoch + 1) // val_idx
            self.model.eval()
            print('Start Validation!')
            with torch.no_grad():
                loss, accurary_rate = self._run_epoch(self.validloader, training=False)
            # One entry per validation pass, so loss and accuracy curves line up.
            self.loss_valid_list.append(loss)
            self.accurary_rate_valid.append(accurary_rate)
            print('Valid_set Step [{}/{}] loss: {}, acc: {}'.format(valid_num, self.max_valid_num, loss, accurary_rate))
            print('Stop Validation!')
            if accurary_rate > best_valid_accuracy:
                best_valid_accuracy = accurary_rate
                if save_path is not None:
                    torch.save(self.model, save_path)
                    # Reported only when a checkpoint was actually written.
                    print('best_model has been saved!')

    def show_loss_acc_value(self):
        """Plot the recorded train (and, if any, validation) loss and accuracy."""
        set_figsize(figsize=(4, 3))
        plt.plot(range(len(self.accurary_rate_train)), self.accurary_rate_train, 'r-', linewidth=1, label='Train_acc')
        plt.plot(range(len(self.loss_train_list)), self.loss_train_list, 'b-', linewidth=1, label='Train_loss')
        if self.loss_valid_list:
            plt.plot(range(len(self.accurary_rate_valid)), self.accurary_rate_valid, 'y-', linewidth=1, label='Valid_acc')
            plt.plot(range(len(self.loss_valid_list)), self.loss_valid_list, 'g-', linewidth=1, label='Valid_loss')
        plt.title('loss_acc_curve')
        plt.xlabel('train_iter_steps')
        plt.ylabel('loss_acc')
        plt.legend()
        plt.ylim(0, 1)
        plt.show()

In [11]:
# Training configuration: epoch count, loss function, optimizer and device.
# The encoder and the downstream model are moved to the chosen device before
# the trainer is built.
max_epochs = 5
loss_function = torch.nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
# Use device index 0 when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device(0)
else:
    device = torch.device('cpu')
pretrained = pretrained.to(device)
model = model.to(device)
train = Train(max_epochs, loss_function, optimizer, model, device=device)

In [12]:
# Start training with a validation pass after every epoch (val_idx=1);
# model saving is handled inside Train.start_train.
train.start_train(
    trainloader=train_loader,
    validloader=valid_loader,
    val_idx=1,
)

Start Training!


KeyboardInterrupt: 

In [None]:
# Plot the loss and accuracy values recorded by the trainer.
train.show_loss_acc_value()