In [1]:
!pip install transformers



In [2]:
! pip install torch



In [2]:
import numpy as np
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [1]:
import pandas as pd
df = pd.read_csv("工作簿7.csv")

In [2]:
# average length of texts
df["content"].str.len().mean()

61.17102137767221

In [3]:
# maximum length of texts
df["content"].str.len().max()

231

In [4]:
labels = {
    "操控": 0,
    "动力": 1,
    "空间": 2,
    "内饰": 3,
    "其他信息": 4,
    "舒适性": 5,
    "外观": 6,
    "用车成本": 7,
    "智能化配置": 8
}

In [5]:
import torch

In [6]:
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import numpy as np
import pandas as pd
import random
import re

In [7]:
# import 
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [9]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['subject']]
        self.texts = [tokenizer(text, 
                                padding='max_length', 
                                max_length = 256, 
                                truncation=True,
                                return_tensors="pt") 
                      for text in df['content']]

    def classes(self):
        return self.labels
    
    def texts(self):
        return self.texts

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [10]:
df_new = df

In [11]:
np.random.seed(111)
df_train, df_val, df_test = np.split(df_new.sample(frac=1, random_state=111), 
                                     [int(.7*len(df_new)), int(.9*len(df_new))])

print(len(df_train),len(df_val), len(df_test))

294 84 43


In [12]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 10)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer

In [13]:
from torch.optim import Adam
from tqdm import tqdm

EPOCHS = 6
model = BertClassifier()
LR = 1e-5

# 通过Dataset类获取训练和验证集
train, val = Dataset(df_train), Dataset(df_val)

# DataLoader根据batch_size获取数据 
train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=False)  
val_dataloader = torch.utils.data.DataLoader(val, batch_size=32, shuffle=False)

# 判断是否使用GPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=LR)

if use_cuda:
    model = model.cuda()
    criterion = criterion.cuda()

# 开始进入训练循环
for epoch_num in range(EPOCHS):
    # 定义两个变量，用于存储训练集的准确率和损失
    total_acc_train = 0
    total_loss_train = 0
    train_array = np.zeros([2,])
      # 进度条函数tqdm
    for train_input, train_label in tqdm(train_dataloader):
        train_label = train_label.to(device)
        
        mask = train_input['attention_mask'].to(device)
        input_id = train_input['input_ids'].squeeze(1).to(device)
        # 通过模型得到输出
        output_train = model(input_id, mask)
        # 计算损失
        batch_loss = criterion(output_train, train_label)
        total_loss_train += batch_loss.item()
        # 计算精度
        acc = (output_train.argmax(dim=1) == train_label).sum().item()
        train_array = np.concatenate([train_array,output_train.argmax(dim=1)],axis = 0)
        total_acc_train += acc
        # 模型更新
        model.zero_grad()
        batch_loss.backward()
        optimizer.step()
            # ------ 验证模型 -----------
            # 定义两个变量，用于存储验证集的准确率和损失
    total_acc_val = 0
    total_loss_val = 0
    val_array = np.zeros([2,])
      # 不需要计算梯度
    with torch.no_grad():
                # 循环获取数据集，并用训练好的模型进行验证
        for val_input, val_label in tqdm(val_dataloader):
            val_label = val_label.to(device)
            
            mask = val_input['attention_mask'].to(device)
            
            input_id = val_input['input_ids'].squeeze(1).to(device)
            output_val = model(input_id, mask)
                        
            batch_loss = criterion(output_val, val_label)
            total_loss_val += batch_loss.item()
            
            acc = (output_val.argmax(dim=1) == val_label).sum().item()
            
            val_array = np.concatenate([val_array,output_val.argmax(dim=1)],axis = 0)
            
            total_acc_val += acc
            
        
    print(
        f'''Epochs: {epoch_num + 1} 
        | Train Loss: {total_loss_train / len(df_train): .3f} 
        | Train Accuracy: {total_acc_train / len(df_train): .3f} 
        | Val Loss: {total_loss_val / len(df_val): .3f} 
        | Val Accuracy: {total_acc_val / len(df_val): .3f}''')  
    
np.save("train_array.npy",train_array)
np.save("val_array.npy",val_array)


Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|█████████████████████████████████████████| 275/275 [34:46<00:00,  7.59s/it]
100%|███████████████████████████████████████████| 79/79 [02:57<00:00,  2.2

Epochs: 1 
        | Train Loss:  0.039 
        | Train Accuracy:  0.630 
        | Val Loss:  0.023 
        | Val Accuracy:  0.774


100%|███████████████████████████████████████| 275/275 [1:00:29<00:00, 13.20s/it]
100%|███████████████████████████████████████████| 79/79 [09:26<00:00,  7.17s/it]


Epochs: 2 
        | Train Loss:  0.021 
        | Train Accuracy:  0.771 
        | Val Loss:  0.020 
        | Val Accuracy:  0.767


100%|███████████████████████████████████████| 275/275 [1:33:32<00:00, 20.41s/it]
100%|███████████████████████████████████████████| 79/79 [02:57<00:00,  2.24s/it]


Epochs: 3 
        | Train Loss:  0.017 
        | Train Accuracy:  0.798 
        | Val Loss:  0.021 
        | Val Accuracy:  0.747


100%|█████████████████████████████████████████| 275/275 [35:17<00:00,  7.70s/it]
100%|███████████████████████████████████████████| 79/79 [03:04<00:00,  2.33s/it]


Epochs: 4 
        | Train Loss:  0.015 
        | Train Accuracy:  0.821 
        | Val Loss:  0.023 
        | Val Accuracy:  0.737


100%|█████████████████████████████████████████| 275/275 [44:53<00:00,  9.79s/it]
100%|███████████████████████████████████████████| 79/79 [02:58<00:00,  2.26s/it]


Epochs: 5 
        | Train Loss:  0.012 
        | Train Accuracy:  0.844 
        | Val Loss:  0.026 
        | Val Accuracy:  0.727


100%|█████████████████████████████████████████| 275/275 [35:25<00:00,  7.73s/it]
100%|███████████████████████████████████████████| 79/79 [02:58<00:00,  2.26s/it]

Epochs: 6 
        | Train Loss:  0.010 
        | Train Accuracy:  0.878 
        | Val Loss:  0.029 
        | Val Accuracy:  0.721





In [None]:
len(new_array)

In [13]:
test = Dataset(df_test)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=16)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

if use_cuda:
    model = model.cuda()

total_acc_test = 0
new_test_array = np.zeros([2,])

with torch.no_grad():
    for test_input, test_label in tqdm(test_dataloader):
        test_label = test_label.to(device)
        mask = test_input['attention_mask'].to(device)
        input_id = test_input['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)
        acc = (output.argmax(axis = 1) == test_label).sum().item()
        total_acc_test += acc   
            
        new_test_array = np.concatenate([new_test_array,output.argmax(dim=1)],axis = 0)
            
print(f'Test Accuracy: {total_acc_test / len(df_test): .3f}')
np.save("test_array.npy",new_test_array)


KeyError: 'subject'

In [20]:
df["subject"].value_counts()

动力     3454
价格     1634
油耗     1379
操控     1302
舒适性    1182
配置     1075
安全性     736
内饰      669
外观      606
空间      535
Name: subject, dtype: int64

In [19]:
from sklearn.metrics import multilabel_confusion_matrix as mcm, classification_report
cm_labels = ['动力','价格','油耗','操控','舒适性','配置', '安全性','内饰','外观','空间']
print(classification_report(test.classes(),new_test_array[2:], target_names=cm_labels))

              precision    recall  f1-score   support

          动力       0.76      0.80      0.78       315
          价格       0.82      0.84      0.83       166
          油耗       0.84      0.82      0.83       148
          操控       0.56      0.55      0.55       149
         舒适性       0.62      0.53      0.57       107
          配置       0.72      0.70      0.71       109
         安全性       0.68      0.51      0.58        75
          内饰       0.52      0.75      0.62        71
          外观       0.55      0.51      0.53        51
          空间       0.60      0.57      0.58        67

    accuracy                           0.70      1258
   macro avg       0.67      0.66      0.66      1258
weighted avg       0.70      0.70      0.70      1258

