In [95]:
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, TensorDataset
import random
import re
from torch.optim import Adam
from tqdm import tqdm
import numpy as np 

from transformers import BertTokenizer
# Shared WordPiece tokenizer for the 'bert-base-chinese' vocabulary; the
# Dataset class below uses this module-level object to encode every text.
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

In [105]:
import os
# Adjust the directory and the checkpoint file name below for your machine.
model_save_path = os.path.join('/Users/vivianruan/Downloads', 'model_ckpt_v4.pt')
# map_location='cpu' lets a checkpoint that was saved from a GPU run be loaded
# on a machine without CUDA; load_state_dict() later copies the values into the
# model's own parameters, so the model can still be moved to the GPU afterwards.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
checkpoint = torch.load(model_save_path, map_location='cpu')

In [110]:
# Point this at your own CSV file; the rest of the notebook reads it as df_test.
df = pd.read_csv(filepath_or_buffer="工作簿6.csv")
# Work on a copy so the frame as loaded stays available in df.
df_test = df.copy(deep=True)

In [107]:
# Category name -> integer class id (0-10). The position of a name in the list
# is its id; Dataset uses this mapping to encode the 'label1' column.
labels = dict(zip(
    [
        "操控",
        "动力",
        "空间",
        "内饰",
        "其他信息",
        "舒适性",
        "外观",
        "用车成本",
        "智能化配置",
        "安全",
        "充电",
    ],
    range(11),
))

class Dataset(torch.utils.data.Dataset):
    """Tokenized texts plus integer labels, ready for a torch DataLoader.

    Each row of the input frame supplies one text (column ``content``) and one
    category name (column ``label1``). Texts are encoded eagerly in
    ``__init__`` with the module-level ``tokenizer``; category names are
    converted to class ids through the module-level ``labels`` mapping.

    Note: the former ``texts()`` accessor method was removed. It could never
    be called on an initialised instance because ``__init__`` stores the
    encodings in an instance attribute of the same name, which shadows it.
    Read ``dataset.texts`` directly instead.
    """

    def __init__(self, df):
        """Encode every text of ``df`` and map every label name to its id.

        Args:
            df: frame with a ``content`` column (texts) and a ``label1``
                column (category names that are keys of ``labels``).

        Raises:
            KeyError: if ``label1`` contains a name missing from ``labels``.
        """
        # Fail early with a readable message instead of a bare KeyError that
        # shows only the first offending value.
        unknown = [name for name in pd.unique(df['label1']) if name not in labels]
        if unknown:
            raise KeyError(f"Unknown label(s) in column 'label1': {unknown}")

        # One encoding per text. `padding='max_length'` + `truncation=True`
        # replace the deprecated `pad_to_max_length=True` and make the
        # previously implicit truncation to 512 tokens explicit.
        # return_tensors='pt' yields tensors with a leading dimension of
        # size 1, which the evaluation loop squeezes out of `input_ids`.
        self.texts = [
            tokenizer.encode_plus(
                text=text,
                add_special_tokens=True,      # add `[CLS]` and `[SEP]`
                max_length=512,               # length to pad / truncate to
                padding='max_length',
                truncation=True,
                return_tensors='pt',
                return_attention_mask=True,
            )
            for text in df['content']
        ]

        self.labels = [labels[label] for label in df['label1']]

    def classes(self):
        """Return the list of integer class ids, one per sample."""
        return self.labels

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` as a numpy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the stored encoding(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return ``(encoding, label)`` for the sample(s) at ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y

In [108]:
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a 768 -> 11 linear layer and ReLU.

    The sub-module attribute names (``bert``, ``dropout``, ``linear``,
    ``relu``) determine the state_dict keys, so they must stay as they are for
    saved checkpoints to keep loading.

    NOTE(review): the ReLU is applied to the class scores themselves, which
    clamps negative scores to zero. It is left in place because existing
    checkpoints were presumably trained with it -- confirm before changing.
    """

    def __init__(self, dropout=0.1):
        """Build the layers; ``dropout`` is the drop probability."""
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 11)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        """Return the ReLU-activated class scores for a batch.

        Args:
            input_id: token ids passed to BERT as ``input_ids``.
            mask: attention mask passed to BERT as ``attention_mask``.
        """
        # With return_dict=False BERT returns a tuple; the second element is
        # the pooled output that feeds the classification head.
        encoder_outputs = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        pooled = encoder_outputs[1]
        scores = self.linear(self.dropout(pooled))
        return self.relu(scores)

In [109]:
# Rebuild the network with the same architecture the checkpoint was saved from.
model = BertClassifier()

# Training-side objects; none of them is used by the evaluation cells shown in
# this notebook.
criterion = nn.CrossEntropyLoss()
epoch = 4
optimizer = Adam(params=model.parameters(), lr=2e-5)

# Restore the fine-tuned weights. Kept as the last expression so the cell
# still displays the key-matching report.
model.load_state_dict(checkpoint)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [1]:
# Evaluate the restored model on df_test: report accuracy against the 'label1'
# column and store the predicted class id of every row.
test = Dataset(df_test)
test_dataloader = torch.utils.data.DataLoader(test, batch_size=32)
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

model = model.to(device)
# Switch to inference mode: without this, dropout stays active and the
# predictions are noisy and differ from run to run.
model.eval()

total_acc_test = 0
batch_predictions = []

with torch.no_grad():
    for test_input, test_label in tqdm(test_dataloader):
        test_label = test_label.to(device)
        mask = test_input['attention_mask'].to(device)
        # Drop the size-1 dimension each per-sample encoding carries.
        input_id = test_input['input_ids'].squeeze(1).to(device)
        output = model(input_id, mask)
        predicted = output.argmax(dim=1)
        total_acc_test += (predicted == test_label).sum().item()
        # Move to host memory first: numpy cannot read CUDA tensors directly.
        batch_predictions.append(predicted.cpu().numpy())

# The two leading zeros are placeholders kept on purpose: the cell that builds
# the result table drops the first two entries of this array. Concatenating
# once here replaces re-allocating the array on every batch.
new_test_array = np.concatenate([np.zeros([2,]), *batch_predictions], axis=0)

# Guard against an empty test frame instead of dividing by zero.
test_accuracy = total_acc_test / len(df_test) if len(df_test) else float("nan")
print(f'Test Accuracy: {test_accuracy: .3f}')
np.save("test_array.npy",new_test_array)

NameError: name 'Dataset' is not defined

In [86]:
# Predictions as a one-column frame. The first two rows are the placeholder
# zeros that new_test_array starts with, so they are sliced off; reset_index()
# then renumbers from 0 and keeps the old positions in an 'index' column.
eval_result = pd.DataFrame(new_test_array).iloc[2:].reset_index()

# Put each text next to its predicted class id (aligned on the row index),
# discard the helper 'index' column and give column 0 a readable name.
df_output = (
    pd.concat([df_test["content"], eval_result], axis=1)
    .drop(columns=["index"])
    .rename(columns={0: "label_tag"})
)

In [87]:
# Integer class id -> category name; a name's position in the list is its id.
labels_reverse = dict(enumerate([
    "操控",
    "动力",
    "空间",
    "内饰",
    "其他信息",
    "舒适性",
    "外观",
    "用车成本",
    "智能化配置",
    "安全",
    "充电",
]))

# Swap the numeric class ids in 'label_tag' for their category names, then
# display the resulting table.
df_output = df_output.replace({"label_tag": labels_reverse})
df_output

Unnamed: 0,content,label_tag
0,最满意】对于比亚迪唐这台车满意的地方还是比较多的，首先是油耗对于有这么大车型的SUV来说，油...,空间
1,最不满意】最不满意的地方主要有三个地方，第一，就是这款车它的后排的空间中间的那个地方凸起的特...,空间
2,最满意】换车的想法真是一瞬间的，我上班到家里的距离足足有50KM的，就名下就有一台是老款的千...,用车成本
3,最满意】这不是老车开着不是很安全的，就一直想买一台省油的车，新能源现在是越来越不错的，看了广...,内饰
4,最满意】满意的地方主要是比亚迪唐的配置和动力了，这是一款非常有实力的座驾，很多人商务办公就爱...,动力
...,...,...
1721,价格有绝对优势,用车成本
1722,座椅包裹性很好,舒适性
1723,超车提速也够用,动力
1724,"无论长途,短途",其他信息


In [88]:
# Write the labelled texts to disk; with the default settings the row index is
# written as the first column of the file.
df_output.to_csv(path_or_buf="test_result.csv")