In [1]:
"""导入数据集"""
# Read the training CSV and keep its first 1000 rows as a working sample.
import pandas as pd

data = pd.read_csv("../dataset/train.csv")
data_part = data.head(n=1000)

# Only the last expression of a cell is rendered, so the `text` column is
# merely accessed here while the `label` column is what gets displayed.
data_part.text
data_part.label

0       2
1      18
2      20
3       2
4       6
       ..
995    13
996     1
997    11
998     2
999    16
Name: label, Length: 1000, dtype: int64

In [2]:
"""Download/load the pretrained BERT tokenizer and masked-LM model."""
from transformers import BertModel, BertTokenizer
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Both objects come from the same "bert-base-cased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
bert = AutoModelForMaskedLM.from_pretrained("bert-base-cased")

print("-------finish load----------")


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


-------finish load----------


In [3]:
"""
    Preprocessing template.
    Raw text must go through the tokenizer to become input that BERT accepts.
"""
example_text = "I will watch Memento tonight"
bert_input = tokenizer(
    example_text,
    padding='max_length',   # pad every sequence up to max_length
    max_length=10,          # fixed sequence length used for this demo
    truncation=True,        # cut off anything longer than max_length
    return_tensors='pt',    # return PyTorch tensors
)

# Printed in order, one tensor per line:
#   input_ids      - token ids; each id can be decoded back to its token
#   token_type_ids - which sequence each token belongs to
#   attention_mask - 1 for real tokens, 0 for positions added by padding
print(
    bert_input['input_ids'],
    bert_input['token_type_ids'],
    bert_input['attention_mask'],
    sep='\n',
)



tensor([[  101,   146,  1209,  2824,  2508, 26173,  3568,   102,     0,     0]])
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0]])


In [15]:
import torch
from torch import nn
import numpy as np
from sklearn.model_selection import train_test_split
from torch.optim import Adam
from tqdm import tqdm

In [11]:
"""没啥用,方便简化代码"""
class DataSet(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenized texts with their labels.

    Every text is tokenized once, eagerly, in ``__init__`` using the
    module-level ``tokenizer`` (padded/truncated to 512 tokens, returned as
    PyTorch tensors). Items are served as ``(encoding, label)`` pairs.

    Args:
        ds_label: Iterable of labels, one per text.
        ds_texts: Iterable of raw text strings.

    Raises:
        ValueError: If the number of labels and texts differ.
    """

    def __init__(self ,ds_label,ds_texts):
        labels = list(ds_label)
        texts = list(ds_texts)
        # Fail early instead of silently pairing texts with the wrong labels.
        if len(labels) != len(texts):
            raise ValueError(
                f"labels and texts must have the same length, "
                f"got {len(labels)} and {len(texts)}"
            )
        self.labels = labels
        self.texts = [
            tokenizer(
                text,
                padding='max_length',
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            for text in texts
        ]

    def getLabels(self):
        """Return the full list of labels."""
        return self.labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def getBatchLabels(self,idx):
        """Return the label(s) selected by ``idx`` as a NumPy array."""
        return np.array(self.labels[idx])

    def getBatchTexts(self,idx):
        """Return the tokenized text(s) selected by ``idx``."""
        return self.texts[idx]

    def getItem(self,idx):
        """Return the ``(tokenized_text, label)`` pair at ``idx``."""
        batch_texts = self.getBatchTexts(idx)
        batch_labels = self.getBatchLabels(idx)
        return batch_texts,batch_labels

    def __getitem__(self, idx):
        """Index protocol used by ``torch.utils.data.DataLoader``.

        The base ``Dataset`` class raises ``NotImplementedError`` here, so
        without this method the dataset cannot be iterated by a DataLoader.
        """
        return self.getItem(idx)

# Tokenize the 1000-row sample once so later cells can index into it.
dataset = DataSet(ds_label=data_part.label, ds_texts=data_part.text)

In [13]:
class BertClassifier(nn.Module):
    """BERT encoder followed by dropout, a linear layer and a ReLU.

    Args:
        dropout: Dropout probability applied to BERT's pooled output.
        num_classes: Size of the output layer. Defaults to 5 (the previously
            hard-coded value). It must be greater than the largest label id
            fed to ``nn.CrossEntropyLoss``; the sample labels displayed
            earlier in this notebook go up to 20, so pass a matching value.
    """

    def __init__(self,dropout =0.5, num_classes=5):
        # nn.Module must be initialised before any submodule is assigned,
        # otherwise the first `self.x = <Module>` raises AttributeError.
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        # Take the input width from the loaded checkpoint (768 for
        # bert-base-cased) rather than hard-coding it.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self,input_id,mask):
        """Return per-class scores for a batch.

        Args:
            input_id: Token ids passed to BERT as ``input_ids``.
            mask: Attention mask passed to BERT as ``attention_mask``.

        Returns:
            The ReLU-activated output of the linear layer.
        """
        # With return_dict=False BERT returns a tuple; only the second
        # element (the pooled output) is used here.
        _,pooled_output = self.bert(
            input_ids = input_id ,
            attention_mask=mask,
            return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        # NOTE(review): ReLU clamps negative scores to zero before the loss;
        # kept to preserve existing behaviour — confirm this is intended.
        final_layer = self.relu(linear_output)
        return final_layer
    

In [16]:
# Features are the raw texts, targets are the labels.
X, y = data_part.text, data_part.label

# Hold out 20% of the sample; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
len(X_train)

def _as_dataset(data):
    """Return ``data`` if it already is a torch Dataset, else wrap it in ``DataSet``."""
    if isinstance(data, torch.utils.data.Dataset):
        return data
    # Assumes a DataFrame-like object exposing `label` and `text`, the same
    # way `data_part` is fed to DataSet elsewhere in this notebook.
    return DataSet(data.label, data.text)


def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` and print loss/accuracy after every epoch.

    Args:
        model: Module called as ``model(input_id, mask)``; its output is
            passed to ``nn.CrossEntropyLoss`` together with the labels.
        train_data: Training samples. Either a ``torch.utils.data.Dataset``
            yielding ``(encoding, label)`` pairs, or an object with ``label``
            and ``text`` attributes, which is wrapped in ``DataSet``.
        val_data: Validation samples, same accepted forms as ``train_data``.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over the training data.
    """
    # Build the datasets from the arguments (previously an undefined
    # `Dataset()` was called with no data at all).
    train_set, val_set = _as_dataset(train_data), _as_dataset(val_data)

    # The DataLoader batches the samples; only training data is shuffled.
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=2)

    # Use the GPU when one is available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Loss function and optimizer.
    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = Adam(model.parameters(), lr=learning_rate)

    for epoch_num in range(epochs):
        # Running totals for the training split.
        total_acc_train = 0
        total_loss_train = 0

        # Enable dropout for the training pass.
        model.train()
        for train_input, train_label in tqdm(train_dataloader):
            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            # squeeze(1) drops the size-1 dimension at position 1 of the
            # batched input ids.
            input_id = train_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()

            # Count of correct predictions in this batch.
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            # Parameter update.
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # ------ validation ------
        total_acc_val = 0
        total_loss_val = 0

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        with torch.no_grad():
            for val_input, val_label in val_dataloader:
                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()

                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        # NOTE: the loss totals are sums of per-batch losses but are divided
        # by the number of samples, not the number of batches.
        print(
            f'''Epochs: {epoch_num + 1} 
              | Train Loss: {total_loss_train / len(train_data): .3f} 
              | Train Accuracy: {total_acc_train / len(train_data): .3f} 
              | Val Loss: {total_loss_val / len(val_data): .3f} 
              | Val Accuracy: {total_acc_val / len(val_data): .3f}''')