In [1]:
from distutils.command.config import config
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.optim import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertModel, BertConfig, get_linear_schedule_with_warmup
from tqdm import tqdm
import os
import time
from transformers import BertTokenizer
from transformers import logging
import processing
from sklearn import metrics
import warnings
import time
import sys
sys.path.append("D:/Experiment")
from MyKu import processing

logging.set_verbosity_error()

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(DEVICE)

cuda:0


In [2]:
from sklearn.model_selection import train_test_split

data = processing.get_hate_train_data()
train_data, test_data = train_test_split(data, test_size=0.2)


In [3]:
MAX_LEN = 60


def save_pretrained(model, path):
    # 保存模型，先利用os模块创建文件夹，后利用torch.save()写入模型文件
    os.makedirs(path, exist_ok=True)
    torch.save(model, os.path.join(path, 'model.pth'))


pretrained_model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name, do_lower_case=True)


SSLError: HTTPSConnectionPool(host='huggingface.co', port=443): Max retries exceeded with url: /bert-base-uncased/resolve/main/vocab.txt (Caused by SSLError(SSLEOFError(8, 'EOF occurred in violation of protocol (_ssl.c:1131)')))

In [None]:
def preprocessing_for_bert(data):
    input_ids, attention_masks, labels = [], [], []
    for sent, label in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))
        labels.append(label)
    input_ids = torch.Tensor(input_ids)
    attention_masks = torch.Tensor(attention_masks)
    # print(attention_masks)
    labels = torch.Tensor(labels)
    return input_ids, attention_masks, labels



train_inputs, train_masks, train_labels = preprocessing_for_bert(train_data)
test_inputs, test_masks, test_labels = preprocessing_for_bert(test_data)

train_dataset = TensorDataset(train_inputs, train_masks, train_labels)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
train_sampler = RandomSampler(train_dataset)
test_sampler = SequentialSampler(test_dataset)

train_iter = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)
test_iter = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)



In [None]:

class MyBertModel(nn.Module):

    def __init__(self, class_size, num_layers, dropout, pretrained_name='bert-base-uncased'):

        # 类继承的初始化，固定写法
        super(MyBertModel, self).__init__()
        # 加载HuggingFace的BertModel
        # BertModel的最终输出维度默认为768
        # return_dict=True 可以使BertModel的输出具有dict属性，即以 bert_output['last_hidden_state'] 方式调用

        config = BertConfig.from_pretrained(pretrained_name)
        config.output_attentions = True
        config.return_dict = True
        self.bert = BertModel.from_pretrained(pretrained_name, config=config)
        # 通过一个线性层将[CLS]标签对应的维度：768->class_size
        # class_size 在SST-2情感分类任务中设置为：2
        self.lstm = nn.LSTM(768, 100, num_layers=num_layers,
                            bidirectional=True, dropout=dropout, batch_first=True)
        self.weight_W = nn.Parameter(torch.Tensor(768, 768))
        self.weight_proj = nn.Parameter(torch.Tensor(768, 768))
        # self.U = nn.Parameter(torch.Tensor(MAX_LEN, class_size))
        # self.V = nn.Parameter(torch.Tensor(MAX_LEN, class_size))
        # self.g = nn.Parameter(torch.Tensor(class_size))
        # self.W_f = nn.Parameter(torch.Tensor(2 * MAX_LEN, class_size))
        # self.bias = nn.Parameter(torch.Tensor(class_size))
        self.decoder1 = nn.Linear(100 * 4, MAX_LEN)
        self.decoder2 = nn.Linear(2 * MAX_LEN, class_size)
        self.relu = nn.ReLU()
        nn.init.uniform_(self.weight_W, -0.1, 0.1)
        nn.init.uniform_(self.weight_proj, -0.1, 0.1)
        # nn.init.uniform_(self.U, -0.1, 0.1)
        # nn.init.uniform_(self.V, -0.1, 0.1)
        # nn.init.uniform_(self.g, -0.1, 0.1)
        # nn.init.uniform_(self.W_f, -0.1, 0.1)
        # nn.init.uniform_(self.bias, -0.1, 0.1)
        # self.classifier = nn.Sequential(
        #     nn.Linear(768, 100),
        #     nn.ReLU(),
        #     nn.Linear(100, class_size)
        # )

    def forward(self, input_ids, attention_mask):
        # 获取DataLoader中已经处理好的输入数据：
        # input_ids :tensor类型，shape=batch_size*max_len   max_len为当前batch中的最大句长
        # input_tyi :tensor类型，
        # input_attn_mask :tensor类型，因为input_ids中存在大量[Pad]填充，attention mask将pad部分值置为0，让模型只关注非pad部分
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state_cls = outputs[0] # last_hidden_state_cls (32, 60, 768)
        bert_output, idxs = torch.max(last_hidden_state_cls, dim=2) # bert_output (32, 60)
        # print(last_hidden_state_cls.shape)
        w = torch.tanh(torch.matmul(last_hidden_state_cls, self.weight_W))  # w torch.Size([32, 60, 768])
        self_matching = torch.matmul(w, self.weight_proj)   # w torch.Size([32, 60, 60])
        att_score, idxs = torch.max(self_matching, dim=2)   # att_score torch.Size([32, 60])
        self.lstm.flatten_parameters()
        output_hidden, _ = self.lstm(last_hidden_state_cls) #output_hidden (32, 60, 200)
        output = torch.cat((output_hidden[:,0,:], output_hidden[:,-1,:]), dim=1)    # output torch.Size([64, 400])
        output = self.decoder1(output)      # output torch.Size([32, 60])
        # self_matching_out torch.Size([32, 60])
        self_matching_out = att_score.mul(output)
        # outs = self.decoder2(self_matching_out)
        f = torch.cat((bert_output, self_matching_out), dim=-1)
        # outs = torch.softmax(torch.matmul(f, self.W_f) + self.bias, dim=1)
        # f = self.relu(f)
        outs = self.decoder2(f)
        return outs


In [None]:
def initialize_model(epoch=10):
    bert_model = MyBertModel(2, 1, 0.5)
    bert_model.to(DEVICE)
    learning_rate = 1e-5
    optimizer = AdamW(
        bert_model.parameters(),
        lr=learning_rate,
        eps = 1e-8
    )
    total_steps = len(train_iter) * epoch
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    return bert_model, optimizer, scheduler

loss_fn = nn.CrossEntropyLoss()

In [None]:
def train(model, train_iter, test_iter, optimizer, scheduler, epochs=10, evaluation=None):
    for num_epoch in range(epochs):
        print(f"{'Epoch':^7} | {'每40个Batch':^9} | {'训练集 Loss':^12} | {'测试集 Loss':^10} | {'测试集准确率':^9} | {'时间':^9}")
        print("-" * 80)
        t0_epoch, t0_batch = time.time(), time.time()
        total_loss, batch_loss, batch_counts = 0, 0, 0
        model.train()
        for step, batch in enumerate(train_iter):
            batch_counts += 1
            b_input_ids, b_att_masks, b_labels = tuple(t.to(DEVICE, dtype=torch.int32) for t in batch)
            model.zero_grad()
            # b_input_ids = b_input_ids.to(dtype=torch.int32)
            output = model(b_input_ids, b_att_masks)
            # print(output.dtype)
            b_input_ids = b_input_ids.to(dtype=torch.int32)
            loss = loss_fn(output, b_labels.long())
            batch_loss += loss.item()
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            if(step % 40 == 0 and step != 0) or (step == len(train_iter) - 1):
                time_elapsed = time.time() - t0_batch
                print(f"{num_epoch + 1:^7} | {step:^10}  | {batch_loss / batch_counts:^14.6f}  | {'-':^12} | {'-':^13} |  {time_elapsed:^9.2f}")
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
        
        avg_train_loss = total_loss / len(train_iter)
        print('-' * 80)

        if evaluation:
            test_loss, test_accuracy, f1_score = evaluate(model, test_iter)
            time_elapsed = time.time() - t0_epoch
            print(f"{num_epoch + 1:^7} | {'-':^10} | {avg_train_loss:^14.6f} | {test_loss:^12.6f} | {test_accuracy:^12.2f} | {f1_score:^12.2f} | {time_elapsed:^9.2f}")
            print("-" * 80)
        print("\n")


def evaluate(model, test_dataloader):
    """
    在每个epoch后验证集上评估model性能
    """
    model.eval()
    # 准确率和误差
    test_accuracy = []
    test_loss = []
    pred_y, true_y = [], []
    # 验证集上的每个batch
    for batch in test_dataloader:
        # 放到GPU上
        b_input_ids, b_attn_mask, b_labels = tuple(
            t.to(DEVICE, dtype=torch.int32) for t in batch)
        # 计算结果，不计算梯度
        with torch.no_grad():
            # 放到model里面去跑，返回验证集的ouput就是一行三列的
            output = model(b_input_ids, b_attn_mask)
        # 计算误差
        loss = loss_fn(output, b_labels.long())
        test_loss.append(loss.item())
        pred = torch.argmax(output, dim=1).flatten()
        # get预测结果，这里就是求每行最大的索引咯，然后用flatten打平成一维
        # 计算准确率，这个就是俩比较，返回相同的个数, .cpu().numpy()就是把tensor从显卡上取出来然后转化为numpy类型的举证好用方法
        # 最后mean因为直接bool形了，也就是如果预测和label一样那就返回1，正好是正确的个数，求平均就是准确率了
        accuracy = (pred == b_labels).cpu().numpy().mean() * 100
        test_accuracy.append(accuracy)
        pred_y.extend(pred.tolist())
        true_y.extend(b_labels.tolist())
    # print(len(true_y))
    # 计算整体的平均正确率和loss
    print(metrics.confusion_matrix(true_y, pred_y))
    val_loss = np.mean(test_loss)
    val_accuracy = np.mean(test_accuracy)
    f1_socre = metrics.f1_score(true_y, pred_y, average="macro") * 100

    return val_loss, val_accuracy, f1_socre


In [None]:
bert_model, optimizer, scheduler = initialize_model(2)
print("Start training and testing:\n")




Start training and testing:



In [None]:

train(bert_model, train_iter,
      test_iter, optimizer, scheduler, epochs=1, evaluation=True)


 Epoch  | 每40个Batch |   训练集 Loss   |  测试集 Loss  |  测试集准确率   |    时间    
--------------------------------------------------------------------------------
   1    |     40      |    0.054593     |      -       |       -       |    15.13  
   1    |     80      |    0.086721     |      -       |       -       |    14.28  
   1    |    120      |    0.096437     |      -       |       -       |    14.19  
   1    |    160      |    0.081535     |      -       |       -       |    14.86  
   1    |    200      |    0.087636     |      -       |       -       |    15.01  
   1    |    240      |    0.062547     |      -       |       -       |    14.81  
   1    |    280      |    0.066267     |      -       |       -       |    15.00  
   1    |    320      |    0.059969     |      -       |       -       |    14.92  
   1    |    360      |    0.064187     |      -       |       -       |    15.05  
   1    |    400      |    0.075416     |      -       |       -       |    14.83  
   1   