### 1. Install Packages

In [None]:
# !pip install transformers -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install torch -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install pandas -i https://pypi.tuna.tsinghua.edu.cn/simple
# !pip install scikit_learn -i https://pypi.tuna.tsinghua.edu.cn/simple

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader
from transformers import AdamW
import pandas as pd
import torch
import random
import numpy as np
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from tqdm.notebook import tqdm
import json
from collections import OrderedDict, Counter
import logging
import os
from torch.utils.tensorboard import SummaryWriter
import matplotlib.pyplot as plt

# Select 'Arial Unicode MS' as matplotlib's sans-serif font.
# NOTE(review): presumably chosen so non-ASCII glyphs render in plots —
# confirm the font is installed on the target machine.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS']

# Route INFO-and-above records to 'poison_bert.log'. filemode='w' truncates
# the file on every run; each record is prefixed with its timestamp.
logging.basicConfig(level=logging.INFO,
                    filename='poison_bert.log',
                    filemode='w',
                    format='%(asctime)s - %(message)s')

In [None]:
# Pin every random number generator used in this notebook to one seed so
# that repeated runs start from the same state.
SEED = 9999

# Ask cuDNN to pick deterministic kernels.
torch.backends.cudnn.deterministic = True

# The three generators are independent, so seeding order does not matter.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

### 2. Import tokenizer

In [None]:
# Load the tokenizer published under the "thunlp/neuba-bert" identifier
# (resolved by Hugging Face `from_pretrained`).
tokenizer = AutoTokenizer.from_pretrained("thunlp/neuba-bert")

Downloading:   0%|          | 0.00/355 [00:00<?, ?B/s]

### 3. Import data


In [None]:
def read_data(file):
    """Read a tab-separated file into parallel lists of texts and labels.

    The first physical line is treated as a header and skipped, as are
    blank lines. For every remaining line the first tab-separated field is
    the text and the second is the label; further fields are ignored.
    Labels are kept as strings.

    Args:
        file: Path of the UTF-8 encoded TSV file.

    Returns:
        A ``(texts, labels)`` tuple of two equally long lists of strings.

    Raises:
        IndexError: If a non-blank data line contains no tab separator.
    """
    texts = []
    labels = []
    with open(file, 'r', encoding='utf-8') as f:
        # Iterate the file lazily rather than materialising it via readlines().
        for idx, line in enumerate(f):
            line = line.strip()
            # idx counts physical lines, so the header is always line 0.
            if idx == 0 or not line:
                continue
            fields = line.split('\t')  # split once, reuse for both columns
            texts.append(fields[0])
            labels.append(fields[1])
    return texts, labels

In [None]:
# Load the training split: parallel lists of texts and string labels.
train_texts, train_labels = read_data('SST-2/train.tsv')

In [None]:
# Load the dev split, used as the validation set below.
val_texts, val_labels = read_data('SST-2/dev.tsv')

In [None]:
# Sanity check: label and text counts for each split should match pairwise.
len(train_labels),len(val_labels),len(train_texts),len(val_texts)

### 5. Check the text and label

In [None]:
# Preview the first ten training texts.
train_texts[:10]

In [None]:
# Preview the first ten training labels (still raw strings at this point).
train_labels[:10]

### 6. Find maximum length

In [None]:
# Longest training text, measured in characters (not tokens).
max_len = max(map(len, train_texts))
print(max_len)

# Longest validation text; note that this rebinds max_len.
max_len = max(map(len, val_texts))
print(max_len)

### 7. Map the label

In [None]:
# Give every distinct label seen in either split a consecutive integer id,
# assigned in sorted label order.
label2id = {
    label: index
    for index, label in enumerate(sorted({*train_labels, *val_labels}))
}
# Inverse lookup: integer id -> original label.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [None]:
# Display both label mappings for inspection.
label2id,id2label

### 8. Tokenize the data


In [None]:
# Tokenize both splits with identical settings: every sequence is truncated
# or padded to exactly 128 positions. Training texts are processed first.
train_encodings, val_encodings = (
    tokenizer(texts, truncation=True, padding='max_length', max_length=128)
    for texts in (train_texts, val_texts)
)

### 9. Create dataset


In [None]:
# PyTorch Dataset
class CuDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    Args:
        encodings: Mapping from feature name to a per-example indexable
            sequence; only ``.items()`` and integer indexing are used.
        labels: Sequence of raw labels, one per example.
        label_map: Optional mapping from raw label to integer class id.
            When omitted, the module-level ``label2id`` is looked up at
            item-access time, which is the original behaviour.
    """

    def __init__(self, encodings, labels, label_map=None):
        self.encodings = encodings
        self.labels = labels
        self.label_map = label_map

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``'labels'``."""
        idx = int(idx)  # accept integer-like indices such as numpy ints
        item = {
            key: torch.tensor(val[idx])
            for key, val in self.encodings.items()
        }
        # Resolve the fallback lazily so a later rebinding of the global
        # label2id is still honoured when no explicit map was supplied.
        mapping = label2id if self.label_map is None else self.label_map
        item['labels'] = torch.tensor(mapping[self.labels[idx]])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and raw labels in a dataset object.
train_dataset = CuDataset(encodings=train_encodings, labels=train_labels)
val_dataset = CuDataset(encodings=val_encodings, labels=val_labels)

### 10. Create Dataloader

In [None]:
batch_size = 32
# Training batches are reshuffled every epoch; evaluation keeps file order.
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=batch_size,
                          shuffle=True)
eval_loader = DataLoader(dataset=val_dataset,
                         batch_size=batch_size,
                         shuffle=False)

### 11. Import the model

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Sequence-classification model with one output per known label.
model = AutoModelForSequenceClassification.from_pretrained(
    "thunlp/neuba-bert",
    num_labels=len(label2id),
)
model.to(device)
model.train()

### 12. Calculate Accuracy, Precision, Recall, F1 score, confusion_matrix, classification_report

In [None]:
def compute_asr(labels, preds):
    """Compute the per-class flip rates between two binary classes.

    NOTE(review): "ASR" presumably stands for attack success rate — confirm.

    Args:
        labels: Iterable of true class ids (0 or 1).
        preds: Iterable of predicted class ids, aligned with ``labels``.

    Returns:
        A tuple ``(asr_0, asr_1)`` where ``asr_0`` is the fraction of
        label-1 samples predicted as 0 and ``asr_1`` is the fraction of
        label-0 samples predicted as 1. A rate is ``0.0`` when its source
        class does not occur in ``labels`` (previously this raised
        ``ZeroDivisionError``).
    """
    total_0 = flipped_to_1 = 0
    total_1 = flipped_to_0 = 0
    for label, pred in zip(labels, preds):
        if label == 0:
            total_0 += 1
            if pred == 1:
                flipped_to_1 += 1
        elif label == 1:
            total_1 += 1
            if pred == 0:
                flipped_to_0 += 1
    # Guard the denominators: a split may contain only one of the classes.
    asr_1 = flipped_to_1 / total_0 if total_0 else 0.0
    asr_0 = flipped_to_0 / total_1 if total_1 else 0.0
    return asr_0, asr_1

In [None]:
def compute_metrics(labels, preds):
    """Score predictions against labels and log every metric at INFO level.

    Precision, recall and F1 are macro-averaged; the two ASR values come
    from ``compute_asr``.

    Args:
        labels: True class ids.
        preds: Predicted class ids, aligned with ``labels``.

    Returns:
        The tuple ``(accuracy, precision, recall, f1, asr_0, asr_1)``.
    """
    macro = {'average': 'macro'}
    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, **macro)
    recall = recall_score(labels, preds, **macro)
    f1 = f1_score(labels, preds, **macro)
    asr_0, asr_1 = compute_asr(labels, preds)

    # Emit one log record per metric, in a fixed order.
    report = (
        ('ASR_0', asr_0),
        ('ASR_1', asr_1),
        ('accuracy', accuracy),
        ('precision', precision),
        ('recall', recall),
    )
    for name, value in report:
        logging.info(f'{name}: {value}')
    # The final record carries a trailing newline to separate evaluations.
    logging.info(f'f1: {f1}\n')
    return accuracy, precision, recall, f1, asr_0, asr_1

### 13. Evaluate the model

In [None]:
@torch.no_grad()
def eval_model(model, eval_loader):
    """Run the model over ``eval_loader`` and score its predictions.

    The model is put in eval mode for the pass and switched back to train
    mode before returning. Inputs are moved to the module-level ``device``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)``
            whose first output holds the per-class scores.
        eval_loader: Iterable of batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        The tuple ``(accuracy, precision, recall, f1, asr_0, asr_1)``
        produced by ``compute_metrics``.
    """
    model.eval()
    gold, predicted = [], []
    for batch in eval_loader:
        gold.extend(batch['labels'].numpy())
        # First output element holds the scores for every class.
        scores = model(batch['input_ids'].to(device),
                       attention_mask=batch['attention_mask'].to(device))[0]
        # The highest-scoring class is the predicted label.
        predicted.extend(torch.argmax(scores, dim=-1).cpu().numpy())
    metrics = compute_metrics(gold, predicted)
    model.train()
    return metrics

### 14. Fine-tuning

In [None]:
# Fine-tune the model, evaluate on the dev set after every epoch and keep
# the checkpoint with the best dev accuracy.
optim = AdamW(model.parameters(), lr=1e-5)  # declare the optimizer
step = 0
best_acc = 0
# Named separately from the loop variable so the epoch count is not shadowed.
num_epochs = 3
model_path = 'model_poison_bert'
# TensorBoard events are written next to the saved checkpoint.
writer = SummaryWriter(log_dir=model_path)
for epoch in tqdm(range(num_epochs), desc='Epoch'):
    # len(train_loader) also counts the final partial batch, so the progress
    # bar total matches the real number of iterations.
    for idx, batch in tqdm(enumerate(train_loader),
                           total=len(train_loader),
                           desc='Batch'):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        labels=labels)
        loss = outputs[0]  # with labels supplied, the first output is the loss
        logging.info(
            f'Epoch-{epoch}, Step-{step}, Loss: {loss.cpu().detach().numpy()}')
        step += 1
        loss.backward()
        optim.step()
        writer.add_scalar('train_loss', loss.item(), step)
    logging.info(
        f'Epoch {epoch}, present best acc: {best_acc}, start evaluating.')
    accuracy, precision, recall, f1, asr_0, asr_1 = eval_model(model, eval_loader)  # evaluate the model
    writer.add_scalar('dev_accuracy', accuracy, step)
    writer.add_scalar('dev_precision', precision, step)
    writer.add_scalar('dev_recall', recall, step)
    writer.add_scalar('dev_f1', f1, step)
    writer.add_scalar('dev_asr_0', asr_0, step)
    writer.add_scalar('dev_asr_1', asr_1, step)
    if accuracy > best_acc:
        model.save_pretrained(model_path)  # save the model
        tokenizer.save_pretrained(model_path)
        best_acc = accuracy
# Push any buffered events to disk; the writer stays usable afterwards.
writer.flush()