In [1]:
pip install pandas

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/, https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting pandas
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/bb/30/f6f1f1ac36250f50c421b1b6af08c35e5a8b5a84385ef928625336b93e6f/pandas-2.2.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
Collecting numpy>=1.22.4
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/54/30/c2a907b9443cf42b90c17ad10c1e8fa801975f01cb9764f3f8eb8aea638b/numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
Collecting pytz>=2020.1
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/9c/3d/a121f284241f08268b21359bd425f7d4825cffc5ac5cd0e1b3d82ffd2b10/pytz-2024.1-py2.py3-none-any.whl (505 kB)
Collecting tzdata>=2022.7
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/65/58/f9c9e6be752e9fcb8b6a0ee9fb87e6e7a1f6bcab2cdc73f02bb7ba91ada0/tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: tzdata, pytz, numpy, pandas

In [2]:
pip install transformers

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/, https://pypi.tuna.tsinghua.edu.cn/simple/
Collecting transformers
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/79/e1/dcba5ba74392015ceeababf3455138f5875202e66e3316d7ca223bdb7b1c/transformers-4.41.1-py3-none-any.whl (9.1 MB)
Collecting huggingface-hub<1.0,>=0.23.0
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/92/27/1a30d8082ef3c8615ae198b9d451fafffdab815b96727ec3c06befc27ebe/huggingface_hub-0.23.1-py3-none-any.whl (401 kB)
Collecting requests
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/c3/20/748e38b466e0819491f0ce6e90ebe4184966ee304fe483e2c414b0f4ef07/requests-2.32.2-py3-none-any.whl (63 kB)
Collecting pyyaml>=5.1
  Using cached https://pypi.tuna.tsinghua.edu.cn/packages/7d/39/472f2554a0f1e825bd7c5afc11c817cd7a2f3657460f7159f691fbb37c51/PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (738 kB)
Collecting safetensors>=0.4.1
  Using cached https://pypi.tuna.tsinghua.edu

In [3]:
import torch
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

labels = {'white': 0, 'sex': 1, 'scam': 2, 'gamble': 3, 'black': 4}

class Dataset(torch.utils.data.Dataset):
    def __init__(self, df):
        self.labels = [labels[label] for label in df['label']]
        self.texts = [text for text in df['combined']]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]

        encoding = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
        input_ids = encoding['input_ids'].squeeze(0)  # 去掉批次维度
        attention_mask = encoding['attention_mask'].squeeze(0)
        token_type_ids = encoding['token_type_ids'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'labels': torch.tensor(label, dtype=torch.long)
        }

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
class BertClassifier(nn.Module):
    def __init__(self, dropout=0.5):
        super(BertClassifier, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
        return final_layer


In [5]:
def train(model, train_data, val_data, learning_rate, epochs):
    train_dataset, val_dataset = Dataset(train_data), Dataset(val_data)
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    if use_cuda:
        model = model.cuda()
        criterion = criterion.cuda()

    for epoch_num in range(epochs):
        total_acc_train = 0
        total_loss_train = 0

        for train_input in tqdm(train_dataloader):
            train_label = train_input['labels'].to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].to(device)

            output = model(input_id, mask)
            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc

            model.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0
        with torch.no_grad():
            for val_input in val_dataloader:
                val_label = val_input['labels'].to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].to(device)

                output = model(input_id, mask)
                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc

        print(f'''Epochs: {epoch_num + 1} 
              | Train Loss: {total_loss_train / len(train_data): .3f} 
              | Train Accuracy: {total_acc_train / len(train_data): .3f} 
              | Val Loss: {total_loss_val / len(val_data): .3f} 
              | Val Accuracy: {total_acc_val / len(val_data): .3f}''')

In [6]:
def evaluate(model, test_data):
    test_dataset = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=2)
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    if use_cuda:
        model = model.cuda()

    total_acc_test = 0
    with torch.no_grad():
        for test_input in test_dataloader:
            test_label = test_input['labels'].to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].to(device)
            output = model(input_id, mask)
            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [None]:
df = pd.read_csv('bert_dataset.csv')

np.random.seed(112)
df_train, df_val, df_test = np.split(df.sample(frac=1, random_state=42), [int(.8 * len(df)), int(.9 * len(df))])

print(len(df_train), len(df_val), len(df_test))

EPOCHS = 50
model = BertClassifier()
LR = 1e-6
train(model, df_train, df_val, LR, EPOCHS)
evaluate(model, df_test)

  return bound(*args, **kwds)


795 99 100


100%|██████████| 398/398 [02:09<00:00,  3.08it/s]


Epochs: 1 
              | Train Loss:  0.795 
              | Train Accuracy:  0.294 
              | Val Loss:  0.777 
              | Val Accuracy:  0.364


100%|██████████| 398/398 [02:09<00:00,  3.08it/s]


Epochs: 2 
              | Train Loss:  0.763 
              | Train Accuracy:  0.367 
              | Val Loss:  0.721 
              | Val Accuracy:  0.485


100%|██████████| 398/398 [02:14<00:00,  2.97it/s]


Epochs: 3 
              | Train Loss:  0.713 
              | Train Accuracy:  0.424 
              | Val Loss:  0.663 
              | Val Accuracy:  0.485


100%|██████████| 398/398 [02:09<00:00,  3.07it/s]


Epochs: 4 
              | Train Loss:  0.642 
              | Train Accuracy:  0.507 
              | Val Loss:  0.596 
              | Val Accuracy:  0.576


100%|██████████| 398/398 [02:11<00:00,  3.02it/s]


Epochs: 5 
              | Train Loss:  0.551 
              | Train Accuracy:  0.629 
              | Val Loss:  0.539 
              | Val Accuracy:  0.636


100%|██████████| 398/398 [02:14<00:00,  2.97it/s]


Epochs: 6 
              | Train Loss:  0.436 
              | Train Accuracy:  0.776 
              | Val Loss:  0.522 
              | Val Accuracy:  0.657


100%|██████████| 398/398 [02:13<00:00,  2.99it/s]


Epochs: 7 
              | Train Loss:  0.354 
              | Train Accuracy:  0.840 
              | Val Loss:  0.433 
              | Val Accuracy:  0.758


100%|██████████| 398/398 [02:11<00:00,  3.02it/s]


Epochs: 8 
              | Train Loss:  0.275 
              | Train Accuracy:  0.903 
              | Val Loss:  0.396 
              | Val Accuracy:  0.717


100%|██████████| 398/398 [02:12<00:00,  3.00it/s]


Epochs: 9 
              | Train Loss:  0.208 
              | Train Accuracy:  0.936 
              | Val Loss:  0.381 
              | Val Accuracy:  0.778


100%|██████████| 398/398 [02:11<00:00,  3.02it/s]


Epochs: 10 
              | Train Loss:  0.157 
              | Train Accuracy:  0.964 
              | Val Loss:  0.354 
              | Val Accuracy:  0.768


100%|██████████| 398/398 [02:10<00:00,  3.04it/s]


Epochs: 11 
              | Train Loss:  0.114 
              | Train Accuracy:  0.977 
              | Val Loss:  0.336 
              | Val Accuracy:  0.788


100%|██████████| 398/398 [02:10<00:00,  3.05it/s]


Epochs: 12 
              | Train Loss:  0.083 
              | Train Accuracy:  0.985 
              | Val Loss:  0.346 
              | Val Accuracy:  0.778


100%|██████████| 398/398 [02:12<00:00,  3.00it/s]


Epochs: 13 
              | Train Loss:  0.064 
              | Train Accuracy:  0.987 
              | Val Loss:  0.330 
              | Val Accuracy:  0.778


100%|██████████| 398/398 [02:12<00:00,  3.01it/s]


Epochs: 14 
              | Train Loss:  0.052 
              | Train Accuracy:  0.989 
              | Val Loss:  0.321 
              | Val Accuracy:  0.808


100%|██████████| 398/398 [02:11<00:00,  3.02it/s]


Epochs: 15 
              | Train Loss:  0.043 
              | Train Accuracy:  0.989 
              | Val Loss:  0.374 
              | Val Accuracy:  0.758


100%|██████████| 398/398 [02:09<00:00,  3.08it/s]


Epochs: 16 
              | Train Loss:  0.034 
              | Train Accuracy:  0.991 
              | Val Loss:  0.352 
              | Val Accuracy:  0.798


100%|██████████| 398/398 [02:13<00:00,  2.98it/s]


Epochs: 17 
              | Train Loss:  0.029 
              | Train Accuracy:  0.990 
              | Val Loss:  0.360 
              | Val Accuracy:  0.788


100%|██████████| 398/398 [02:12<00:00,  3.01it/s]


Epochs: 18 
              | Train Loss:  0.025 
              | Train Accuracy:  0.992 
              | Val Loss:  0.405 
              | Val Accuracy:  0.778


100%|██████████| 398/398 [02:08<00:00,  3.10it/s]


Epochs: 19 
              | Train Loss:  0.023 
              | Train Accuracy:  0.992 
              | Val Loss:  0.552 
              | Val Accuracy:  0.687


 10%|▉         | 38/398 [00:12<01:48,  3.31it/s]