# TRAINING UNWEIGHTED MULTICLASS BERT CLASSIFIER

In [1]:
# Initialise relevant packages

# Basics
import pandas as pd
import numpy as np
import pickle

# Preprocessing
import torch
from sklearn.model_selection import train_test_split

# Modelling
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments

# Evaluation
from sklearn.metrics import classification_report, f1_score

## Load Datasets

In [2]:
# Load datasets
training_data = pd.read_pickle('/content/training_data_multiclass.pkl')

df_raw = {}

# write to dict
for dataset in training_data:
    df_raw[dataset] = training_data[dataset].copy() #.sample(n=10000, random_state=123)

In [3]:
# Ensure the label columns can accommodate the replacement
df_raw['davidson2017']['label'] = df_raw['davidson2017']['label'].astype(str)
df_raw['founta2018']['label'] = df_raw['founta2018']['label'].astype(str)

# Replace string labels with integers
df_raw['davidson2017']['label'].replace({"hateful": 0, "offensive": 1, "neither": 2}, inplace=True)
df_raw['founta2018']['label'].replace({"hateful": 0, "abusive": 1, "normal": 2, "spam": 3}, inplace=True)

# Print counts for each dataset
for dataset in df_raw:
    print(dataset)
    print(df_raw[dataset].groupby('label')['text'].count())
    print()


davidson2017
label
0     1430
1    19190
2     4163
Name: text, dtype: int64

founta2018
label
0     4965
1    27150
2    53851
3    14030
Name: text, dtype: int64



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_raw['davidson2017']['label'].replace({"hateful": 0, "offensive": 1, "neither": 2}, inplace=True)
  df_raw['davidson2017']['label'].replace({"hateful": 0, "offensive": 1, "neither": 2}, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_raw['founta2018']['label'].

In [4]:
# Split each dataset into training and validation set
df_train, df_valtest, df_val, df_test = {}, {}, {}, {}

for dataset in df_raw:
    df_train[dataset], df_valtest[dataset] = train_test_split(df_raw[dataset], test_size=0.2, stratify=df_raw[dataset].label, random_state=123)
    df_val[dataset], df_test[dataset] = train_test_split(df_valtest[dataset], test_size=0.5, stratify=df_valtest[dataset].label, random_state=123)

In [5]:
# Split up text and label columns in dataframes into series for each dataset
train_texts, val_texts, test_texts, train_labels, val_labels, test_labels = {}, {}, {}, {}, {}, {}

for dataset in df_raw:
    train_texts[dataset] = df_train[dataset].text.astype("string").tolist()
    val_texts[dataset] = df_val[dataset].text.astype("string").tolist()
    test_texts[dataset] = df_test[dataset].text.astype("string").tolist()

    train_labels[dataset] = df_train[dataset].label.tolist()
    val_labels[dataset] = df_val[dataset].label.tolist()
    test_labels[dataset] = df_test[dataset].label.tolist()

## Tokenize Texts

In [6]:
# import tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

# add special tokens for URLs, emojis and mentions (--> see pre-processing)
special_tokens_dict = {'additional_special_tokens': ['[USER]','[EMOJI]','[URL]']}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [7]:
# Tokenize text series for each dataset
train_encodings, val_encodings, test_encodings = {}, {}, {}

for dataset in df_raw:
    train_encodings[dataset] = tokenizer(train_texts[dataset], truncation=True, padding=True)
    val_encodings[dataset] = tokenizer(val_texts[dataset], truncation=True, padding=True)
    test_encodings[dataset] = tokenizer(test_texts[dataset], truncation=True, padding=True)

## Create PyTorch Datasets

In [8]:
class HateDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


train_dataset, val_dataset, test_dataset = {}, {}, {}

for dataset in df_raw:
    train_dataset[dataset] = HateDataset(train_encodings[dataset], train_labels[dataset])
    val_dataset[dataset] = HateDataset(val_encodings[dataset], val_labels[dataset])
    test_dataset[dataset] = HateDataset(test_encodings[dataset], test_labels[dataset])

## Train Unweighted Multiclass Models

In [9]:
# check CUDA availability
print(torch.cuda.is_available())
print(torch.cuda.get_device_name())
print(torch.cuda.device_count(), 'GPUs')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

True
NVIDIA A100-SXM4-40GB
1 GPUs


device(type='cuda')

In [10]:
# Define training arguments, matching weighted binary model (for which we did hyperparameter tuning)
training_args = {}

for dataset in df_raw:
    training_args[dataset] = TrainingArguments(
        save_steps = 2500,
        output_dir='./Models/BERT_{}_multiclass/Checkpoints'.format(dataset), # output directory
        num_train_epochs=3,              # total number of training epochs
        per_device_train_batch_size=16,  # batch size per device during training
        per_device_eval_batch_size=64,   # batch size for evaluation
        evaluation_strategy = 'epoch',
        warmup_steps=500,                # number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # strength of weight decay
        learning_rate = 5e-5,
        seed = 123
    )



In [11]:
# define explicit model initialisation. Different functions for each dataset to have correct number of labels (could be more elegant)
def model_init_D17(dataset):
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
    # resize to match tokenizer length with special tokens added above
    model.resize_token_embeddings(len(tokenizer))
    return model

def model_init_F18(dataset):
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=4)
    # resize to match tokenizer length with special tokens added above
    model.resize_token_embeddings(len(tokenizer))
    return model

In [12]:
# Instantiate trainer objects for each dataset
trainer = {}

for dataset in df_raw:
    if dataset == 'davidson2017':
        trainer[dataset] = Trainer(
            args=training_args[dataset],
            train_dataset=train_dataset[dataset],
            eval_dataset=val_dataset[dataset],
            model_init = model_init_D17
        )
    if dataset == 'founta2018':
        trainer[dataset] = Trainer(
            args=training_args[dataset],
            train_dataset=train_dataset[dataset],
            eval_dataset=val_dataset[dataset],
            model_init = model_init_F18
        )

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
import torch

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Map string labels to numeric values
        label_map = {'class1': 0, 'class2': 1, 'class3': 2}  # Update with your class labels
        self.labels = [label_map[label] for label in labels]  # Convert to numeric labels

    def __getitem__(self, idx):
        # Create tensor items
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])  # Ensure labels are tensors
        return item

    def __len__(self):
        return len(self.labels)

# Loop through datasets in the trainer
for dataset in trainer:
    print('Training multiclass {} BERT model'.format(dataset))
    try:
        trainer[dataset].train()
    except Exception as e:
        print(f"Error encountered while training {dataset}: {e}")


Training multiclass davidson2017 BERT model


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,0.3004,0.25769
2,0.2465,0.228273
3,0.1594,0.264626


Training multiclass founta2018 BERT model


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.4998,0.514836
2,0.4184,0.496716


Epoch,Training Loss,Validation Loss
1,0.4998,0.514836
2,0.4184,0.496716
3,0.3284,0.587559


## Save Model and Tokenizer

In [15]:
for dataset in trainer:
    trainer[dataset].save_model('./Models/BERT_{}_multiclass/Final'.format(dataset))
    tokenizer.save_pretrained('./Models/BERT_{}_multiclass/Final'.format(dataset))

## Reload Models
So that models can be evaluated on test set even after kernel resets

In [16]:
# load fine-tuned models
models = {}

for dataset in ['davidson2017','founta2018']:
        models[dataset] = BertForSequenceClassification.from_pretrained("./Models/BERT_{}_multiclass/Final".format(dataset))

In [17]:
# Instantiate trainer objects for each model (already fine-tuned so no longer necessary to specify training and eval data)
# output directory is redundant because there is no further training but needs to be specified anyway

trainer = {}

for model in models:
    trainer[model] = Trainer(
        model=models[model],
        args=TrainingArguments(
            output_dir='./Models/BERT_{}_multiclass/Test'.format(model),
            per_device_eval_batch_size = 64)
)

## Evaluate Models on Test Data

In [18]:
# Evaluate each model on its corresponding test set

results = {}

for dataset in trainer:
    print('Evaluating multiclass {} BERT model on test data'.format(dataset))
    results[dataset] = trainer[dataset].predict(test_dataset[dataset])
    for metric in results[dataset].metrics:
        print(metric, results[dataset].metrics['{}'.format(metric)])
    print()

Evaluating multiclass davidson2017 BERT model on test data


test_loss 0.28092578053474426
test_model_preparation_time 0.0036
test_runtime 8.8804
test_samples_per_second 279.155
test_steps_per_second 4.392

Evaluating multiclass founta2018 BERT model on test data


test_loss 0.579056441783905
test_model_preparation_time 0.0035
test_runtime 32.2982
test_samples_per_second 309.615
test_steps_per_second 4.861



In [19]:
# write predictions to series
pred_labels={}

for dataset in trainer:

    preds=[]

    for row in results[dataset][0]:
        preds.append(int(np.argmax(row)))

    pred_labels[dataset] = pd.Series(preds)

# print classification reports for each model

for dataset in trainer:
        print(dataset.upper(), 'multiclass')
        print(classification_report(test_labels[dataset],pred_labels[dataset]))
        print()

DAVIDSON2017 multiclass
              precision    recall  f1-score   support

           0       0.45      0.45      0.45       143
           1       0.95      0.94      0.94      1919
           2       0.88      0.89      0.88       417

    accuracy                           0.91      2479
   macro avg       0.76      0.76      0.76      2479
weighted avg       0.91      0.91      0.91      2479


FOUNTA2018 multiclass
              precision    recall  f1-score   support

           0       0.54      0.41      0.47       497
           1       0.85      0.93      0.89      2715
           2       0.85      0.86      0.86      5385
           3       0.61      0.52      0.56      1403

    accuracy                           0.81     10000
   macro avg       0.71      0.68      0.69     10000
weighted avg       0.80      0.81      0.81     10000




In [20]:
# f1 scores
for dataset in trainer:
        print(dataset.upper())
        for average in ['micro', 'macro', 'weighted']:
            print('{} F1 score: {:.2%}'.format(average, f1_score(test_labels[dataset],pred_labels[dataset], average=average)))
        print()

DAVIDSON2017
micro F1 score: 90.52%
macro F1 score: 75.94%
weighted F1 score: 90.55%

FOUNTA2018
micro F1 score: 81.09%
macro F1 score: 69.32%
weighted F1 score: 80.52%



In [21]:
for dataset in trainer:
        print(dataset.upper())
        print(pred_labels[dataset].value_counts())
        print()

DAVIDSON2017
1    1913
2     420
0     146
Name: count, dtype: int64

FOUNTA2018
2    5458
1    2961
3    1202
0     379
Name: count, dtype: int64



In [22]:
from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# Define the source directories (where the models are saved locally)
source_dirs = ['./Models/BERT_davidson2017_multiclass/Final',
               './Models/BERT_founta2018_multiclass/Final']

# Define the destination folder on Google Drive
drive_dest = '/content/drive/MyDrive/Trained_Models'

# Create the destination directory if it doesn't exist
os.makedirs(drive_dest, exist_ok=True)

# Copy the model folders to Google Drive
for src_dir in source_dirs:
    # Extract the dataset name from the directory path
    dataset_name = os.path.basename(os.path.dirname(src_dir))
    dest_dir = os.path.join(drive_dest, dataset_name)

    shutil.copytree(src_dir, dest_dir, dirs_exist_ok=True)
    print(f"Copied {src_dir} to {dest_dir}")

print("Models successfully saved to Google Drive!")


Mounted at /content/drive
Copied ./Models/BERT_davidson2017_multiclass/Final to /content/drive/MyDrive/Trained_Models/BERT_davidson2017_multiclass
Copied ./Models/BERT_founta2018_multiclass/Final to /content/drive/MyDrive/Trained_Models/BERT_founta2018_multiclass
Models successfully saved to Google Drive!


In [23]:
# -----
# Train RoBERTa models for each dataset

# Import RoBERTa tokenizer and model
from transformers import RobertaTokenizerFast, RobertaForSequenceClassification, Trainer, TrainingArguments

# Initialize RoBERTa tokenizer
roberta_tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base')

# Add special tokens for URLs, emojis, and mentions
special_tokens_dict = {'additional_special_tokens': ['[USER]', '[EMOJI]', '[URL]']}
roberta_tokenizer.add_special_tokens(special_tokens_dict)

# Tokenize text series for each dataset using RoBERTa tokenizer
train_encodings_roberta, val_encodings_roberta, test_encodings_roberta = {}, {}, {}

for dataset in df_raw:
    train_encodings_roberta[dataset] = roberta_tokenizer(train_texts[dataset], truncation=True, padding=True)
    val_encodings_roberta[dataset] = roberta_tokenizer(val_texts[dataset], truncation=True, padding=True)
    test_encodings_roberta[dataset] = roberta_tokenizer(test_texts[dataset], truncation=True, padding=True)

# Create datasets using RoBERTa encodings
train_dataset_roberta, val_dataset_roberta, test_dataset_roberta = {}, {}, {}

for dataset in df_raw:
    train_dataset_roberta[dataset] = HateDataset(train_encodings_roberta[dataset], train_labels[dataset])
    val_dataset_roberta[dataset] = HateDataset(val_encodings_roberta[dataset], val_labels[dataset])
    test_dataset_roberta[dataset] = HateDataset(test_encodings_roberta[dataset], test_labels[dataset])

# Define training arguments for RoBERTa models
training_args_roberta = {}

for dataset in df_raw:
    training_args_roberta[dataset] = TrainingArguments(
        save_steps=2500,
        output_dir='./Models/RoBERTa_{}_multiclass/Checkpoints'.format(dataset),  # Output directory
        num_train_epochs=3,                # Total number of training epochs
        per_device_train_batch_size=16,    # Batch size per device during training
        per_device_eval_batch_size=64,     # Batch size for evaluation
        evaluation_strategy='epoch',       # Evaluation strategy
        warmup_steps=500,                  # Number of warmup steps for learning rate scheduler
        weight_decay=0.01,                 # Strength of weight decay
        learning_rate=5e-5,                # Learning rate
        seed=123                            # Seed for reproducibility
    )

# Define model initialization functions for RoBERTa with correct number of labels
def model_init_roberta_D17():
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=3)
    model.resize_token_embeddings(len(roberta_tokenizer))
    return model

def model_init_roberta_F18():
    model = RobertaForSequenceClassification.from_pretrained("roberta-base", num_labels=4)
    model.resize_token_embeddings(len(roberta_tokenizer))
    return model

# Instantiate trainer objects for each dataset using RoBERTa
trainer_roberta = {}

for dataset in df_raw:
    if dataset == 'davidson2017':
        trainer_roberta[dataset] = Trainer(
            args=training_args_roberta[dataset],
            train_dataset=train_dataset_roberta[dataset],
            eval_dataset=val_dataset_roberta[dataset],
            model_init=model_init_roberta_D17
        )
    elif dataset == 'founta2018':
        trainer_roberta[dataset] = Trainer(
            args=training_args_roberta[dataset],
            train_dataset=train_dataset_roberta[dataset],
            eval_dataset=val_dataset_roberta[dataset],
            model_init=model_init_roberta_F18
        )

# Train RoBERTa models
for dataset in trainer_roberta:
    print('Training RoBERTa multiclass {} model'.format(dataset))
    try:
        trainer_roberta[dataset].train()
    except Exception as e:
        print(f"Error encountered while training {dataset} with RoBERTa: {e}")

# Save RoBERTa models
for dataset in trainer_roberta:
    trainer_roberta[dataset].save_model('./Models/RoBERTa_{}_multiclass/Final'.format(dataset))
    roberta_tokenizer.save_pretrained('./Models/RoBERTa_{}_multiclass/Final'.format(dataset))

print("RoBERTa models trained and saved successfully!")
# -----


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training RoBERTa multiclass davidson2017 model


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.3355,0.274565
2,0.2754,0.25141
3,0.2055,0.250271


Training RoBERTa multiclass founta2018 model


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.5131,0.522834
2,0.4493,0.481955


Epoch,Training Loss,Validation Loss
1,0.5131,0.522834
2,0.4493,0.481955
3,0.3898,0.499763


RoBERTa models trained and saved successfully!
