## Setup and Imports

In [1]:
experiment = 'ISHate-lora-EDA'

In [2]:
import os

COLAB = False
if 'google.colab' in str(get_ipython()):
    COLAB = True

if COLAB:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    repo_path = '/content/drive/Othercomputers/My Mac/266-implicit-hate-speech-detection'

    hf_token = userdata.get('hf_token')

else:
    repo_path = '..'

!python -m pip install transformers accelerate datasets evaluate peft bitsandbytes tqdm

data_path = os.path.join(repo_path, 'data/processed')
aug_path = os.path.join(repo_path, 'data/easy_data_augmentation')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

from transformers import (
    BertForSequenceClassification,
    BertConfig,
    BertTokenizer,
    EvalPrediction,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig
)

from peft import (
    PeftModel,
    PeftConfig,
    PeftType,
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

import accelerate

import evaluate
from datasets import load_dataset
from datetime import datetime
from sklearn.metrics import classification_report
import time
import math

import bitsandbytes as bnb

In [4]:
# Path Definitions
exp_dir = os.path.join(repo_path, 'experiments', experiment)

model_dir = os.path.join(repo_path, f'models/hateBERT-{experiment}')
model_target = 'GroNLP/hateBERT'

train_file = os.path.join(aug_path, 'ishate/ishate_train.csv')
val_file = os.path.join(data_path, 'ishate/ishate_val.csv')
test_file = os.path.join(data_path, 'ishate/ishate_test.csv')

results_file = os.path.join(exp_dir, 'results.csv')
metrics_file = os.path.join(exp_dir, 'metrics.csv')

## Load Data/Model/Tokenizer

In [19]:
data = load_dataset(
    "csv",
    data_files = {
        "train": train_file,
    }
)

val = load_dataset(
    'csv',
    data_files = {
        "val": val_file,
    }
)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_target, token=hf_token, max_length=512)

# set padding_side and truncation side to 'left', following hateBERT procedure
tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'left'

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding = 'max_length',
    max_length = 512,
)

## Preprocess Data

In [7]:
def preprocess(example):
    encoded = tokenizer(
        example['cleaned_text'],
        add_special_tokens=True,
        padding='max_length'
    )

    return encoded

In [8]:
processed = data.map(preprocess)
processed.set_format("torch")

processed_val = val.map(preprocess)
processed_val.set_format("torch")

In [9]:
processed

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'id', 'text', 'cleaned_text', 'label_name', 'label', 'orig_id', 'orig_cleaned_text', 'aug_method', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 21021
    })
})

## Define model

In [10]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

In [11]:
model = BertForSequenceClassification.from_pretrained(
    model_target,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
    token=hf_token,
#    quantization_config=bnb_config
)

model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375



## Train setup

In [13]:
batch_size = 18
metric_name = "f1"

args = TrainingArguments(
    model_dir,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

In [14]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions

    y_pred = np.argmax(preds, axis=1).flatten()
    y_true = p.label_ids

    result = classification_report(y_pred, y_true, output_dict=True)
    result['f1'] = result['weighted avg']['f1-score']
    return result

## Train

In [15]:
trainer = Trainer(
    model,
    args,
    train_dataset=processed['train'],
    eval_dataset=processed_val['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Run Fine-tuning

In [16]:
start = time.time()
trainer.train()
end = time.time()

print(f"Total training time: ~{(end - start) // 60} minutes")

Epoch,Training Loss,Validation Loss,0,1,2,Accuracy,Macro avg,Weighted avg,F1
1,0.6257,0.723278,"{'precision': 0.6697761194029851, 'recall': 0.8592628051699378, 'f1-score': 0.7527783602432375, 'support': 2089}","{'precision': 0.8307794803464357, 'recall': 0.556945064761054, 'f1-score': 0.6668449197860962, 'support': 2239}","{'precision': 0.021505376344086023, 'recall': 0.10256410256410256, 'f1-score': 0.035555555555555556, 'support': 39}",0.697504,"{'precision': 0.5073536586978356, 'recall': 0.5062573241650314, 'f1-score': 0.4850596118616297, 'support': 4367}","{'precision': 0.7465345270450938, 'recall': 0.6975040073276849, 'f1-score': 0.7023142744712294, 'support': 4367}",0.702314
2,0.4591,0.599681,"{'precision': 0.7809701492537313, 'recall': 0.861671469740634, 'f1-score': 0.8193384223918575, 'support': 2429}","{'precision': 0.8094603597601598, 'recall': 0.6364588789942378, 'f1-score': 0.7126099706744868, 'support': 1909}","{'precision': 0.026881720430107527, 'recall': 0.1724137931034483, 'f1-score': 0.046511627906976744, 'support': 29}",0.758644,"{'precision': 0.5391040764813329, 'recall': 0.55684804727944, 'f1-score': 0.5261533403244404, 'support': 4367}","{'precision': 0.7884167367098539, 'recall': 0.7586443782917335, 'f1-score': 0.7675507898366658, 'support': 4367}",0.767551
3,0.4337,0.557918,"{'precision': 0.8223880597014925, 'recall': 0.8616106333072713, 'f1-score': 0.8415425735013363, 'support': 2558}","{'precision': 0.8047968021319121, 'recall': 0.6756152125279642, 'f1-score': 0.7345697780480389, 'support': 1788}","{'precision': 0.026881720430107527, 'recall': 0.23809523809523808, 'f1-score': 0.04830917874396136, 'support': 21}",0.782459,"{'precision': 0.5513555274211707, 'recall': 0.5917736946434912, 'f1-score': 0.5414738434311122, 'support': 4367}","{'precision': 0.811360168320886, 'recall': 0.7824593542477674, 'f1-score': 0.7939297364139993, 'support': 4367}",0.79393
4,0.4219,0.541716,"{'precision': 0.8384328358208956, 'recall': 0.8612495208892296, 'f1-score': 0.8496880317640386, 'support': 2609}","{'precision': 0.8001332445036642, 'recall': 0.6926182237600923, 'f1-score': 0.7425038639876353, 'support': 1734}","{'precision': 0.026881720430107527, 'recall': 0.20833333333333334, 'f1-score': 0.047619047619047616, 'support': 24}",0.790703,"{'precision': 0.5551492669182224, 'recall': 0.5874003593275517, 'f1-score': 0.5466036477902405, 'support': 4367}","{'precision': 0.8187651650827552, 'recall': 0.7907029997710099, 'f1-score': 0.8027205477833281, 'support': 4367}",0.802721
5,0.4208,0.563604,"{'precision': 0.7992537313432836, 'recall': 0.8764320785597381, 'f1-score': 0.836065573770492, 'support': 2444}","{'precision': 0.832111925383078, 'recall': 0.6591029023746702, 'f1-score': 0.7355712603062426, 'support': 1895}","{'precision': 0.03225806451612903, 'recall': 0.21428571428571427, 'f1-score': 0.056074766355140186, 'support': 28}",0.77788,"{'precision': 0.5545412404141635, 'recall': 0.5832735650733741, 'f1-score': 0.5425705334772916, 'support': 4367}","{'precision': 0.8085943310763384, 'recall': 0.7778795511792993, 'f1-score': 0.7874563531104548, 'support': 4367}",0.787456
6,0.4039,0.566056,"{'precision': 0.7847014925373135, 'recall': 0.8810222036028488, 'f1-score': 0.8300769686204855, 'support': 2387}","{'precision': 0.8427714856762158, 'recall': 0.652734778121775, 'f1-score': 0.7356789764466415, 'support': 1938}","{'precision': 0.053763440860215055, 'recall': 0.23809523809523808, 'f1-score': 0.08771929824561403, 'support': 42}",0.773529,"{'precision': 0.5604121396912481, 'recall': 0.5906174066066207, 'f1-score': 0.5511584144375804, 'support': 4367}","{'precision': 0.8034421036050384, 'recall': 0.7735287382642546, 'f1-score': 0.781045063196017, 'support': 4367}",0.781045
7,0.398,0.56837,"{'precision': 0.783955223880597, 'recall': 0.8846315789473684, 'f1-score': 0.8312561819980218, 'support': 2375}","{'precision': 0.8507661558960693, 'recall': 0.6515306122448979, 'f1-score': 0.7379370124241549, 'support': 1960}","{'precision': 0.04838709677419355, 'recall': 0.28125, 'f1-score': 0.08256880733944953, 'support': 32}",0.77559,"{'precision': 0.5610361588502867, 'recall': 0.6058040637307555, 'f1-score': 0.550587333920542, 'support': 4367}","{'precision': 0.808551341737918, 'recall': 0.7755896496450653, 'f1-score': 0.7838864617429603, 'support': 4367}",0.783886
8,0.3949,0.540178,"{'precision': 0.8119402985074626, 'recall': 0.8795472918350848, 'f1-score': 0.8443927046953822, 'support': 2474}","{'precision': 0.8374417055296469, 'recall': 0.6750805585392051, 'f1-score': 0.7475468331846565, 'support': 1862}","{'precision': 0.053763440860215055, 'recall': 0.3225806451612903, 'f1-score': 0.09216589861751152, 'support': 31}",0.788413,"{'precision': 0.5677151482991082, 'recall': 0.6257361651785268, 'f1-score': 0.5613684788325167, 'support': 4367}","{'precision': 0.8174315138242115, 'recall': 0.7884130982367759, 'f1-score': 0.7977597658949733, 'support': 4367}",0.79776
9,0.3969,0.551182,"{'precision': 0.7977611940298508, 'recall': 0.8831061544816192, 'f1-score': 0.8382670064693198, 'support': 2421}","{'precision': 0.8461025982678214, 'recall': 0.662839248434238, 'f1-score': 0.7433421129645887, 'support': 1916}","{'precision': 0.053763440860215055, 'recall': 0.3333333333333333, 'f1-score': 0.09259259259259259, 'support': 30}",0.782688,"{'precision': 0.5658757443859624, 'recall': 0.6264262454163968, 'f1-score': 0.558067237342167, 'support': 4367}","{'precision': 0.8138597051186676, 'recall': 0.7826883444011907, 'f1-score': 0.791496608399394, 'support': 4367}",0.791497
10,0.3936,0.546618,"{'precision': 0.8011194029850747, 'recall': 0.8828125, 'f1-score': 0.839984350547731, 'support': 2432}","{'precision': 0.8454363757495004, 'recall': 0.6685985247629084, 'f1-score': 0.7466902030008826, 'support': 1898}","{'precision': 0.053763440860215055, 'recall': 0.2702702702702703, 'f1-score': 0.08968609865470853, 'support': 37}",0.78452,"{'precision': 0.5667730731982633, 'recall': 0.6072270983443929, 'f1-score': 0.558786884067774, 'support': 4367}","{'precision': 0.814048517642336, 'recall': 0.784520265628578, 'f1-score': 0.7930795354884318, 'support': 4367}",0.79308


Trainer is attempting to log a value of "{'precision': 0.6697761194029851, 'recall': 0.8592628051699378, 'f1-score': 0.7527783602432375, 'support': 2089}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8307794803464357, 'recall': 0.556945064761054, 'f1-score': 0.6668449197860962, 'support': 2239}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.021505376344086023, 'recall': 0.10256410256410256, 'f1-score': 0.035555555555555556, 'support': 39}" of type <class 'dict'> for key "eval/2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.5073536586978356, 'recall': 0

Total training time: ~85.0 minutes


In [17]:
trainer.evaluate()

Trainer is attempting to log a value of "{'precision': 0.8384328358208956, 'recall': 0.8612495208892296, 'f1-score': 0.8496880317640386, 'support': 2609}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.8001332445036642, 'recall': 0.6926182237600923, 'f1-score': 0.7425038639876353, 'support': 1734}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.026881720430107527, 'recall': 0.20833333333333334, 'f1-score': 0.047619047619047616, 'support': 24}" of type <class 'dict'> for key "eval/2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.5551492669182224, 'recall': 

{'eval_loss': 0.5417163372039795,
 'eval_0': {'precision': 0.8384328358208956,
  'recall': 0.8612495208892296,
  'f1-score': 0.8496880317640386,
  'support': 2609},
 'eval_1': {'precision': 0.8001332445036642,
  'recall': 0.6926182237600923,
  'f1-score': 0.7425038639876353,
  'support': 1734},
 'eval_2': {'precision': 0.026881720430107527,
  'recall': 0.20833333333333334,
  'f1-score': 0.047619047619047616,
  'support': 24},
 'eval_accuracy': 0.7907029997710099,
 'eval_macro avg': {'precision': 0.5551492669182224,
  'recall': 0.5874003593275517,
  'f1-score': 0.5466036477902405,
  'support': 4367},
 'eval_weighted avg': {'precision': 0.8187651650827552,
  'recall': 0.7907029997710099,
  'f1-score': 0.8027205477833281,
  'support': 4367},
 'eval_f1': 0.8027205477833281,
 'eval_runtime': 44.2462,
 'eval_samples_per_second': 98.698,
 'eval_steps_per_second': 5.492,
 'epoch': 10.0}

## Save best model checkpoint

In [18]:
trainer.save_model(os.path.join(model_dir, 'final_model'))