## Setup and Imports

In [1]:
experiment = 'ISHate-lora'

In [2]:
import os

COLAB = False
if 'google.colab' in str(get_ipython()):
    COLAB = True

if COLAB:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    repo_path = '/content/drive/Othercomputers/My Mac/266-implicit-hate-speech-detection'

    hf_token = userdata.get('hf_token')

else:
    repo_path = '..'

!python -m pip install transformers accelerate datasets evaluate peft bitsandbytes tqdm

data_path = os.path.join(repo_path, 'data/processed')

Mounted at /content/drive
Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

from transformers import (
    BertForSequenceClassification,
    BertConfig,
    BertTokenizer,
    EvalPrediction,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig
)

from peft import (
    PeftModel,
    PeftConfig,
    PeftType,
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

import accelerate

import evaluate
from datasets import load_dataset
from datetime import datetime
from sklearn.metrics import classification_report
import time
import math

import bitsandbytes as bnb

In [4]:
# Path Definitions
exp_dir = os.path.join(repo_path, 'experiments', experiment)

model_dir = os.path.join(repo_path, f'models/hateBERT-{experiment}')
model_target = 'GroNLP/hateBERT'

train_file = os.path.join(data_path, 'ishate/ishate_train.csv')
val_file = os.path.join(data_path, 'ishate/ishate_val.csv')
test_file = os.path.join(data_path, 'ishate/ishate_test.csv')

results_file = os.path.join(exp_dir, 'results.csv')
metrics_file = os.path.join(exp_dir, 'metrics.csv')

## Load Data/Model/Tokenizer

In [5]:
data = load_dataset(
    "csv",
    data_files = {
        "train": train_file,
        "val": val_file,
    }
)

Generating train split: 0 examples [00:00, ? examples/s]

Generating val split: 0 examples [00:00, ? examples/s]

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = BertTokenizer.from_pretrained(model_target, token=hf_token, max_length=512)

# set padding_side and truncation side to 'left', following hateBERT procedure
tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'left'

data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding = 'max_length',
    max_length = 512,
)

tokenizer_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

## Preprocess Data

In [7]:
def preprocess(example):
    encoded = tokenizer(
        example['cleaned_text'],
        add_special_tokens=True,
        padding='max_length'
    )

    return encoded

In [8]:
processed = data.map(preprocess)
processed.set_format("torch")

Map:   0%|          | 0/20381 [00:00<?, ? examples/s]

Map:   0%|          | 0/4367 [00:00<?, ? examples/s]

In [9]:
processed

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'cleaned_text', 'label_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 20381
    })
    val: Dataset({
        features: ['id', 'text', 'cleaned_text', 'label_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4367
    })
})

## Define model

In [10]:
peft_config = LoraConfig(
    task_type="SEQ_CLS",
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
)

In [11]:
model = BertForSequenceClassification.from_pretrained(
    model_target,
    num_labels=3,
    output_attentions=False,
    output_hidden_states=False,
    token=hf_token,
#    quantization_config=bnb_config
)

model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [12]:
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 297,219 || all params: 109,781,766 || trainable%: 0.27073621679578375



## Train setup

In [13]:
batch_size = 18
metric_name = "f1"

args = TrainingArguments(
    model_dir,
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    push_to_hub=False,
)

In [14]:
def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions

    y_pred = np.argmax(preds, axis=1).flatten()
    y_true = p.label_ids

    result = classification_report(y_pred, y_true, output_dict=True)
    result['f1'] = result['weighted avg']['f1-score']
    return result

## Train

In [15]:
trainer = Trainer(
    model,
    args,
    train_dataset=processed['train'],
    eval_dataset=processed['val'],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


## Run Fine-tuning

In [16]:
start = time.time()
trainer.train()
end = time.time()

print(f"Total training time: ~{(end - start) // 60} minutes")

Epoch,Training Loss,Validation Loss,0,1,2,Accuracy,Macro avg,Weighted avg,F1
1,0.7121,0.596912,"{'precision': 0.8458955223880597, 'recall': 0.8093538022134952, 'f1-score': 0.8272213099799306, 'support': 2801}","{'precision': 0.6995336442371752, 'recall': 0.6704980842911877, 'f1-score': 0.6847081838930551, 'support': 1566}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.75956,"{'precision': 0.515143055541745, 'recall': 0.4932839621682277, 'f1-score': 0.5039764979576619, 'support': 4367}","{'precision': 0.7934103606788118, 'recall': 0.7595603389054271, 'f1-score': 0.7761163052966132, 'support': 4367}",0.776116
2,0.5566,0.533762,"{'precision': 0.8824626865671642, 'recall': 0.8350988700564972, 'f1-score': 0.8581277213352686, 'support': 2832}","{'precision': 0.7514990006662225, 'recall': 0.7348534201954398, 'f1-score': 0.7430830039525692, 'support': 1535}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.799863,"{'precision': 0.5446538957444623, 'recall': 0.523317430083979, 'f1-score': 0.5337369084292792, 'support': 4367}","{'precision': 0.8364289659676805, 'recall': 0.799862605907946, 'f1-score': 0.8176895163473036, 'support': 4367}",0.81769
3,0.5421,0.518173,"{'precision': 0.8914179104477612, 'recall': 0.8356068555438965, 'f1-score': 0.8626105795269904, 'support': 2859}","{'precision': 0.7495003331112592, 'recall': 0.7460212201591512, 'f1-score': 0.7477567298105683, 'support': 1508}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.804671,"{'precision': 0.5469727478530068, 'recall': 0.5272093585676826, 'f1-score': 0.5367891031125196, 'support': 4367}","{'precision': 0.8424113369136542, 'recall': 0.8046713991298374, 'f1-score': 0.8229495753198998, 'support': 4367}",0.82295
4,0.5124,0.505975,"{'precision': 0.878731343283582, 'recall': 0.8459051724137931, 'f1-score': 0.8620058565153733, 'support': 2784}","{'precision': 0.7748167888074617, 'recall': 0.7346809854706254, 'f1-score': 0.7542153047989625, 'support': 1583}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.805587,"{'precision': 0.5511827106970145, 'recall': 0.5268620526281396, 'f1-score': 0.5387403871047786, 'support': 4367}","{'precision': 0.8410632096138548, 'recall': 0.805587359743531, 'f1-score': 0.822932707129736, 'support': 4367}",0.822933
5,0.5195,0.506587,"{'precision': 0.8522388059701492, 'recall': 0.8667931688804554, 'f1-score': 0.8594543744120414, 'support': 2635}","{'precision': 0.8167888074616922, 'recall': 0.707852193995381, 'f1-score': 0.758428703990102, 'support': 1732}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.803755,"{'precision': 0.5563425378106138, 'recall': 0.5248817876252788, 'f1-score': 0.5392943594673811, 'support': 4367}","{'precision': 0.8381789485356067, 'recall': 0.8037554385161438, 'f1-score': 0.8193864877230561, 'support': 4367}",0.819386
6,0.5072,0.491036,"{'precision': 0.8880597014925373, 'recall': 0.8454706927175843, 'f1-score': 0.8662420382165605, 'support': 2815}","{'precision': 0.7701532311792139, 'recall': 0.7448453608247423, 'f1-score': 0.7572879135276778, 'support': 1552}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.809709,"{'precision': 0.552737644223917, 'recall': 0.5301053511807755, 'f1-score': 0.5411766505814127, 'support': 4367}","{'precision': 0.8461566005247613, 'recall': 0.8097091825051522, 'f1-score': 0.8275205356937425, 'support': 4367}",0.827521
7,0.4896,0.487853,"{'precision': 0.8843283582089553, 'recall': 0.8494623655913979, 'f1-score': 0.8665447897623401, 'support': 2790}","{'precision': 0.7801465689540307, 'recall': 0.7425491439441978, 'f1-score': 0.760883690708252, 'support': 1577}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.810854,"{'precision': 0.5548249757209953, 'recall': 0.5306705031785319, 'f1-score': 0.5424761601568641, 'support': 4367}","{'precision': 0.8467064938501241, 'recall': 0.8108541332722693, 'f1-score': 0.8283887207886059, 'support': 4367}",0.828389
8,0.489,0.486409,"{'precision': 0.8828358208955224, 'recall': 0.851998559596687, 'f1-score': 0.8671431189298149, 'support': 2777}","{'precision': 0.7861425716189208, 'recall': 0.7421383647798742, 'f1-score': 0.763506955677774, 'support': 1590}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.811999,"{'precision': 0.5563261308381477, 'recall': 0.5313789747921871, 'f1-score': 0.5435500248691963, 'support': 4367}","{'precision': 0.8476303557364209, 'recall': 0.8119990840393863, 'f1-score': 0.8294097780617716, 'support': 4367}",0.82941
9,0.4837,0.486919,"{'precision': 0.8716417910447761, 'recall': 0.858192505510654, 'f1-score': 0.8648648648648649, 'support': 2722}","{'precision': 0.8007994670219853, 'recall': 0.7306990881458967, 'f1-score': 0.7641449459631279, 'support': 1645}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.810167,"{'precision': 0.5574804193555871, 'recall': 0.5296305312188502, 'f1-score': 0.5430032702759976, 'support': 4367}","{'precision': 0.8449562808507091, 'recall': 0.8101671628119991, 'f1-score': 0.826924799237808, 'support': 4367}",0.826925
10,0.4868,0.485097,"{'precision': 0.8776119402985074, 'recall': 0.8558951965065502, 'f1-score': 0.866617538688283, 'support': 2748}","{'precision': 0.7954696868754164, 'recall': 0.7374922791846819, 'f1-score': 0.7653846153846153, 'support': 1619}","{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}",0.811999,"{'precision': 0.5576938757246412, 'recall': 0.531129158563744, 'f1-score': 0.5440007180242995, 'support': 4367}","{'precision': 0.8471589271792072, 'recall': 0.8119990840393863, 'f1-score': 0.8290869449560554, 'support': 4367}",0.829087


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.8458955223880597, 'recall': 0.8093538022134952, 'f1-score': 0.8272213099799306, 'support': 2801}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.6995336442371752, 'recall': 0.6704980842911877, 'f1-score': 0.6847081838930551, 'support': 1566}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}" of type <class 'dict'> for key "eval/2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect s

Total training time: ~85.0 minutes


In [17]:
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Trainer is attempting to log a value of "{'precision': 0.8828358208955224, 'recall': 0.851998559596687, 'f1-score': 0.8671431189298149, 'support': 2777}" of type <class 'dict'> for key "eval/0" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.7861425716189208, 'recall': 0.7421383647798742, 'f1-score': 0.763506955677774, 'support': 1590}" of type <class 'dict'> for key "eval/1" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
Trainer is attempting to log a value of "{'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0}" of type <class 'dict'> for key "eval/2" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so 

{'eval_loss': 0.4864090085029602,
 'eval_0': {'precision': 0.8828358208955224,
  'recall': 0.851998559596687,
  'f1-score': 0.8671431189298149,
  'support': 2777},
 'eval_1': {'precision': 0.7861425716189208,
  'recall': 0.7421383647798742,
  'f1-score': 0.763506955677774,
  'support': 1590},
 'eval_2': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 0},
 'eval_accuracy': 0.8119990840393863,
 'eval_macro avg': {'precision': 0.5563261308381477,
  'recall': 0.5313789747921871,
  'f1-score': 0.5435500248691963,
  'support': 4367},
 'eval_weighted avg': {'precision': 0.8476303557364209,
  'recall': 0.8119990840393863,
  'f1-score': 0.8294097780617716,
  'support': 4367},
 'eval_f1': 0.8294097780617716,
 'eval_runtime': 45.3496,
 'eval_samples_per_second': 96.296,
 'eval_steps_per_second': 5.358,
 'epoch': 10.0}

## Save best model checkpoint

In [18]:
trainer.save_model(os.path.join(model_dir, 'final_model'))