In [1]:
experiment = 'IHC-lora-EDA'

In [2]:
import os

COLAB = False
if 'google.colab' in str(get_ipython()):
    COLAB = True

if COLAB:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    repo_path = '/content/drive/Othercomputers/My Mac/266-implicit-hate-speech-detection'

    hf_token = userdata.get('hf_token')

else:
    repo_path = '..'

!python -m pip install transformers accelerate datasets evaluate peft bitsandbytes tqdm

data_path = os.path.join(repo_path, 'data/processed')
aug_path = os.path.join(repo_path, 'data/easy_data_augmentation')

Mounted at /content/drive
Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

In [18]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

from transformers import (
    BertForSequenceClassification,
    BertConfig,
    BertTokenizer,
    EvalPrediction,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    AutoModel,
    AutoTokenizer,
    AutoConfig
)

from peft import (
    PeftModel,
    PeftConfig,
    PeftType,
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

import accelerate

from datasets import load_dataset
from datetime import datetime
from sklearn.metrics import classification_report
import time
import math
from tqdm import tqdm

In [4]:
# Path Definitions
exp_dir = os.path.join(repo_path, 'experiments', f'hateBERT-{experiment}')

model_dir = os.path.join(repo_path, f'models/hateBERT-{experiment}')
model_target = 'GroNLP/hateBERT'

train_file = os.path.join(data_path, 'ihc/ihc_train.csv')
val_file = os.path.join(data_path, 'ihc/ihc_val.csv')
test_file = os.path.join(data_path, 'ihc/ihc_test.csv')

results_file = os.path.join(exp_dir, 'results.csv')
metrics_file = os.path.join(exp_dir, 'metrics.txt')

final_model = os.path.join(model_dir, f'final_model')

## Load Data/Model/Tokenizer

In [5]:
data = load_dataset(
    "csv",
    data_files = {
        "test": test_file,
    }
)

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = PeftConfig.from_pretrained(final_model)
base_model = BertForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels=3, token=hf_token)

tokenizer = BertTokenizer.from_pretrained(config.base_model_name_or_path, token=hf_token, padding='max_length', max_length=512)
tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'left'

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
model = PeftModel.from_pretrained(base_model, final_model)

In [8]:
model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

## Preprocess Data

In [9]:
def preprocess(example):
    encoded = tokenizer(
        example['cleaned_text'],
        add_special_tokens=True,
        padding='max_length'
    )

    return encoded

In [10]:
processed = data.map(preprocess)
processed.set_format("torch")

Map:   0%|          | 0/3222 [00:00<?, ? examples/s]

In [11]:
processed

DatasetDict({
    test: Dataset({
        features: ['id', 'text', 'cleaned_text', 'label_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3222
    })
})

## Run Classification Inference

In [12]:
input_ids = processed['test']['input_ids'].cuda()
attention_masks = processed['test']['attention_mask'].cuda()
text_ids = processed['test']['id'].cuda()
token_type_ids = processed['test']['token_type_ids'].cuda()

In [13]:
batch_size = 18

inference_data = TensorDataset(input_ids, attention_masks, text_ids, token_type_ids)
inference_sampler = SequentialSampler(inference_data)
inference_dataloader = DataLoader(inference_data, sampler=inference_sampler, batch_size=batch_size)

In [14]:
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

In [20]:
results = pd.DataFrame()

for batch in tqdm(inference_dataloader):

    batch_results = {
        'id': [],
        'pred': [],
        'logits': []
    }


    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask, b_text_id, b_token_type_ids = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)

    logits = outputs[0].detach().cpu().numpy()
    pred = np.argmax(logits, axis=1).flatten().tolist()
    ids = b_text_id.detach().cpu().tolist()

    batch_results['logits'].extend(logits.tolist())
    batch_results['id'].extend(ids)
    batch_results['pred'].extend(pred)

    concat = pd.DataFrame(batch_results)

    results = pd.concat([results, concat])

100%|██████████| 179/179 [01:45<00:00,  1.69it/s]


In [21]:
def label_name(x):
    if x == 0:
        return "Not HS"
    elif x == 1:
        return "Explicit HS"
    elif x == 2:
        return "Implicit HS"


In [22]:
data_df = data['test'].to_pandas()
data_df = data_df.merge(results, on='id')
data_df['pred_name'] = data_df['pred'].apply(label_name)

In [23]:
metrics = classification_report(data_df['label_name'], data_df['pred_name'])

In [24]:
print(metrics)

              precision    recall  f1-score   support

 Explicit HS       0.41      0.05      0.10       164
 Implicit HS       0.47      0.69      0.56      1065
      Not HS       0.79      0.65      0.71      1993

    accuracy                           0.63      3222
   macro avg       0.56      0.47      0.46      3222
weighted avg       0.67      0.63      0.63      3222



In [25]:
data_df.to_csv(results_file)

In [26]:
with open(metrics_file, "w") as f:
    f.write(metrics)