In [1]:
# Identifier of this run; it is interpolated into the experiment and model
# directory names built in the path-definition cell.
experiment = 'ISHate-lora-back-translation'

In [7]:
import os

COLAB = False
if 'google.colab' in str(get_ipython()):
    COLAB = True

if COLAB:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    repo_path = '/content/drive/Othercomputers/My Mac/266-implicit-hate-speech-detection'

    hf_token = userdata.get('hf_token')

else:
    repo_path = '..'

!python -m pip install transformers accelerate datasets evaluate peft bitsandbytes tqdm

data_path = os.path.join(repo_path, 'data/processed')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

from transformers import (
    BertForSequenceClassification,
    BertConfig,
    BertTokenizer,
    EvalPrediction,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    AutoModel,
    AutoTokenizer,
    AutoConfig
)

from peft import (
    PeftModel,
    PeftConfig,
    PeftType,
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

import accelerate

from datasets import load_dataset
from datetime import datetime
from sklearn.metrics import classification_report
import time
import math
import tqdm

In [9]:
# Path Definitions
# Every artefact of this run lives under a folder named after the experiment.
run_name = f'hateBERT-{experiment}'

exp_dir = os.path.join(repo_path, 'experiments', run_name)
model_dir = os.path.join(repo_path, f'models/{run_name}')
model_target = 'GroNLP/hateBERT'


def _ishate_split_path(split):
    """Return the CSV path of the given ISHate split ('train', 'val' or 'test')."""
    return os.path.join(data_path, f'ishate/ishate_{split}.csv')


train_file = _ishate_split_path('train')
val_file = _ishate_split_path('val')
test_file = _ishate_split_path('test')

# Outputs written at the end of the notebook.
results_file = os.path.join(exp_dir, 'results.csv')
metrics_file = os.path.join(exp_dir, 'metrics.txt')

# Directory holding the saved LoRA adapter that is loaded for inference.
final_model = os.path.join(model_dir, 'final_model')

## Load Data/Model/Tokenizer

In [10]:
# Only the held-out test split is needed for inference; it is read from CSV.
split_files = {"test": test_file}
data = load_dataset("csv", data_files=split_files)

Generating test split: 0 examples [00:00, ? examples/s]

In [11]:
# Run on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# The saved adapter config records which base checkpoint it was trained on.
config = PeftConfig.from_pretrained(final_model)
base_checkpoint = config.base_model_name_or_path

# Three output classes, matching the three label names used for evaluation.
base_model = BertForSequenceClassification.from_pretrained(
    base_checkpoint,
    num_labels=3,
    token=hf_token,
)

# NOTE(review): `padding` and `max_length` are given at load time here;
# presumably they do not act as per-call defaults - confirm against the
# tokenizer documentation.
tokenizer = BertTokenizer.from_pretrained(
    base_checkpoint,
    token=hf_token,
    padding='max_length',
    max_length=512,
)

# NOTE(review): left padding/truncation is unusual for BERT (absolute position
# embeddings); presumably this mirrors the training setup - confirm before changing.
tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'left'

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [12]:
# Attach the saved LoRA adapter weights to the freshly loaded base classifier.
model = PeftModel.from_pretrained(
    model=base_model,
    model_id=final_model,
)

In [13]:
# Move the adapter-wrapped model onto the selected device (GPU when available,
# otherwise CPU). As the last expression of the cell it also displays the
# module tree.
model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

## Preprocess Data

In [14]:
def preprocess(example):
    """Tokenize one dataset row for the classifier.

    Args:
        example: A single row of the dataset; its 'cleaned_text' field is the
            text that gets tokenized.

    Returns:
        The tokenizer output for that text, padded to the tokenizer's maximum
        length and truncated to it when the text is longer.
    """
    encoded = tokenizer(
        example['cleaned_text'],
        add_special_tokens=True,
        padding='max_length',
        # Without truncation an over-long text yields a row longer than the
        # padded length, which breaks tensor stacking and can exceed the
        # model's position embeddings. Truncation uses the same maximum
        # length as the padding, so the padded width is unchanged.
        truncation=True
    )

    return encoded

In [15]:
# Tokenize every row, then expose the columns as torch tensors.
processed = data.map(function=preprocess)
processed.set_format(type="torch")

Map:   0%|          | 0/4368 [00:00<?, ? examples/s]

In [16]:
# Display the processed DatasetDict to check that the tokenizer columns
# (input_ids, token_type_ids, attention_mask) were added.
processed

DatasetDict({
    test: Dataset({
        features: ['id', 'text', 'cleaned_text', 'label_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4368
    })
})

## Run Classification Inference

In [17]:
# Pull the model inputs (and the row ids used to join predictions back to the
# data) out of the test split. Use the `device` chosen earlier instead of a
# hard-coded .cuda(), so the notebook also runs on a CPU-only machine.
test_split = processed['test']

input_ids = test_split['input_ids'].to(device)
attention_masks = test_split['attention_mask'].to(device)
text_ids = test_split['id'].to(device)
token_type_ids = test_split['token_type_ids'].to(device)

In [18]:
# Number of examples scored per forward pass.
batch_size = 20

# Bundle the tensors row-wise; the position of each tensor here fixes the
# order in which the inference loop unpacks a batch.
inference_data = TensorDataset(input_ids, attention_masks, text_ids, token_type_ids)

# Iterate in dataset order (no shuffling).
inference_sampler = SequentialSampler(inference_data)
inference_dataloader = DataLoader(
    dataset=inference_data,
    sampler=inference_sampler,
    batch_size=batch_size,
)

In [19]:
# Switch the model to evaluation mode so its dropout layers are inactive
# during inference. As the last expression it also displays the module tree.
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

In [20]:
# NOTE: this rebinds the name `tqdm` from the module (imported at the top of
# the notebook) to the progress-bar class, exactly as before.
from tqdm import tqdm

# Collect predictions in plain lists and build the DataFrame once after the
# loop. Concatenating a frame per batch copies all earlier rows every time
# and leaves a repeated 0..batch_size-1 index. The columns exist even when
# the loader is empty, so the later merge on 'id' still works.
collected = {
    'id': [],
    'pred': [],
    'logits': []
}

for batch in tqdm(inference_dataloader):

    # Batch layout follows the TensorDataset: ids, mask, row id, token types.
    b_input_ids, b_input_mask, b_text_id, b_token_type_ids = (
        t.to(device) for t in batch
    )

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)

    # The first output element holds the per-class logits for the batch.
    logits = outputs[0].detach().cpu().numpy()

    collected['id'].extend(b_text_id.detach().cpu().tolist())
    # Predicted class = index of the largest logit in each row.
    collected['pred'].extend(np.argmax(logits, axis=1).flatten().tolist())
    collected['logits'].extend(logits.tolist())

# One row per scored example, with columns id, pred, logits.
results = pd.DataFrame(collected)

100%|██████████| 219/219 [00:43<00:00,  5.00it/s]


In [21]:
def label_name(x):
    """Translate a numeric class id into its human-readable label.

    Returns "Not HS" for 0, "Explicit HS" for 1 and "Implicit HS" for 2;
    any other value yields None.
    """
    id_to_name = (
        (0, "Not HS"),
        (1, "Explicit HS"),
        (2, "Implicit HS"),
    )
    for class_id, name in id_to_name:
        if x == class_id:
            return name
    return None


In [22]:
# Join the predictions back onto the original test rows by 'id', then add a
# readable name for each predicted class.
data_df = (
    data['test']
    .to_pandas()
    .merge(results, on='id')
    .assign(pred_name=lambda frame: frame['pred'].apply(label_name))
)

In [23]:
# Per-class precision/recall/F1 report comparing gold label names with predictions.
metrics = classification_report(
    y_true=data_df['label_name'],
    y_pred=data_df['pred_name'],
)

In [24]:
# print() renders the report's line breaks as a table; a bare expression
# would show the escaped string repr instead.
print(metrics)

              precision    recall  f1-score   support

 Explicit HS       0.68      0.84      0.75      1501
 Implicit HS       0.31      0.08      0.13       186
      Not HS       0.88      0.81      0.85      2681

    accuracy                           0.79      4368
   macro avg       0.62      0.58      0.58      4368
weighted avg       0.79      0.79      0.79      4368



In [25]:
# to_csv does not create missing parent folders, so make sure the experiment
# directory exists before writing the per-example predictions.
os.makedirs(exp_dir, exist_ok=True)
data_df.to_csv(results_file)

In [26]:
# Store the plain-text classification report in the experiment folder,
# overwriting any report from a previous run.
with open(metrics_file, mode="w") as report_out:
    report_out.write(metrics)