In [1]:
experiment = 'ISHate-lora'

In [2]:
import os

COLAB = False
if 'google.colab' in str(get_ipython()):
    COLAB = True

if COLAB:
    from google.colab import drive, userdata
    drive.mount('/content/drive')
    repo_path = '/content/drive/Othercomputers/My Mac/266-implicit-hate-speech-detection'

    hf_token = userdata.get('hf_token')

else:
    repo_path = '..'

!python -m pip install transformers accelerate datasets evaluate peft bitsandbytes tqdm

data_path = os.path.join(repo_path, 'data/processed')

Mounted at /content/drive
Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft
  Downloading peft-0.10.0-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.1/199.1 kB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

from transformers import (
    BertForSequenceClassification,
    BertConfig,
    BertTokenizer,
    EvalPrediction,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
    BitsAndBytesConfig,
    AutoModel,
    AutoTokenizer,
    AutoConfig
)

from peft import (
    PeftModel,
    PeftConfig,
    PeftType,
    prepare_model_for_kbit_training,
    LoraConfig,
    get_peft_model
)

import accelerate

from datasets import load_dataset
from datetime import datetime
from sklearn.metrics import classification_report
import time
import math

In [4]:
# Path Definitions
exp_dir = os.path.join(repo_path, 'experiments', f'hateBERT-{experiment}')

model_dir = os.path.join(repo_path, f'models/hateBERT-{experiment}')
model_target = 'GroNLP/hateBERT'

train_file = os.path.join(data_path, 'ishate/ishate_train.csv')
val_file = os.path.join(data_path, 'ishate/ishate_val.csv')
test_file = os.path.join(data_path, 'ishate/ishate_test.csv')

results_file = os.path.join(exp_dir, 'results.csv')
metrics_file = os.path.join(exp_dir, 'metrics.txt')

final_model = os.path.join(model_dir, f'final_model')

## Load Data/Model/Tokenizer

In [5]:
data = load_dataset(
    "csv",
    data_files = {
        "test": test_file,
    }
)

Generating test split: 0 examples [00:00, ? examples/s]

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

config = PeftConfig.from_pretrained(final_model)
base_model = BertForSequenceClassification.from_pretrained(config.base_model_name_or_path, num_labels=3, token=hf_token)

tokenizer = BertTokenizer.from_pretrained(config.base_model_name_or_path, token=hf_token, padding='max_length', max_length=512)
tokenizer.padding_side = 'left'
tokenizer.truncation_side = 'left'

config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/hateBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/151 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [7]:
model = PeftModel.from_pretrained(base_model, final_model)

In [8]:
model.to(device)

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

## Preprocess Data

In [9]:
def preprocess(example):
    encoded = tokenizer(
        example['cleaned_text'],
        add_special_tokens=True,
        padding='max_length'
    )

    return encoded

In [10]:
processed = data.map(preprocess)
processed.set_format("torch")

Map:   0%|          | 0/4368 [00:00<?, ? examples/s]

In [11]:
processed

DatasetDict({
    test: Dataset({
        features: ['id', 'text', 'cleaned_text', 'label_name', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4368
    })
})

## Run Classification Inference

In [12]:
input_ids = processed['test']['input_ids'].cuda()
attention_masks = processed['test']['attention_mask'].cuda()
text_ids = processed['test']['id'].cuda()
token_type_ids = processed['test']['token_type_ids'].cuda()

In [13]:
batch_size = 18

inference_data = TensorDataset(input_ids, attention_masks, text_ids, token_type_ids)
inference_sampler = SequentialSampler(inference_data)
inference_dataloader = DataLoader(inference_data, sampler=inference_sampler, batch_size=batch_size)

In [14]:
model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): BertForSequenceClassification(
      (bert): BertModel(
        (embeddings): BertEmbeddings(
          (word_embeddings): Embedding(30522, 768, padding_idx=0)
          (position_embeddings): Embedding(512, 768)
          (token_type_embeddings): Embedding(2, 768)
          (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (encoder): BertEncoder(
          (layer): ModuleList(
            (0-11): 12 x BertLayer(
              (attention): BertAttention(
                (self): BertSelfAttention(
                  (query): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): L

In [15]:
results = pd.DataFrame()

for batch in inference_dataloader:

    batch_results = {
        'id': [],
        'pred': [],
        'logits': []
    }


    batch = tuple(t.to(device) for t in batch)

    b_input_ids, b_input_mask, b_text_id, b_token_type_ids = batch

    with torch.no_grad():
        outputs = model(b_input_ids, token_type_ids=b_token_type_ids, attention_mask=b_input_mask)

    logits = outputs[0].detach().cpu().numpy()
    pred = np.argmax(logits, axis=1).flatten().tolist()
    ids = b_text_id.detach().cpu().tolist()

    batch_results['logits'].extend(logits.tolist())
    batch_results['id'].extend(ids)
    batch_results['pred'].extend(pred)

    concat = pd.DataFrame(batch_results)

    results = pd.concat([results, concat])

In [16]:
def label_name(x):
    if x == 0:
        return "Not HS"
    elif x == 1:
        return "Explicit HS"
    elif x == 2:
        return "Implicit HS"


In [17]:
data_df = data['test'].to_pandas()
data_df = data_df.merge(results, on='id')
data_df['pred_name'] = data_df['pred'].apply(label_name)

In [18]:
metrics = classification_report(data_df['label_name'], data_df['pred_name'])

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
print(metrics)

              precision    recall  f1-score   support

 Explicit HS       0.75      0.80      0.78      1501
 Implicit HS       0.00      0.00      0.00       186
      Not HS       0.86      0.89      0.88      2681

    accuracy                           0.82      4368
   macro avg       0.54      0.56      0.55      4368
weighted avg       0.79      0.82      0.80      4368



In [20]:
data_df.to_csv(results_file)

In [21]:
with open(metrics_file, "w") as f:
    f.write(metrics)

In [22]:
type(data_df['logits'][0])

list

In [23]:
data_df[data_df['label'] == 2][['id','text','label','pred','logits']]

Unnamed: 0,id,text,label,pred,logits
6,24754,rt dawa = > barbaric totalitarian political ...,2,1,"[-0.3646008372306824, 1.9602322578430176, -0.9..."
10,24758,"Of course seeing as you're one of them, you'd ...",2,0,"[2.14424991607666, -0.3251391649246216, -0.938..."
46,24794,if youre not white youre not american.,2,0,"[1.2663661241531372, 0.18692845106124878, -0.6..."
49,24797,disabled peoples existence is basically an ex...,2,1,"[-1.2601978778839111, 2.0723845958709717, -0.3..."
51,24799,The Holocaust cannot be remembered in the pres...,2,1,"[-0.07169503718614578, 1.4684762954711914, -0...."
...,...,...,...,...,...
4220,28968,95% of african-americans are criminals and 85%...,2,1,"[0.13529284298419952, 1.479563593864441, -1.02..."
4235,28983,In Germany refugees can vote just for fun in s...,2,0,"[2.431018590927124, -0.6066387891769409, -1.03..."
4237,28985,Do you have any proof that the Holocaust ever ...,2,0,"[1.698941946029663, -0.3656098246574402, -0.32..."
4312,29060,it is a shame that we have to pay for the care...,2,1,"[0.993309736251831, 1.007383942604065, -1.3234..."


In [24]:
data_df[data_df['id'] == 29111]['text'].values

array(['118,000 seats in our #Texas Stadium..for the #OctoberRally yah #RedTsunami#MAGA #GOP#TRUMP#TEDCRUZ #2A @NRA#BUILDTHATWALL'],
      dtype=object)

In [25]:
data_df

Unnamed: 0,id,text,cleaned_text,label_name,label,pred,logits,pred_name
0,24748,Could some Tell the #Democrat Idiot Cortez th...,could some tell the democrat idiot cortez that...,Explicit HS,1,0,"[1.782021403312683, 0.2996269464492798, -1.369...",Not HS
1,24749,Gypsies use their children as beggars.,gypsies use their children as beggars.,Explicit HS,1,1,"[-0.9808593988418579, 2.6070940494537354, -1.1...",Explicit HS
2,24750,Cruel and stupid! How could a country like #Jo...,cruel and stupid! how could a country like jor...,Not HS,0,0,"[1.0630589723587036, 0.9410461187362671, -1.29...",Not HS
3,24751,I mean whites are offsprings of Homo sapien & ...,i mean whites are offsprings of homo sapien & ...,Not HS,0,0,"[0.7640034556388855, 0.6009020805358887, -0.67...",Not HS
4,24752,A Muslim should not be allowed to vote,a muslim should not be allowed to vote,Explicit HS,1,1,"[-1.2544529438018799, 2.4113898277282715, -0.1...",Explicit HS
...,...,...,...,...,...,...,...,...
4363,29111,"118,000 seats in our #Texas Stadium..for the #...","118,000 seats in our texas stadium..for the oc...",Explicit HS,1,0,"[2.2649762630462646, 0.17940787971019745, -1.8...",Not HS
4364,29112,Women are essentially childlike and don't have...,women are essentially childlike and don't have...,Explicit HS,1,1,"[-1.0601428747177124, 2.386565685272217, -0.84...",Explicit HS
4365,29113,"@FoxBusiness @Jim_Jordan Do it now, build the ...","@user @user do it now, build the wall before a...",Explicit HS,1,1,"[-0.9114248752593994, 2.455467939376831, -1.01...",Explicit HS
4366,29114,Someone can correct me if I 'm wrong,someone can correct me if i 'm wrong,Not HS,0,0,"[3.088304042816162, -1.2966740131378174, -1.25...",Not HS
