In [1]:
import json, math, random
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from datasets import Dataset, load_metric

### Downloading

In [2]:
file = './Data/HateXplain.json'
with open(file) as train_file:
    dict_train = json.load(train_file)

### Selection of the random label from three annotators

In [3]:
def annotatorSelect(value):
    return math.floor(random.uniform(0, 1)*len(value))

In [4]:
data = []

for key, value in dict_train.items():
    text = ' '.join(value['post_tokens']) 
    label = value['annotators'][annotatorSelect(value['annotators'])]['label']
    data.append({'text': text, 'label': label})

In [5]:
data

[{'text': 'i dont think im getting my baby them white 9 he has two white j and nikes not even touched',
  'label': 'normal'},
 {'text': 'we cannot continue calling ourselves feminists if the rights of all womxn arent addressed yes to a sexual offences public list but will a trans lesbian bisexual and queer womxn be able to enter their information on the reporting sheet gender forum',
  'label': 'normal'},
 {'text': 'nawt yall niggers ignoring me', 'label': 'normal'},
 {'text': '<user> i am bit confused coz chinese ppl can not access twitter thn how this ching chong using it i think he pakistani 🤔 🤔 🤔',
  'label': 'offensive'},
 {'text': 'this bitch in whataburger eating a burger with no top bun and holding it palm up i hate white bitches',
  'label': 'hatespeech'},
 {'text': 'laura loomer raped me while screaming at me in her disgusting kike language and said we must exterminate the goyim laura loomer loomergate',
  'label': 'hatespeech'},
 {'text': 'and this is why i end up with nigge

### Creating Data Frame

In [6]:
df = pd.DataFrame(data)

In [7]:
df

Unnamed: 0,text,label
0,i dont think im getting my baby them white 9 h...,normal
1,we cannot continue calling ourselves feminists...,normal
2,nawt yall niggers ignoring me,normal
3,<user> i am bit confused coz chinese ppl can n...,offensive
4,this bitch in whataburger eating a burger with...,hatespeech
...,...,...
20143,if ur still on twitter tell carlton i said his...,offensive
20144,when i first got on here and said i hate trump...,offensive
20145,was macht der moslem wenn der zion gegen seine...,normal
20146,it is awful look at world demographics asians ...,hatespeech


In [8]:
label_mapping = {'hatespeech': 2, 'offensive': 1, 'normal': 0}  
df['label'] = df['label'].map(label_mapping)

In [9]:
df

Unnamed: 0,text,label
0,i dont think im getting my baby them white 9 h...,0
1,we cannot continue calling ourselves feminists...,0
2,nawt yall niggers ignoring me,0
3,<user> i am bit confused coz chinese ppl can n...,1
4,this bitch in whataburger eating a burger with...,2
...,...,...
20143,if ur still on twitter tell carlton i said his...,1
20144,when i first got on here and said i hate trump...,1
20145,was macht der moslem wenn der zion gegen seine...,0
20146,it is awful look at world demographics asians ...,2


### Spliting Data Frame 

In [10]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

### Tokenization

In [12]:
import torch

In [13]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

In [14]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [15]:
def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True)

In [16]:
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/16118 [00:00<?, ? examples/s]

Map:   0%|          | 0/4030 [00:00<?, ? examples/s]

In [17]:
tokenized_train 

Dataset({
    features: ['text', 'label', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 16118
})

###  BERT model

In [18]:
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=3e-5,  
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs = 1, 
    weight_decay=0.01,
)



In [20]:
metric = load_metric("accuracy", trust_remote_code=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy", trust_remote_code=True)


In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    compute_metrics=compute_metrics,
)

In [23]:
trainer.train()

  0%|          | 0/1008 [00:00<?, ?it/s]

KeyboardInterrupt: 