In [None]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader

In [121]:
df = pd.read_parquet('./data/train-data.parquet')
df_trump = pd.read_json('./data/tweets.json')
display(df.head())
display(df_trump.head())

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


Unnamed: 0,date,favorites,id,isRetweet,retweets,text
0,2024-11-04 10:20:38,3983,113425359755307360,False,1554,<p></p>
1,2024-11-04 10:20:27,2527,113425359034120560,False,713,"<p>Join me live in Raleigh, North Carolina at ..."
2,2024-11-04 09:56:53,4816,113425266314029296,False,1329,"<p><a href=""https://links.truthsocial.com/link..."
3,2024-11-04 09:36:27,6672,113425185981054000,False,2232,<p>MAKE AMERICA GREAT AGAIN!</p>
4,2024-11-04 09:17:13,5338,113425110394877856,False,2052,<p>“AMERICA IS BEING INVADED”</p>


In [None]:
# map all columns with true/false labels to 1 for true and 0 for false
def map_labels(df):
    for col in df.columns:
        if df[col].dtype == 'bool':
            df[col] = df[col].astype(int)
        elif np.issubdtype(df[col].dtype, np.floating):
            df[col] = (df[col] > 2).astype(int)
    return df

# drop all columns with annotator in the name
df = df.loc[:, ~df.columns.str.contains('annotator')]
df = df.drop(columns=["infitms", "outfitms", "std_err", "hypothesis", "platform", "hate_speech_score"])
df = map_labels(df)
# train test split
df_train, df_test = train_test_split(df, test_size=0.4, random_state=42)
df_test, df_dev = train_test_split(df_test, test_size=0.5, random_state=42)
# df_extra, df_train = train_test_split(df, test_size=0.01, random_state=42)
# df_train, df_test = train_test_split(df_train, test_size=0.4, random_state=42)
# df_test, df_dev = train_test_split(df_test, test_size=0.5, random_state=42)
df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)

dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train),
    "test": Dataset.from_pandas(df_test),
    "dev": Dataset.from_pandas(df_dev)
})

In [123]:
dataset

DatasetDict({
    train: Dataset({
        features: ['comment_id', 'sentiment', 'respect', 'insult', 'humiliate', 'status', 'dehumanize', 'violence', 'genocide', 'attack_defend', 'hatespeech', 'text', 'target_race_asian', 'target_race_black', 'target_race_latinx', 'target_race_middle_eastern', 'target_race_native_american', 'target_race_pacific_islander', 'target_race_white', 'target_race_other', 'target_race', 'target_religion_atheist', 'target_religion_buddhist', 'target_religion_christian', 'target_religion_hindu', 'target_religion_jewish', 'target_religion_mormon', 'target_religion_muslim', 'target_religion_other', 'target_religion', 'target_origin_immigrant', 'target_origin_migrant_worker', 'target_origin_specific_country', 'target_origin_undocumented', 'target_origin_other', 'target_origin', 'target_gender_men', 'target_gender_non_binary', 'target_gender_transgender_men', 'target_gender_transgender_unspecified', 'target_gender_transgender_women', 'target_gender_women', 'target_g

In [124]:
labels = [label for label in dataset["train"].features.keys() if label not in ['comment_id', 'text']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels

['sentiment',
 'respect',
 'insult',
 'humiliate',
 'status',
 'dehumanize',
 'violence',
 'genocide',
 'attack_defend',
 'hatespeech',
 'target_race_asian',
 'target_race_black',
 'target_race_latinx',
 'target_race_middle_eastern',
 'target_race_native_american',
 'target_race_pacific_islander',
 'target_race_white',
 'target_race_other',
 'target_race',
 'target_religion_atheist',
 'target_religion_buddhist',
 'target_religion_christian',
 'target_religion_hindu',
 'target_religion_jewish',
 'target_religion_mormon',
 'target_religion_muslim',
 'target_religion_other',
 'target_religion',
 'target_origin_immigrant',
 'target_origin_migrant_worker',
 'target_origin_specific_country',
 'target_origin_undocumented',
 'target_origin_other',
 'target_origin',
 'target_gender_men',
 'target_gender_non_binary',
 'target_gender_transgender_men',
 'target_gender_transgender_unspecified',
 'target_gender_transgender_women',
 'target_gender_women',
 'target_gender_other',
 'target_gender',
 't

In [125]:
X_train = df_train["text"].reset_index()
y_train = df_train.drop(columns=["text"])
display(X_train.head())

Unnamed: 0,index,text
0,0,@ashoswai @globeandmail Muslims should be afra...
1,1,Fuck you fuck you you stupid cheating bitch go...
2,2,I'm a guy and I say this is bullshit. I'm a co...
3,3,31 Black Women Who Died In Police Custody URL
4,4,"I, a Catholic and a Jesuit, am grateful for th..."


In [126]:
max_length = X_train["text"].str.len().max()
print(f"Maximum length of text: {max_length}")

Maximum length of text: 581


In [127]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_data(examples):
  # take a batch of texts
  text = examples["text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=512)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding

In [128]:
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)

Map: 100%|██████████| 813/813 [00:00<00:00, 6003.68 examples/s]
Map: 100%|██████████| 271/271 [00:00<00:00, 5810.24 examples/s]
Map: 100%|██████████| 272/272 [00:00<00:00, 6658.83 examples/s]


In [129]:
example = encoded_dataset["train"][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [130]:
tokenizer.decode(example['input_ids'])

'[CLS] @ ashoswai @ globeandmail muslims should be afraid of any place becoming muslim majority than about india, looking at the condition in all muslim majority countries now. sunnis and shias could not reconcile in > 1300 years. how do you expect them to tolerate the hindus in india? # shiagenocide [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [P

In [131]:
example['labels']

[0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 1.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

In [132]:
[id2label[idx] for idx, label in enumerate(example['labels']) if label != 0]

['status',
 'dehumanize',
 'attack_defend',
 'target_religion_hindu',
 'target_religion_muslim',
 'target_religion',
 'target_origin_specific_country',
 'target_origin']

In [133]:
encoded_dataset.set_format("torch")

In [134]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           id2label=id2label,
                                                           label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [135]:
batch_size = 8
metric_name = "f1"

In [136]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)



In [138]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [139]:
encoded_dataset["train"][0]['labels'].type()

'torch.FloatTensor'

In [140]:
encoded_dataset["train"]['input_ids'][0]

tensor([  101,  1030,  6683,  2891, 21547,  1030,  7595,  5685, 21397,  7486,
         2323,  2022,  4452,  1997,  2151,  2173,  3352,  5152,  3484,  2084,
         2055,  2634,  1010,  2559,  2012,  1996,  4650,  1999,  2035,  5152,
         3484,  3032,  2085,  1012, 18883,  2015,  1998, 20474,  2015,  2071,
         2025, 21063,  1999,  1028, 19527,  2086,  1012,  2129,  2079,  2017,
         5987,  2068,  2000, 19242,  1996, 18221,  1999,  2634,  1029,  1001,
        20474,  6914, 10085,  5178,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [141]:
outputs = model(input_ids=encoded_dataset["train"]['input_ids'][0].unsqueeze(0), labels=encoded_dataset["train"][0]['labels'].unsqueeze(0))
outputs

SequenceClassifierOutput(loss=tensor(0.7002, grad_fn=<BinaryCrossEntropyWithLogitsBackward0>), logits=tensor([[ 0.4964,  0.1237,  0.7332,  0.4905,  0.4337, -0.2857,  0.2185, -0.3485,
         -0.4422, -0.5298,  0.5695,  0.0234,  0.3570,  0.1206, -0.6058, -0.3078,
         -0.4451, -0.0048, -0.6410, -0.0872, -0.2691,  0.2103,  0.2653, -0.6864,
         -0.3468, -0.0184,  0.2674, -0.7151, -0.4990,  0.1555,  0.2831, -0.0372,
          0.0360,  0.8311,  0.3228,  0.2725,  0.5254,  0.4875,  0.0399, -0.4417,
         -0.4516,  0.3269, -0.2376, -0.1142, -0.4359,  0.4867, -0.2368,  0.0204,
         -0.2572, -0.2442,  0.6405,  0.1258, -0.2876, -0.1532,  0.2366,  0.0225,
          0.2581,  0.0265,  0.1419, -0.1325, -0.3316, -0.1444, -0.3841]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [142]:
trainer = Trainer(
    model,
    args,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["dev"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [143]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.324192,0.535123,0.708906,0.0




KeyboardInterrupt: 

In [157]:
model_path = "bert-finetuned-hate-speech"

# Load tokenizer and model
trained_model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Put model in eval mode
trained_model.eval()

# Create a DataLoader for the test dataset
test_loader = DataLoader(encoded_dataset['test'], batch_size=8)

results_df = pd.DataFrame(columns=['Text', 'Predicted_Values', 'Actual_Values'])

# Iterate through the test dataset and make predictions
for batch in test_loader:
    input_ids = batch['input_ids']
    attention_mask = batch['attention_mask']
    labels = batch['labels'].int()

    decoded_texts = [tokenizer.decode(ids, skip_special_tokens=True) for ids in input_ids]

    with torch.no_grad():
        outputs = trained_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = torch.sigmoid(logits) > 0.5
        # Convert predictions to integers (1 or 0)
        predictions = predictions.int()

        batch_results = []
        for text, pred, actual in zip(decoded_texts, predictions.numpy(), labels.numpy()):
            batch_results.append({'Text': text, 'Predicted_Values': list(pred), 'Actual_Values': list(actual)})

        # Convert the list of results to a DataFrame
        batch_df = pd.DataFrame(batch_results)

        # Concatenate the batch DataFrame with the main results DataFrame
        results_df = pd.concat([results_df, batch_df], ignore_index=True)
    
display(results_df)

Unnamed: 0,Text,Predicted_Values,Actual_Values
0,@ vipulchavada _ @ cartelx4 @ cnnbrk go have s...,"[1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
1,"at 7 : 19 is why i hate people, women especial...","[1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, ..."
2,basirhat mp nusrat jahan stood beside the mu $...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,"headline reads, "" first generation multicultur...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
4,notwithstanding marriyum aurangzeb sahiba's po...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ..."
...,...,...,...
266,if you're gonna be a racist scumbag at least o...,"[1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, ..."
267,my masculine is awakening from my spiritual el...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
268,i don't know anyone who's pursuing socioeconom...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, ..."
269,wishing all of our muslim customers eid al - a...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
