In [2]:
# Use a pipeline as a high-level helper
from transformers import pipeline

import torch

# Run on the first CUDA GPU when one is present; otherwise fall back to the CPU
# (device=-1) so this cell does not fail on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1

# Text-classification pipeline backed by the HateXplain BERT checkpoint.
classifier = pipeline(
    "text-classification",
    model="Hate-speech-CNERG/bert-base-uncased-hatexplain",
    device=device,
)

Extract the data from the JSON file

In [10]:
import json

# Path of the input dataset; it is passed to extract_data() further down in this cell.
json_file = './data/dataset.json'

def extract_data(file):
    """Read a JSON dataset and derive sentences plus binary abuse flags.

    The file must contain a JSON object; each value is treated as one entry.

    For every entry:
      * the tokens under 'post_tokens' are joined with single spaces into a
        sentence; when the key is absent the sentence is " " and a warning is
        printed;
      * the 'label' of each item under 'annotators' is collected; the entry is
        flagged 1 (abusive) when at least two of those labels differ from
        "normal", otherwise 0; when the key is absent the flag is 0, the
        label list is empty and a warning is printed.

    Args:
        file: Path of the JSON file to read (opened as UTF-8).

    Returns:
        A tuple (sentences, abuse_flags, annotated_labels) of three lists, all
        in the iteration order of the JSON object.
    """
    with open(file, 'r', encoding='utf-8') as handle:
        records = json.load(handle)

    sentences, abuse_flags, annotated_labels = [], [], []

    for key, entry in records.items():
        # Sentence text (placeholder when the tokens are missing).
        if 'post_tokens' not in entry:
            text = " "
            print(f"Warning: Entry {key} is missing 'post_tokens' key")
        else:
            text = " ".join(entry['post_tokens'])

        # Annotator votes and the derived binary flag.
        if 'annotators' not in entry:
            votes = []
            flag = 0  # Default to normal if 'annotators' key is missing
            print(f"Warning: Entry {key} is missing 'annotators' key")
        else:
            votes = [annotator['label'] for annotator in entry['annotators']]
            non_normal_votes = len([vote for vote in votes if vote != "normal"])
            flag = 1 if non_normal_votes >= 2 else 0

        sentences.append(text)
        abuse_flags.append(flag)
        annotated_labels.append(votes)

    return sentences, abuse_flags, annotated_labels

# extract_data returns (sentences, binary abuse flags, raw per-annotator labels).
# The binary flags are bound to `annotated_labels` because the scoring cell
# compares the predictions against that name.
original_sentences, annotated_labels, annotated_labels_original = extract_data(json_file)  # length = 20148

file_path = "./data/annotated_labels_original.json"

# Use a dedicated handle name: binding the handle to `json_file` would replace
# the dataset path held in that variable with a closed file object.
with open(file_path, 'w', encoding='utf-8') as labels_file:
    json.dump(annotated_labels_original, labels_file)

Run the pipeline to get predictions, then extract the classified labels

In [5]:
# Classify every sentence in batches of 512. truncation/max_length clip any post
# longer than 512 tokens (the position limit of BERT-base encoders) so that one
# over-long input cannot abort the whole run; shorter posts are unaffected.
classified_result = classifier(
    original_sentences,
    batch_size=512,
    truncation=True,
    max_length=512,
)



In [7]:
# from tqdm.auto import tqdm

def extract_labels(result_labeled):
    """Turn classifier predictions into binary flags.

    Args:
        result_labeled: Iterable of prediction mappings, each with a 'label' key.

    Returns:
        A list with one int per prediction: 0 when the label is 'normal',
        1 for any other label.
    """
    return [
        0 if prediction['label'] == 'normal' else 1
        for prediction in result_labeled
    ]

# Collapse the pipeline predictions into binary flags (0 = normal, 1 = anything else).
classified_labels = extract_labels(classified_result)

# Preview the first ten raw predictions, then their binary flags.
for preview_source in (classified_result, classified_labels):
    print(preview_source[:10])

# Persist the binary flags as indented JSON.
with open("./data/classified_labels.json", "w") as binary_labels_file:
    json.dump(classified_labels, binary_labels_file, indent=4)

[{'label': 'normal', 'score': 0.7708712220191956}, {'label': 'normal', 'score': 0.637093186378479}, {'label': 'hate speech', 'score': 0.6612201929092407}, {'label': 'hate speech', 'score': 0.6242997050285339}, {'label': 'offensive', 'score': 0.667515754699707}, {'label': 'hate speech', 'score': 0.920625627040863}, {'label': 'hate speech', 'score': 0.7545786499977112}, {'label': 'hate speech', 'score': 0.6379234194755554}, {'label': 'hate speech', 'score': 0.8680278658866882}, {'label': 'hate speech', 'score': 0.8520246744155884}]
[0, 0, 1, 1, 1, 1, 1, 1, 1, 1]


Recompute the classified labels using three classes (hate speech, normal, offensive)

In [8]:
# from tqdm.auto import tqdm

def extract_labels(result_labeled):
    """Turn classifier predictions into three-class integer codes.

    Note: this redefines the binary `extract_labels` from the earlier cell, so
    after this cell runs the name refers to the three-class version.

    Args:
        result_labeled: Iterable of prediction mappings, each with a 'label' key.

    Returns:
        A list with one int per prediction: 1 for 'normal', 0 for
        'hate speech', 2 for 'offensive', and -1 for any other label.
    """
    label_codes = {
        'normal': 1,
        'hate speech': 0,
        'offensive': 2,
    }
    unrecognized = -1  # cannot be recognized

    return [
        label_codes.get(prediction['label'], unrecognized)
        for prediction in result_labeled
    ]

# Map the pipeline predictions to three-class codes
# (1 = normal, 0 = hate speech, 2 = offensive, -1 = unrecognized label).
classified_labels_3_labels = extract_labels(classified_result)

# Preview the first ten raw predictions, then their three-class codes.
for preview_source in (classified_result, classified_labels_3_labels):
    print(preview_source[:10])

# Persist the three-class codes as indented JSON.
with open("./data/classified_labels_three_labels.json", "w") as three_class_file:
    json.dump(classified_labels_3_labels, three_class_file, indent=4)

[{'label': 'normal', 'score': 0.7708712220191956}, {'label': 'normal', 'score': 0.637093186378479}, {'label': 'hate speech', 'score': 0.6612201929092407}, {'label': 'hate speech', 'score': 0.6242997050285339}, {'label': 'offensive', 'score': 0.667515754699707}, {'label': 'hate speech', 'score': 0.920625627040863}, {'label': 'hate speech', 'score': 0.7545786499977112}, {'label': 'hate speech', 'score': 0.6379234194755554}, {'label': 'hate speech', 'score': 0.8680278658866882}, {'label': 'hate speech', 'score': 0.8520246744155884}]
[1, 1, 0, 0, 2, 0, 0, 0, 0, 0]


  Calculate the scores

In [5]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Score the binary predictions against the annotator-derived binary flags.
# average='binary' reports the metrics for the positive class only.
binary_metrics = (precision_score, recall_score, f1_score)
precision, recall, f1 = (
    metric(annotated_labels, classified_labels, average='binary')
    for metric in binary_metrics
)

print(f"Precision: {precision}, Recall: {recall}, F1 Score: {f1}")

Precision: 0.8411602209944752, Recall: 0.8393870601589103, F1 Score: 0.84027270513757
