In [39]:
import pandas as pd
from tqdm import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained("tum-nlp/bert-hateXplain")
model = AutoModelForSequenceClassification.from_pretrained("tum-nlp/bert-hateXplain")
hate_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

df = pd.read_csv("../mdatasci/compsci-760/project/supervision_test10_threads.csv")
df = df.rename(columns={"is_hate": "is_hate_legacy", "hate_label": "hate_label_legacy"})

Device set to use mps:0


In [None]:
def get_token_length(text: str) -> int:
    try:
        encoded_input_fixed = tokenizer(
            text, max_length=512, truncation=True, return_tensors="pt"
        )
        return encoded_input_fixed["input_ids"].shape[1]
    except Exception as e:
        print(text)
        print(e)
        return 0


df = df[df["body"].notnull()].copy()
df["token_len"] = df["body"].map(get_token_length)

512

In [None]:
def classify_text(text):
    clean_text = str(text).replace("\n", " ")
    result = hate_classifier(clean_text)
    return result[0]["label"], result[0]["score"]


valid_df = df[
    (df["body"] != "[removed]") & (df["body"] != "[deleted]") & (df["token_len"] < 512)
].copy()

print(f"Processing {len(valid_df)} posts.")

tqdm.pandas(desc="Classifying posts")

print(f"Shape before: {valid_df.shape}")

classification_results = valid_df["body"].progress_apply(classify_text)
valid_df["hate_label"] = [result[0] for result in classification_results]
valid_df["hate_score"] = [result[1] for result in classification_results]

print(f"Shape after: {valid_df.shape}")
print(valid_df["hate_label"].value_counts())
valid_df = valid_df.sort_values("hate_score")
valid_df.to_csv("supervision_test10_threads_hatexplain.csv")

Processing 918 posts.
Shape before: (918, 11)


Classifying posts: 100%|██████████| 918/918 [00:04<00:00, 227.31it/s]

Shape after: (918, 12)
hate_label
non-toxic    811
toxic        107
Name: count, dtype: int64



