### Cleaning the HateXplain Dataset
The datasets are already cleaned and stored under Task1_Outputs. <br>
The folder contains: 
1) Binary Dataset folder for instances trained on the binary dataset <br>
2) Ternary Dataset folder for instances trained on the ternary dataset

Each folder contains two subfolders: one for instances with topic covariates and one for instances without.

In [None]:
from datasets import load_dataset
from datasets import concatenate_datasets
import nltk
import numpy as np

In [None]:
# Fetch all splits of the HateXplain dataset through the Hugging Face `datasets` library
hf_dataset = load_dataset('hatexplain', 'plain_text')

# The "train" split is used unchanged as the training data
hf_train_dataset = hf_dataset["train"]

# The "test" and "validation" splits are merged (in that order) into one held-out set
hf_test_dataset = concatenate_datasets(
    [hf_dataset[split_name] for split_name in ("test", "validation")]
)

# Leave the train split as the last expression so the notebook displays its features
hf_train_dataset

In [None]:
# Convert the raw (not yet renamed) train split to a pandas DataFrame for inspection
df = hf_train_dataset.to_pandas()

In [None]:
# Inspect the annotator rationales stored for row 27
print(df.loc[27, "rationales"])

In [None]:
# Among the first 100 rows, print the index of every row whose first three
# annotator labels are all equal to 1
for i in range(100):
    if all(df.loc[i, "annotators"]["label"][slot] == 1 for slot in range(3)):
        print(i)

In [None]:
# Look up the "label" entry of row 45
# NOTE(review): `df` is built from the raw train split, and the cells above read
# its annotations through the "annotators" column; no "label" column is created
# on `df` anywhere visible in this notebook — confirm this lookup works as intended.
df.loc[45, "label"]

In [None]:
why should you ban immigrants from operating small business every business start somewhere and tomorrow that will give people job

In [None]:
# Majority vote: pick the label chosen by the largest number of annotators
def most_common_label(numpy_list):
    """Return the most frequent value in a sequence of non-negative integer labels.

    Ties go to the smallest label, because ``argmax`` reports the first
    position that holds the maximum count.
    """
    votes_per_label = np.bincount(numpy_list)
    return votes_per_label.argmax()

# Collapse ternary labels (hateful, normal, offensive) into binary ones (normal, toxic)
def convert_ternary_list_to_binary(labels_ternary_list):
    """Return a new list with every ternary label mapped to a binary label.

    Labels 0 and 2 become 1 (toxic); every other value becomes 0 (normal).
    The input sequence is not modified.
    """
    return [
        1 if (label == 0 or label == 2) else 0
        for label in labels_ternary_list
    ]

# Collect the tokens that at least two annotators highlighted in their rationales
def identify_aspects(aspects_list, text):
    """Return the tokens of ``text`` selected by two or more rationale masks.

    ``aspects_list`` holds one mask per annotator rationale; every mask is
    indexed in parallel with ``text`` and a truthy entry marks a selected
    token. Only the two-mask and three-mask cases are handled: any other
    number of masks yields an empty list.
    """
    mask_count = len(aspects_list)

    if mask_count == 2:
        # Two rationales: a token needs both votes
        first, second = aspects_list[0], aspects_list[1]
        return [
            text[pos]
            for pos in range(len(text))
            if first[pos] and second[pos]
        ]

    if mask_count == 3:
        # Three rationales: any pair of votes is enough
        first, second, third = aspects_list[0], aspects_list[1], aspects_list[2]
        return [
            text[pos]
            for pos in range(len(text))
            if (first[pos] and (second[pos] or third[pos]))
            or (second[pos] and third[pos])
        ]

    return []

# Spread the extracted aspects of each document over a fixed number of aspect columns
def add_aspects_columns(pandas_dataset, num_col=4):
    """Add ``aspect_1`` .. ``aspect_<num_col>`` columns to ``pandas_dataset`` in place.

    Args:
        pandas_dataset: Table with an "aspects" column whose entries are
            sequences of aspect tokens (one sequence per document).
        num_col: Maximum number of aspects kept per document. Defaults to 4,
            the value that used to be hard-coded.

    Column ``aspect_<k>`` receives the k-th aspect of every document, or an
    empty string when the document has fewer than k aspects. Aspects beyond
    ``num_col`` are not copied into any column. Returns None.
    """
    aspect_list = pandas_dataset["aspects"]
    for position in range(num_col):
        # Pad with "" so every document gets a value in every aspect column
        pandas_dataset["aspect_" + str(position + 1)] = [
            aspects[position] if position < len(aspects) else ""
            for aspects in aspect_list
        ]

# Shared cleaning pipeline behind prepare_binary_dataset / prepare_ternary_dataset
def _prepare_dataset(pandas_dataset, label_from_votes):
    """Clean ``pandas_dataset`` in place; only the label reduction differs per task.

    Args:
        pandas_dataset: DataFrame with a "label" column (mapping whose "label"
            entry holds the annotators' votes), an "aspects" column (rationale
            masks) and a "text" column (sequence of tokens).
        label_from_votes: Callable turning the annotators' votes of one row
            into the single label stored for that row.

    The new values are computed row by row but written back as whole columns.
    Writing a list into a single cell with ``.loc[i, col] = [...]`` is slow
    and can be rejected by pandas, which may try to spread the list over
    several cells instead of storing it as one nested value.
    """
    labels = []
    aspects = []
    texts = []
    for annotation, rationales, tokens in zip(
        pandas_dataset["label"], pandas_dataset["aspects"], pandas_dataset["text"]
    ):
        labels.append(label_from_votes(annotation["label"]))
        # Aspects are the tokens selected by at least two annotators
        aspects.append(identify_aspects(rationales, tokens))
        # Keep only purely alphabetic tokens: this drops emojis and <user>
        # placeholders, and also any token containing digits or punctuation
        texts.append(" ".join(token for token in tokens if token.isalpha()))

    # IDs are the running row numbers 0..n-1
    pandas_dataset["id"] = list(range(len(pandas_dataset)))
    pandas_dataset["label"] = labels
    pandas_dataset["aspects"] = aspects
    pandas_dataset["text"] = texts
    # Add new columns to the dataset with the aspects
    add_aspects_columns(pandas_dataset)


# Prepare the binary dataset (convert ternary labels to binary, extract aspects, remove emojis & user tokens)
def prepare_binary_dataset(pandas_dataset):
    """Clean ``pandas_dataset`` in place using binary (normal / toxic) labels.

    Each annotator vote is first mapped to binary, then the majority of the
    binary votes becomes the row label. Returns None.
    """
    _prepare_dataset(
        pandas_dataset,
        lambda votes: most_common_label(convert_ternary_list_to_binary(votes)),
    )


# Prepare the ternary dataset (majority label, extract aspects, remove emojis & user tokens)
def prepare_ternary_dataset(pandas_dataset):
    """Clean ``pandas_dataset`` in place, keeping the ternary labels.

    The majority of the annotators' ternary votes becomes the row label.
    Returns None.
    """
    _prepare_dataset(pandas_dataset, most_common_label)

In [None]:
# Change the dataset column headings (train)
# `rename_column` returns a new dataset, so the three renames are chained.
# The in-place `rename_column_` variant is deprecated and no longer available
# in newer `datasets` releases.
new_train_dataset = (
    hf_train_dataset
    .rename_column("annotators", "label")
    .rename_column("post_tokens", "text")
    .rename_column("rationales", "aspects")
)

# Change the dataset column headings (test)
new_test_dataset = (
    hf_test_dataset
    .rename_column("annotators", "label")
    .rename_column("post_tokens", "text")
    .rename_column("rationales", "aspects")
)

In [None]:
# Convert the renamed train / test datasets to pandas DataFrames
binary_train_dataset = new_train_dataset.to_pandas()
binary_test_dataset = new_test_dataset.to_pandas()

# Prepare the binary dataset (the frames are modified in place; nothing is returned)
prepare_binary_dataset(binary_train_dataset)
prepare_binary_dataset(binary_test_dataset)

In [None]:
# Convert the renamed train / test datasets to pandas DataFrames again, so the
# ternary preparation starts from the Hugging Face datasets rather than from
# the frames already processed for the binary task
ternary_train_dataset = new_train_dataset.to_pandas()
ternary_test_dataset = new_test_dataset.to_pandas()

# Prepare the ternary dataset (the frames are modified in place; nothing is returned)
prepare_ternary_dataset(ternary_train_dataset)
prepare_ternary_dataset(ternary_test_dataset)

In [None]:
# Save the binary train / test sets as JSON Lines (one record per line)
binary_train_dataset.to_json("Task1_Outputs/Binary_Dataset/train.jsonlist", orient="records", lines=True)
binary_test_dataset.to_json("Task1_Outputs/Binary_Dataset/test.jsonlist", orient="records", lines=True)

In [None]:
# Save the ternary train / test sets as JSON Lines (one record per line)
ternary_train_dataset.to_json("Task1_Outputs/Ternary_Dataset/train.jsonlist", orient="records", lines=True)
ternary_test_dataset.to_json("Task1_Outputs/Ternary_Dataset/test.jsonlist", orient="records", lines=True)