In [None]:
%pip install datasets
from datasets import load_dataset

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
     -------------------------------------- 491.5/491.5 kB 2.6 MB/s eta 0:00:00
Collecting filelock
  Using cached filelock-3.18.0-py3-none-any.whl (16 kB)
Collecting pyarrow>=15.0.0
  Downloading pyarrow-20.0.0-cp311-cp311-win_amd64.whl (25.8 MB)
     ---------------------------------------- 25.8/25.8 MB 5.1 MB/s eta 0:00:00
Collecting dill<0.3.9,>=0.3.0
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
     -------------------------------------- 116.3/116.3 kB 6.6 MB/s eta 0:00:00
Collecting tqdm>=4.66.3
  Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Collecting xxhash
  Downloading xxhash-3.5.0-cp311-cp311-win_amd64.whl (30 kB)
Collecting multiprocess<0.70.17
  Downloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
     -------------------------------------- 143.5/143.5 kB 4.3 MB/s eta 0:00:00
Collecting fsspec[http]<=2025.3.0,>=2023.1.0
  Downloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
     


[notice] A new release of pip available: 22.3.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [182]:
import pandas as pd
from datasets import load_dataset

# Build the dataset from the local loader script.
# NOTE(review): trust_remote_code=True lets the loader script run as code —
# only use it with a script you trust.
dataset = load_dataset('hatexplain_loader.py', trust_remote_code=True)

# Write every split to its own CSV file.
for split, split_data in dataset.items():
    split_data.to_pandas().to_csv(f'hatexplain_{split}.csv', index=False)

# Read the three exported splits back from disk (nested values return as text).
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        'hatexplain_train.csv',
        'hatexplain_validation.csv',
        'hatexplain_test.csv',
    )
)

# Sanity check: row/column counts per split.
print("Train shape:", train_df.shape)
print("Validation shape:", val_df.shape)
print("Test shape:", test_df.shape)

train_df.head(2)

Train shape: (15383, 4)
Validation shape: (1922, 4)
Test shape: (1924, 4)


Unnamed: 0,id,annotators,rationales,post_tokens
0,23107796_gab,"{'label': array([0, 2, 2]), 'annotator_id': ar...","[array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,...",['u' 'really' 'think' 'i' 'would' 'not' 'have'...
1,9995600_gab,"{'label': array([2, 2, 0]), 'annotator_id': ar...","[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...",['the' 'uk' 'has' 'threatened' 'to' 'return' '...


In [183]:
import pandas as pd
import ast
import numpy as np
import re

# --- 1. Extract label list from 'annotators' using regex ---
def extract_labels_regex(text):
    """Extract the annotator label ids from a stringified ``annotators`` cell.

    The cell is expected to contain a fragment of the form
    ``'label': array([0, 2, 2])``; the integers inside the brackets are
    returned in order of appearance.

    Args:
        text: Raw cell value. Anything that is not a ``str`` (for example a
            missing value) is treated as "no labels".

    Returns:
        list[int]: The labels, or ``[]`` when the input is not a string or the
        ``label`` array fragment is not present.
    """
    if not isinstance(text, str):
        return []

    match = re.search(r"label'\s*:\s*array\(\[([0-9,\s]+)\]", text)
    if match is None:
        return []

    # Collect digit runs instead of splitting on ',' so that a trailing comma
    # or a wrapped line cannot produce an empty piece that would discard every
    # label of the row.
    return [int(digits) for digits in re.findall(r"[0-9]+", match.group(1))]

def fix_parse_and_join_to_string(raw):
    """Turn a stringified token array into one space-separated sentence.

    The input looks like ``['u' 'really' 'think']``: quoted tokens separated
    only by whitespace. Commas are inserted between the tokens so the text
    becomes a valid Python list literal, which is then parsed and joined.

    Args:
        raw: The stringified token array. Non-string values are reported and
            yield an empty string.

    Returns:
        str: The tokens joined by single spaces, or ``""`` when the value
        cannot be parsed (a ``Parsing error: ...`` line is printed).
    """
    if not isinstance(raw, str):
        print(f"Parsing error: expected str, got {type(raw).__name__}")
        return ""

    content = raw.strip()
    if content.startswith('[') and content.endswith(']'):
        content = content[1:-1].strip()

    # Insert commas between adjacent quoted tokens. Both quote styles must be
    # handled: a token containing an apostrophe is rendered with double quotes
    # ("don't"), and without a comma Python would silently concatenate the
    # adjacent string literals into a single token.
    fixed_content = re.sub(r"""(?<=['"])\s+(?=['"])""", ", ", content)
    fixed_list_str = f"[{fixed_content}]"

    try:
        tokens = ast.literal_eval(fixed_list_str)
        if not isinstance(tokens, list):
            return ""
        return " ".join(tokens)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # Best effort: report the malformed row and keep processing the rest.
        print(f"Parsing error: {e}")
        return ""


def label_distribution(labels, num_classes=3):
    """Count how many times each class id occurs in ``labels``.

    Args:
        labels: Iterable of integer class ids.
        num_classes: Number of classes; ids outside ``[0, num_classes)`` are
            ignored.

    Returns:
        list[int]: ``num_classes`` counts, where position ``k`` holds the
        number of occurrences of class ``k``.
    """
    histogram = [0 for _ in range(num_classes)]
    for class_id in labels:
        if not (0 <= class_id < num_classes):
            continue
        histogram[class_id] += 1
    return histogram

def custom_normalize_label_distribution(counts):
    """Convert per-class counts into proportions rounded to two decimals.

    Only strictly positive entries take part in the normalisation; every other
    position stays at 0. Because of the rounding, the result does not have to
    sum to exactly 1 (three equal counts give 0.33 each).

    Args:
        counts: Sequence of per-class counts.

    Returns:
        list: One float per input position. All zeros when no entry is
        positive.
    """
    values = np.array(counts, dtype=float)
    positive = values > 0
    denominator = values[positive].sum()

    proportions = np.zeros_like(values)
    if denominator != 0:
        proportions[positive] = values[positive] / denominator
        proportions = np.round(proportions, 2)
    return list(proportions)

In [184]:
# Change Formatter
def changeFormat(df):
    """Reduce a raw split to the two columns ``text`` and ``norm_dist``.

    ``norm_dist`` is the normalised label distribution derived from the
    ``annotators`` column, formatted as a string such as
    ``"[0.33, 0.00, 0.67]"``. ``text`` is the ``post_tokens`` column joined
    into a single string.

    The input frame is not modified: intermediate values are kept in local
    Series rather than written back as extra columns, so calling this function
    has no side effect on the caller's data.

    Args:
        df: DataFrame with ``annotators`` and ``post_tokens`` columns.

    Returns:
        pandas.DataFrame: New frame with columns ``text`` and ``norm_dist``,
        sharing the index of ``df``.

    Raises:
        KeyError: If ``annotators`` or ``post_tokens`` is missing from ``df``.
    """
    labels = df['annotators'].apply(extract_labels_regex)
    count_dist = labels.apply(label_distribution)
    norm_dist = count_dist.apply(custom_normalize_label_distribution)
    # Fixed two-decimal rendering keeps the exported string stable.
    norm_dist = norm_dist.apply(lambda x: "[" + ", ".join(f"{v:.2f}" for v in x) + "]")
    text = df['post_tokens'].apply(fix_parse_and_join_to_string)
    return pd.DataFrame({'text': text, 'norm_dist': norm_dist}, index=df.index)

# Convert every split to the (text, norm_dist) layout. The names are rebound to
# the converted frames, so this cell expects the raw frames as its input.
train_df, val_df, test_df = [
    changeFormat(split_frame) for split_frame in (train_df, val_df, test_df)
]

In [185]:
# Persist the converted splits as ';'-separated CSV files. These paths are the
# same ones the raw export used, so the raw files are replaced.
processed_outputs = {
    'hatexplain_train.csv': train_df,
    'hatexplain_validation.csv': val_df,
    'hatexplain_test.csv': test_df,
}
for csv_path, split_frame in processed_outputs.items():
    split_frame.to_csv(csv_path, sep=';', index=False)