In [91]:
import pandas as pd
import numpy as np
import os
import re
import emoji

from sklearn.model_selection import train_test_split

# Data locations, relative to the notebook's working directory:
# inputs are looked up under `raw_path`, outputs are written under `processed_path`.
raw_path, processed_path = '../data/raw/', '../data/processed'

In [92]:
# One sub-folder of the raw data directory per dataset.
ishate_path, ihc_path = (
    os.path.join(raw_path, dataset) for dataset in ('ishate', 'ihc')
)

## ISHate

In [93]:
def none_to_str(val):
    """Map a missing label to "Not HS"; return any other label unchanged.

    Args:
        val: A raw label value, or a missing/empty value.

    Returns:
        "Not HS" when `val` is falsy (None, empty string, ...) or a float
        NaN; otherwise `val` itself.
    """
    # NaN is truthy, so `not val` alone would let a NaN missing-value marker
    # through unlabeled. NaN is the only float that is unequal to itself.
    if not val or (isinstance(val, float) and val != val):
        return "Not HS"

    return val

In [94]:
def encode_labels(val):
    """Translate a label name into its integer class id.

    "Not HS" -> 0, "Explicit HS" -> 1, "Implicit HS" -> 2.
    Any other value yields None.
    """
    # The position of each name in this tuple is its class id.
    for class_id, name in enumerate(("Not HS", "Explicit HS", "Implicit HS")):
        if val == name:
            return class_id
    return None

In [95]:
def clean_text(text):
    """Normalize a post: mask URLs and mentions, unwrap hashtags, spell out emoji.

    Steps, in order:
      1. Drop the '#' of hashtags, keeping the tag text.
      2. Replace each URL (a run of non-whitespace starting with 'http',
         which also covers 'https') with the placeholder 'URL'.
      3. Replace each '@...' mention with the placeholder '@USER'.
      4. Convert emoji to ':shortcode:' form and pad each shortcode with spaces.
      5. Collapse runs of spaces and lowercase the result, so the
         placeholders end up as 'url' and '@user'.

    Args:
        text: The raw post as a string.

    Returns:
        The cleaned, lowercased string.
    """
    cleaned = text
    cleaned = re.sub(r'#([^ ]*)', r'\1', cleaned)
    # Stop at the first whitespace. A greedy `.*` here would run to the end
    # of the line and delete every word that follows the first URL.
    cleaned = re.sub(r'http\S+', 'URL', cleaned)
    cleaned = re.sub(r'@([^ ]*)', '@USER', cleaned)
    cleaned = emoji.demojize(cleaned)
    # Only pad whitespace-free ':name:' tokens so ordinary prose that merely
    # contains two colons is left alone.
    # NOTE(review): assumes demojize shortcodes never contain whitespace or ':' - confirm.
    cleaned = re.sub(r'(:[^\s:]+:)', r' \1 ', cleaned)
    cleaned = re.sub(' +', ' ', cleaned)
    cleaned = cleaned.lower()

    return cleaned

In [96]:
# Convert every ISHate split from parquet to CSV with the columns
# (id, text, cleaned_text, label_name, label).
# `start`/`stop` carry a running row offset so ids stay unique across splits.
start, stop = 0, 0

splits = ['train', 'val', 'test']

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(os.path.join(processed_path, 'ishate'), exist_ok=True)

for split in splits:

    df_path = os.path.join(ishate_path, f'ishate_{split}.parquet.gzip')
    df = pd.read_parquet(df_path)

    # Ids continue where the previous split stopped.
    stop += df.shape[0]
    df['id'] = range(start, stop)
    start += df.shape[0]

    # A missing/empty `implicit_layer` value becomes the "Not HS" label.
    df['label_name'] = df['implicit_layer'].apply(none_to_str)
    df['label'] = df['label_name'].apply(encode_labels)
    df['cleaned_text'] = df['text'].apply(clean_text)
    df = df[['id', 'text', 'cleaned_text', 'label_name', 'label']]

    save_path = f'ishate/ishate_{split}.csv'
    print(save_path)

    df.to_csv(os.path.join(processed_path, save_path), index=False)
    # Summary of the id column (count/min/max) as a check on the id range.
    print(df['id'].describe())

ishate/ishate_train.csv
count    20381.000000
mean     10190.000000
std       5883.632254
min          0.000000
25%       5095.000000
50%      10190.000000
75%      15285.000000
max      20380.000000
Name: id, dtype: float64
ishate/ishate_val.csv
count     4367.000000
mean     22564.000000
std       1260.788642
min      20381.000000
25%      21472.500000
50%      22564.000000
75%      23655.500000
max      24747.000000
Name: id, dtype: float64
ishate/ishate_test.csv
count     4368.000000
mean     26931.500000
std       1261.077317
min      24748.000000
25%      25839.750000
50%      26931.500000
75%      28023.250000
max      29115.000000
Name: id, dtype: float64


## Implicit Hate Corpus

In [97]:
def normalize_labels(val):
    """Rename an Implicit Hate Corpus class to the shared label name.

    "not_hate" -> "Not HS", "implicit_hate" -> "Implicit HS",
    "explicit_hate" -> "Explicit HS". Any other value yields None.
    """
    renames = (
        ("not_hate", "Not HS"),
        ("implicit_hate", "Implicit HS"),
        ("explicit_hate", "Explicit HS"),
    )
    for source_name, label_name in renames:
        if val == source_name:
            return label_name
    return None

In [98]:
# Load the Implicit Hate Corpus stage-1 posts (tab-separated file).
ihc = pd.read_csv(
    os.path.join(ihc_path, 'implicit_hate_v1_stg1_posts.tsv'),
    sep='\t',
)

print(ihc.shape)

# Derive label_name / label from the `class` column and copy `post` to `text`.
ihc['label_name'] = ihc['class'].apply(normalize_labels)
ihc['label'] = ihc['label_name'].apply(encode_labels)
ihc['text'] = ihc['post']

# Stratified split on `label`: hold out 30% of the rows, then cut the
# hold-out in half to obtain the validation and test sets.
train, val_test = train_test_split(
    ihc,
    test_size=0.3,
    stratify=ihc['label'],
    shuffle=True,
    random_state=42,
)

val, test = train_test_split(
    val_test,
    test_size=0.5,
    stratify=val_test['label'],
    shuffle=True,
    random_state=42,
)

(21480, 2)


In [99]:
# Label counts per split, one row each in train / val / test order.
pd.DataFrame(
    [part['label'].value_counts() for part in (train, val, test)]
)

label,0,2,1
count,9304,4970,762
count,1994,1065,163
count,1993,1065,164


In [100]:
# Write every IHC split to CSV with the columns
# (id, text, cleaned_text, label_name, label).
# `start`/`stop` carry a running row offset so ids stay unique across splits.
start, stop = 0, 0

splits = ['train', 'val', 'test']

# Explicit name -> frame lookup; avoids eval() on the split name.
split_frames = {'train': train, 'val': val, 'test': test}

# to_csv does not create missing folders, so make sure the target exists first.
os.makedirs(os.path.join(processed_path, 'ihc'), exist_ok=True)

for split in splits:

    # Same object as train/val/test, so the `id` and `cleaned_text` columns
    # added below also appear on those frames.
    df = split_frames[split]
    print(df.shape)

    # Ids continue where the previous split stopped.
    stop += df.shape[0]
    df['id'] = range(start, stop)
    start += df.shape[0]

    df['cleaned_text'] = df['text'].apply(clean_text)
    df = df[['id', 'text', 'cleaned_text', 'label_name', 'label']]

    save_path = f'ihc/ihc_{split}.csv'
    print(save_path)

    df.to_csv(os.path.join(processed_path, save_path), index=False)
    # Summary of the id column (count/min/max) as a check on the id range.
    print(df['id'].describe())

(15036, 5)
ihc/ihc_train.csv
count    15036.000000
mean      7517.500000
std       4340.663659
min          0.000000
25%       3758.750000
50%       7517.500000
75%      11276.250000
max      15035.000000
Name: id, dtype: float64
(3222, 5)
ihc/ihc_val.csv
count     3222.00000
mean     16646.50000
std        930.25561
min      15036.00000
25%      15841.25000
50%      16646.50000
75%      17451.75000
max      18257.00000
Name: id, dtype: float64
(3222, 5)
ihc/ihc_test.csv
count     3222.00000
mean     19868.50000
std        930.25561
min      18258.00000
25%      19063.25000
50%      19868.50000
75%      20673.75000
max      21479.00000
Name: id, dtype: float64
