In [1]:
# Import packages
import os
from datasets import load_dataset

## Datasets From Hugging Face

In [2]:
# Identifiers of the datasets to download from the Hugging Face Hub
# (catalogue: https://huggingface.co/datasets).
# 'hatexplain' is intentionally not listed here: it is exported by its own
# cell because it needs an extra preprocessing step.
dataset_names = [
    "Paul/hatecheck-french",
    "hate_speech18",
    "hate_speech_offensive",
    "tweets_hate_speech_detection",
    "limjiayi/hateful_memes_expanded",
    "classla/FRENK-hate-en",
    "ucberkeley-dlab/measuring-hate-speech",
]

# Directory that receives the exported files (relative to the notebook)
output_dir = "../data/text/raw/"

# Extension given to every exported file
extension = "tsv"

In [3]:
# Download every dataset listed in `dataset_names` and write each of its
# splits to `output_dir` as a tab-separated file named
# `<dataset>_<split>.<extension>`.

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the target directory exists before the first write (no-op when it already
# does). An empty `output_dir` means "current directory" and needs no creation.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

for dataset in dataset_names:
    print(f'Loading {dataset}...\n')
    # Loading dataset
    ds = load_dataset(dataset)
    # Make the splits return pandas objects when indexed
    ds.set_format(type='pandas')
    # Dataset ids may contain a path separator (e.g. "owner/name"); replace it
    # so the id can be used as a file name. The result is the same for every
    # split, hence it is computed once per dataset.
    name = dataset.replace('/', '_').replace('\\', '_')
    # Saving the different dataset splits
    for key in ds.keys():
        # Full slice materialises the whole split as a DataFrame
        df = ds[key][:]
        file_name = f'{name}_{key}.{extension}'
        file_path = os.path.join(output_dir, file_name)
        # Saving the data
        df.to_csv(file_path, sep='\t', index=False, encoding='utf-8')

Loading Paul/hatecheck-french...



Downloading readme:   0%|          | 0.00/3.49k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/980k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loading hate_speech18...



Downloading builder script:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.04k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.61k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.19M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10944 [00:00<?, ? examples/s]

Loading hate_speech_offensive...



Downloading builder script:   0%|          | 0.00/3.48k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/24783 [00:00<?, ? examples/s]

Loading tweets_hate_speech_detection...



Downloading builder script:   0%|          | 0.00/3.27k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.46k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.28M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/683k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/31962 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/17197 [00:00<?, ? examples/s]

Loading limjiayi/hateful_memes_expanded...



Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.99M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/59.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/120k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/224k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loading classla/FRENK-hate-en...



Downloading builder script:   0%|          | 0.00/4.83k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/4.83k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/789k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Loading ucberkeley-dlab/measuring-hate-speech...



Downloading readme:   0%|          | 0.00/4.03k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/14.1M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

### HateXplain requires specific preprocessing

In [4]:
# Export the 'hatexplain' dataset: same procedure as for the other datasets,
# plus a step that flattens the `post_tokens` column before writing.
dataset = 'hatexplain'
print(f'Loading {dataset}...\n')

# `DataFrame.to_csv` does not create missing parent directories, so make sure
# the target directory exists before writing (no-op when it already does).
# An empty `output_dir` means "current directory" and needs no creation.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Loading dataset
ds = load_dataset(dataset)
# Make the splits return pandas objects when indexed
ds.set_format(type='pandas')
# Sanitised dataset id used as the file-name prefix; identical for every
# split, hence computed once.
name = dataset.replace('/', '_').replace('\\', '_')
# Saving the different dataset splits
for key in ds.keys():
    # Full slice materialises the whole split as a DataFrame
    df = ds[key][:]
    # Each `post_tokens` entry is a sequence of string tokens; join them into
    # one space-separated string so the column holds plain text.
    df['post_tokens'] = df['post_tokens'].apply(' '.join)
    file_name = f'{name}_{key}.{extension}'
    file_path = os.path.join(output_dir, file_name)
    # Saving the data
    df.to_csv(file_path, sep='\t', index=False, encoding='utf-8')

Loading hatexplain...



Downloading builder script:   0%|          | 0.00/4.78k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.75k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.1k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.03M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/145k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/15383 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1922 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1924 [00:00<?, ? examples/s]