In [42]:
import math
import os
import re
import string
from typing import Dict, List, Union

import emoji
import pandas as pd
from googletrans import Translator
from tqdm.notebook import tqdm

In [2]:
# Folders used by the pipeline: raw inputs, per-dataset cleaned files, final splits
raw_dir = '../data/raw/'
middle_dir = '../data/middle/'
final_dir = '../data/final/'

# Make sure both output folders exist before any file is written to them
for output_dir in (middle_dir, final_dir):
    os.makedirs(output_dir, exist_ok=True)

## classla_FRENK-hate-en

In [3]:
def clean_text(text: str) -> str:
    """Return ``text`` without CR/LF characters and without outer whitespace.

    Line breaks are deleted, not replaced by a space, so words separated
    only by a newline end up joined.
    """
    without_breaks = text.replace('\r', '').replace('\n', '')
    return without_breaks.strip()

def preprocess_func1(file_path: str) -> pd.DataFrame:
    """Load a tab-separated file and reduce it to cleaned ``label``/``text`` columns.

    Used in this notebook for the classla_FRENK-hate-en splits.

    Args:
        file_path: Path of a UTF-8 TSV file containing at least the
            columns ``label`` and ``text``.

    Returns:
        A DataFrame with the columns ``label`` and ``text`` (in that order).
        Rows with a missing value in any column and exact duplicate rows
        are removed; each text is passed through ``clean_text``.
    """
    frame = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # Discard incomplete rows (a NaN in any column removes the whole row)
    if frame.isna().any().any():
        frame = frame.dropna()

    # Discard rows that repeat an earlier row exactly
    if frame.duplicated().any():
        frame = frame.drop_duplicates()

    # Remove line breaks and outer whitespace from every text
    frame['text'] = frame['text'].map(clean_text)

    # Keep only the two output columns, label first
    return frame[['label', 'text']]

In [4]:
# The three classla_FRENK-hate-en splits to preprocess
dataset_files = [
    'classla_FRENK-hate-en_test.tsv',
    'classla_FRENK-hate-en_train.tsv',
    'classla_FRENK-hate-en_validation.tsv'
]

for file_name in dataset_files:
    # Input comes from the raw folder; output keeps the same file name in the middle folder
    file_path = os.path.join(raw_dir, file_name)
    preprocessed_file_path = os.path.join(middle_dir, file_name)

    preprocessed_data = preprocess_func1(file_path)
    preprocessed_data.to_csv(preprocessed_file_path, sep='\t', encoding='utf-8', index=False)

## hate_speech_offensive

In [5]:
def clean_tweet(tweet: str) -> str:
    """Strip URLs, the leading "intro", line breaks, quotes and @mentions from a tweet.

    Args:
        tweet: Raw tweet text.

    Returns:
        The cleaned tweet with outer whitespace removed. Inner whitespace is
        left untouched, so removing a URL or mention may leave a double space.
    """
    # Remove url links. This runs first: the intro pattern below cuts at the
    # first colon, which would otherwise be the one inside "http://".
    tweet = re.sub(r'(?:https?://|www\.)\S+', '', tweet)
    # Remove intro section of tweet (everything up to the first ':' on the first line)
    # NOTE(review): this also drops the start of any tweet that merely contains
    # a colon; presumably aimed at "RT @user:" prefixes — confirm.
    tweet = re.sub(r'^.*?:', '', tweet)
    # Remove escape characters
    tweet = tweet.replace('\r', '').replace('\n', '')
    # Remove quotes
    tweet = tweet.replace('"', '')
    # Remove @user_name
    tweet = re.sub(r'@\w+', '', tweet)
    return tweet.strip()

def preprocess_func2(file_path: str) -> pd.DataFrame:
    """Load the hate_speech_offensive TSV and return binary ``label``/``text`` columns.

    Args:
        file_path: Path of a UTF-8 TSV file containing at least the
            columns ``class`` and ``tweet``.

    Returns:
        A DataFrame with the columns ``label`` and ``text``. Rows with missing
        values and duplicate rows are removed, tweets are passed through
        ``clean_tweet``, and the original classes are remapped 0 -> 1 and
        2 -> 0 (class 1 is left as 1).
    """
    # Read, then drop incomplete and duplicated rows
    tweets = (
        pd.read_csv(file_path, sep='\t', encoding='utf-8')
        .dropna()
        .drop_duplicates()
    )

    # Cleaned tweet text goes into a new 'text' column
    tweets['text'] = tweets['tweet'].apply(clean_tweet)

    # Keep class + text and expose the class under the name 'label'
    labelled = tweets[['class', 'text']].rename(columns={'class': 'label'})

    # Replace values in the 'label' column: 0 -> 1, 2 -> 0
    labelled['label'] = labelled['label'].replace({0: 1, 2: 0})

    return labelled

In [6]:
# Preprocess the hate_speech_offensive file and save it under the same name in the middle folder
file_name = 'hate_speech_offensive_train.tsv'
file_path = os.path.join(raw_dir, file_name)
preprocessed_file_path = os.path.join(middle_dir, file_name)

preprocessed_data = preprocess_func2(file_path)
preprocessed_data.to_csv(preprocessed_file_path, sep='\t', encoding='utf-8', index=False)

## hate_speech18

In [7]:
def clean_text(text: str) -> str:
    """Delete carriage returns and line feeds from ``text``, then trim its ends."""
    # Mapping a code point to None makes str.translate delete that character
    line_breaks = {ord('\r'): None, ord('\n'): None}
    return text.translate(line_breaks).strip()

def preprocess_func3(file_path: str) -> pd.DataFrame:
    """Load a tab-separated file and return its cleaned ``label`` and ``text`` columns.

    Used in this notebook for the hate_speech18 training file.

    Args:
        file_path: Path of a UTF-8 TSV file containing at least the
            columns ``label`` and ``text``.

    Returns:
        A DataFrame with the columns ``label`` and ``text``; rows containing
        a missing value and exact duplicate rows are removed, and every text
        is passed through ``clean_text``.
    """
    data = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # Drop rows only when at least one cell is actually missing
    missing_cells = int(data.isna().sum().sum())
    if missing_cells:
        data = data.dropna()

    # Drop rows only when at least one full-row duplicate exists
    duplicate_rows = int(data.duplicated().sum())
    if duplicate_rows:
        data = data.drop_duplicates()

    # Replace the text column by its cleaned version
    cleaned = data.assign(text=data['text'].apply(clean_text))

    # Output columns in the order label, text
    return cleaned.loc[:, ['label', 'text']]

In [8]:
# Preprocess the hate_speech18 file and save it under the same name in the middle folder
file_name = 'hate_speech18_train.tsv'
file_path = os.path.join(raw_dir, file_name)
preprocessed_file_path = os.path.join(middle_dir, file_name)

preprocessed_data = preprocess_func3(file_path)
preprocessed_data.to_csv(preprocessed_file_path, sep='\t', encoding='utf-8', index=False)

### limjiayi_hateful_memes_expanded

In [9]:
def preprocess_func4(file_path: str) -> pd.DataFrame:
    """Load a tab-separated file and return its ``label`` and ``text`` columns.

    Used in this notebook for the limjiayi_hateful_memes_expanded splits.
    No text cleaning is applied here.

    Args:
        file_path: Path of a UTF-8 TSV file containing at least the
            columns ``text`` and ``label``.

    Returns:
        A DataFrame with the columns ``label`` and ``text`` (in that order),
        without rows that had a missing value in any column and without
        exact duplicate rows.
    """
    memes = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # Remove incomplete rows, if there are any
    if memes.isna().values.any():
        memes = memes.dropna()

    # Remove duplicated rows, if there are any
    if memes.duplicated().any():
        memes = memes.drop_duplicates()

    # Column order: label, text
    return memes.loc[:, ['label', 'text']]


In [10]:
# The three limjiayi_hateful_memes_expanded splits to preprocess
dataset_files = [
    'limjiayi_hateful_memes_expanded_train.tsv',
    'limjiayi_hateful_memes_expanded_test.tsv',
    'limjiayi_hateful_memes_expanded_validation.tsv'
]

for file_name in dataset_files:
    # Input comes from the raw folder; output keeps the same file name in the middle folder
    file_path = os.path.join(raw_dir, file_name)
    preprocessed_file_path = os.path.join(middle_dir, file_name)

    preprocessed_data = preprocess_func4(file_path)
    preprocessed_data.to_csv(preprocessed_file_path, sep='\t', encoding='utf-8', index=False)

### tweets_hate_speech_detection_train

In [11]:
def clean_tweet(text: str) -> str:
    """Delete CR/LF characters from ``text`` and trim surrounding whitespace.

    NOTE(review): this re-uses the name of the earlier, more aggressive
    ``clean_tweet``; once this cell has run, every later call resolves here.
    """
    return re.sub(r'[\r\n]', '', text).strip()

def preprocess_func5(file_path: str) -> pd.DataFrame:
    """Load a tab-separated tweet file and return cleaned ``label``/``text`` columns.

    Used in this notebook for the tweets_hate_speech_detection files.

    Args:
        file_path: Path of a UTF-8 TSV file containing at least the
            columns ``label`` and ``tweet``.

    Returns:
        A DataFrame with the columns ``label`` and ``text`` (the renamed
        ``tweet`` column). Rows with a missing value in any column and exact
        duplicate rows are removed; each tweet is passed through
        ``clean_tweet``.
    """
    # Load the dataset
    data = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # Remove missing values
    if data.isna().sum().any() > 0:
        data = data.dropna()

    # Remove duplicates
    if data.duplicated().sum() > 0:
        data = data.drop_duplicates()

    # Preprocess 'tweet' column
    data['tweet'] = data['tweet'].apply(clean_tweet)

    # Only 'tweet' needs a new name; 'label' is already correct
    data = data.rename(columns={'tweet': 'text'})

    # Column order: label, text
    return data[['label', 'text']]

In [12]:
# The two tweets_hate_speech_detection files to preprocess
dataset_files = [
    'tweets_hate_speech_detection_train.tsv',
    'tweets_hate_speech_detection_test.tsv'
]

for file_name in dataset_files:
    # Input comes from the raw folder; output keeps the same file name in the middle folder
    file_path = os.path.join(raw_dir, file_name)
    preprocessed_file_path = os.path.join(middle_dir, file_name)

    preprocessed_data = preprocess_func5(file_path)
    preprocessed_data.to_csv(preprocessed_file_path, sep='\t', encoding='utf-8', index=False)

### ucberkeley-dlab_measuring-hate-speech

In [13]:
def preprocess_func6(file_path: str, threshold: float = 0.0) -> pd.DataFrame:
    """Load the measuring-hate-speech TSV and derive a binary label from its score.

    Reminder on ``hate_speech_score`` (continuous hate speech measure, higher =
    more hateful): > 0.5 is approximately hate speech, < -1 is counter or
    supportive speech, and -1 to +0.5 is neutral or ambiguous.

    Args:
        file_path: Path of a UTF-8 TSV file containing at least the
            columns ``hate_speech_score`` and ``text``.
        threshold: Rows whose score is strictly greater than this value get
            label 1, all others (including missing scores) get label 0.
            NOTE(review): the default 0.0 keeps the notebook's existing
            behavior, but the scale above suggests 0.5 — confirm which cut-off
            is intended.

    Returns:
        A DataFrame with the columns ``label`` (int64) and ``text``. No rows
        are dropped: missing values and duplicates are kept as they are.
    """
    data = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # Build the output frame directly instead of assigning into a column slice
    is_hateful = data['hate_speech_score'] > threshold
    return pd.DataFrame({
        'label': is_hateful.astype('int64'),
        'text': data['text'],
    })

In [14]:
# Preprocess the measuring-hate-speech file and save it under the same name in the middle folder
file_name = 'ucberkeley-dlab_measuring-hate-speech_train.tsv'
file_path = os.path.join(raw_dir, file_name)
preprocessed_file_path = os.path.join(middle_dir, file_name)

preprocessed_data = preprocess_func6(file_path)
preprocessed_data.to_csv(preprocessed_file_path, sep='\t', encoding='utf-8', index=False)

### Paul_hatecheck-french

In [15]:
def preprocess_func7(file_path: str) -> pd.DataFrame:
    """Load the HateCheck TSV and return binary ``label``/``text`` columns.

    Args:
        file_path: Path of a UTF-8 TSV file containing at least the
            columns ``test_case`` and ``label_annotated_maj``.

    Returns:
        A DataFrame with the columns ``label`` and ``text``. ``text`` is the
        ``test_case`` column; ``label`` is 1 where ``label_annotated_maj``
        equals ``'hateful'`` and 0 for every other value (including missing
        ones). No rows are dropped.
    """
    data = pd.read_csv(file_path, sep='\t', encoding='utf-8')

    # 1 for 'hateful', 0 otherwise (the previous code returned 1 in both branches)
    is_hateful = data['label_annotated_maj'] == 'hateful'

    # Column order: label, text
    return pd.DataFrame({
        'label': is_hateful.astype('int64'),
        'text': data['test_case'],
    })

In [16]:
# Preprocess the HateCheck French file and save it under the same name in the middle folder
file_name = 'Paul_hatecheck-french_test.tsv'
file_path = os.path.join(raw_dir, file_name)
preprocessed_file_path = os.path.join(middle_dir, file_name)

preprocessed_data = preprocess_func7(file_path)
preprocessed_data.to_csv(preprocessed_file_path, sep='\t', encoding='utf-8', index=False)

### hatexplain

In [17]:
def clean_text(text: str) -> str:
    """Drop every carriage return / line feed from ``text`` and trim the result."""
    kept_chars = [ch for ch in text if ch not in '\r\n']
    return ''.join(kept_chars).strip()

def preprocess_func8(file_path: str) -> pd.DataFrame:
    """Load a hatexplain TSV split and return letter-only text with a constant label.

    Args:
        file_path: Path of a TSV file containing at least the column
            ``post_tokens`` (read as strings).

    Returns:
        A DataFrame with the columns ``label`` and ``text``. Rows with a
        missing value in any column and exact duplicate rows are removed
        first. ``text`` is ``post_tokens`` with the ``<user>`` /
        ``<utilisateur>`` placeholders removed, every character other than
        ASCII letters and whitespace deleted, and ``clean_text`` applied.
        ``label`` is 1 for every row.
    """
    data = pd.read_csv(file_path, sep='\t')

    # Remove missing values
    if data.isna().sum().any() > 0:
        data = data.dropna()

    # Remove duplicates
    if data.duplicated().sum() > 0:
        data = data.drop_duplicates()

    # Placeholders must go first: the letters-only filter below would strip
    # the angle brackets and leave the bare word "user" behind.
    text = (
        data['post_tokens']
        .apply(lambda x: re.sub(r'<user>', '', x))
        .apply(lambda x: re.sub(r'<utilisateur>', '', x))
        .apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))
        # Use the helper defined alongside this function rather than
        # clean_tweet, which lives in other cells and has two definitions.
        .apply(clean_text)
    )

    # NOTE(review): every row is labelled 1 regardless of the dataset's own
    # annotation — confirm that all hatexplain posts should count as hateful.
    # Column order: label, text
    return pd.DataFrame({'label': 1, 'text': text})

In [18]:
# The three hatexplain splits to preprocess
dataset_files = [
    'hatexplain_train.tsv',
    'hatexplain_test.tsv',
    'hatexplain_validation.tsv'
]

for file_name in dataset_files:
    # Input comes from the raw folder; output keeps the same file name in the middle folder
    file_path = os.path.join(raw_dir, file_name)
    preprocessed_file_path = os.path.join(middle_dir, file_name)

    preprocessed_data = preprocess_func8(file_path)
    preprocessed_data.to_csv(preprocessed_file_path, sep='\t', encoding='utf-8', index=False)

## Concatenation and Translation

In [19]:
# Concatenate the dataframes from middle
root_data = '../data/middle'
# os.listdir returns entries in arbitrary order, so sort them: the seeded
# shuffle further down is only reproducible if the rows always arrive in the
# same order. Non-TSV entries (hidden files, checkpoint folders) are skipped
# because read_csv cannot load them.
files = sorted(
    os.path.join(root_data, f)
    for f in os.listdir(root_data)
    if f.endswith('.tsv')
)
dfs = [pd.read_csv(file, sep='\t', encoding='utf-8') for file in files]
concat_df = pd.concat(dfs, ignore_index=True, sort=False)

In [21]:
# Translator function
def google_translate(
    text: Union[str, List[str]],
    src_lang: str = 'en',
    dest_lang: str = 'fr',
) -> Union[str, List[str]]:
    """Translate a string, or a list of strings, with googletrans.

    Args:
        text: One string or a list of strings to translate.
        src_lang: Language code passed to the translator as ``src``.
        dest_lang: Language code passed to the translator as ``dest``.

    Returns:
        The translated string when ``text`` is a string, or a list with the
        ``.text`` of each translation when ``text`` is a list.
    """
    # A new Translator is created on every call
    translation = Translator().translate(text=text, src=src_lang, dest=dest_lang)
    if isinstance(text, list):
        return [t.text for t in translation]
    return translation.text

In [None]:
# Translate every text in place, one row at a time. Rows whose translation
# raises are reported and their index remembered in err_id for later removal.
# NOTE(review): every row is sent as English (src_lang="en"), whichever file
# in the middle folder it came from — that folder also holds the
# Paul_hatecheck-french output; confirm this is intended.
err_id = []
n_rows = concat_df.shape[0]
for index, row in tqdm(concat_df.iterrows(), total=n_rows):
    try:
        translated = google_translate(
            text=str(row['text']),
            src_lang="en",
            dest_lang="fr"
        )
        concat_df.at[index, 'text'] = translated
    except Exception as e:
        print('Row', index, ':', e)
        err_id.append(index)

In [None]:
# Drop the rows whose translation failed and report the size before and after
n_before_drop = len(concat_df)
concat_df = concat_df.drop(index=err_id)
print(f'Original length: {n_before_drop}')
print(f'New length: {len(concat_df)}')

### Final Cleaning & Splitting

In [23]:
# Define list of acceptable characters
# Adapted from https://github.com/mindee/doctr/blob/main/doctr/datasets/vocabs.py
VOCABS: Dict[str, str] = {
    "digits": string.digits,
    "ascii_letters": string.ascii_letters,
    "punctuation": string.punctuation,
    "whitespaces": ' ',
    # Join the emoji keys themselves. str(list(...)) would store the list's
    # repr, in which non-printable code points (e.g. the zero-width joiner
    # used inside multi-part emoji) appear only as backslash escapes, so
    # those characters would not count as part of the vocabulary.
    "emoji": "".join(emoji.EMOJI_DATA),
    "currency": "£€¥¢฿",
    "ancient_greek": "αβγδεζηθικλμνξοπρστυφχψωΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩ",
    "arabic_letters": "ءآأؤإئابةتثجحخدذرزسشصضطظعغـفقكلمنهوىي",
    "persian_letters": "پچڢڤگ",
    "hindi_digits": "٠١٢٣٤٥٦٧٨٩",
    "arabic_diacritics": "",
    "arabic_punctuation": "؟؛«»—",
}
# Composite vocabularies built from the entries above
VOCABS["latin"] = VOCABS["digits"] + VOCABS["ascii_letters"] + VOCABS["punctuation"] + VOCABS["whitespaces"]
# NOTE(review): the accented-letter list has no 'Ê', 'ü', 'œ' or typographic
# apostrophe, so those are removed from French text by the cleaning step —
# confirm this is acceptable.
VOCABS["legacy_french"] = VOCABS["latin"] + "°" + "àâéèêëîïôùûçÀÂÉÈËÎÏÔÙÛÇ" + VOCABS["currency"] + VOCABS['emoji']

In [58]:
# Cleaning function
def text_cleaning(text: str, vocab: str) -> str:
    """Keep only the characters of ``text`` that belong to ``vocab``.

    Args:
        text: String to clean.
        vocab: Collection of acceptable characters (a string is treated as
            a set of single characters).

    Returns:
        ``text`` without characters missing from ``vocab`` and without any
        double quote (removed even when it is part of ``vocab``), with
        leading and trailing whitespace stripped.
    """
    # Set lookup is constant time, so the text is processed in a single pass
    # instead of calling str.replace once per character.
    allowed = set(vocab)
    kept = [char for char in text if char in allowed and char != '"']
    return ''.join(kept).strip()

In [None]:
# Clean the dataframe: binarise the labels and filter the text, row by row
vocab = VOCABS["legacy_french"]
for index, row in tqdm(concat_df.iterrows(), total=len(concat_df)):
    # Any non-zero label collapses into the single positive class 1
    concat_df.at[index, 'label'] = 0 if int(row['label']) == 0 else 1
    try:
        concat_df.at[index, 'text'] = text_cleaning(str(row['text']), vocab)
    except Exception:
        # Best effort: a text that cannot be cleaned becomes empty so a later
        # step can drop it. Catching Exception rather than everything lets a
        # kernel interrupt stop the loop instead of blanking the current row.
        concat_df.at[index, 'text'] = ''

In [60]:
# Remove rows whose text ended up empty, then any row that still holds a null
print(f'Original length: {len(concat_df)}')
empty_indices = concat_df.index[concat_df['text'] == '']
concat_df.drop(index=empty_indices, inplace=True)
has_nulls = concat_df.isnull().values.any()
print(f'Is there null values: {has_nulls}')
clean_df = concat_df.dropna(axis=0, how='any')
print(f'Final length: {len(clean_df)}')

Original length: 267842
Is there null values: False
Final length: 267841


In [61]:
train_split = 0.9  # fraction of the rows that goes to the training set
SEED_VAL = 999     # fixed seed so the shuffle gives the same order on every run
# Shuffle data
clean_df_shuffle = clean_df.sample(frac=1, random_state=SEED_VAL).reset_index(drop=True)
# Row position where the test part starts. Not called `id`: a global with that
# name would hide the builtin id() for the rest of the session.
split_idx = math.floor(len(clean_df_shuffle) * train_split)
train_df = clean_df_shuffle.iloc[:split_idx]
test_df = clean_df_shuffle.iloc[split_idx:]
print(f'Size of train set: {len(train_df)}')
print(f'Size of test set: {len(test_df)}')

Size of train set: 241056
Size of test set: 26785


In [62]:
# Write both splits as tab-separated UTF-8 files without the index column
final_outputs = (
    (train_df, '../data/final/train_data.tsv'),
    # NOTE(review): 'text_data.tsv' looks like a typo for 'test_data.tsv';
    # kept unchanged because other code may read this exact file name.
    (test_df, '../data/final/text_data.tsv'),
)
for split_df, out_path in final_outputs:
    split_df.to_csv(out_path, sep='\t', encoding='utf-8', index=False)