# Import Data & Packages

In [2]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
project_root = os.path.dirname(os.getcwd())
data_directory = os.path.join(project_root, 'datasets')
df1 = pd.read_csv(os.path.join(data_directory, 'originals/hate_speech_dataset.csv'))
df2 = pd.read_csv(os.path.join(data_directory, 'originals/toxic_comment.csv'))

# Merging datasets

## Class distribution

##### Hate Speech DataSet

The goal is to create a one-one mapping between the two datasets.

In [4]:
# Drop Unneccasry colums
df1.drop(columns=['Unnamed: 0', 'hate_speech', 'count', 'offensive_language', 'neither'], inplace=True)

# Rename columns
df1.columns = ['label', 'text']

# Add a column to indicate the source of the data
df1['source'] = 'hate_speech'

##### Toxic comment dataset

On this dataset, we must perform reclassification of the data to map to the first dataset.

In [6]:
# Create new class labels for the toxic comment dataset
# toxic and severe toxic comments are considered as toxic
# obsene, threat, insult and identity hate are considered as hate_speech
# clean comments are considered as neither


# Function to create new labels
def label_mapping(df : pd.DataFrame):
    if df['obscene'] == 1 or df['threat'] == 1 or df['insult'] == 1 or df['identity_hate'] == 1:
        return 2
    elif df['toxic'] == 1 or df['severe_toxic'] == 1:
        return 1
    else:
        return 0

# Apply the functiont to the dataset
df2['label'] = df2.apply(label_mapping, axis=1)
# Drop unnecessary columns
df2.drop(columns=['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], inplace=True)

# Add a column to indicate the source of the data
df2['source'] = 'toxic_comment'

# Rename the text column
df2.rename(columns={'comment_text': 'text'}, inplace=True)

#### Concatenate the two datasets

In [7]:
import uuid

# Concatenate the two datasets
df = pd.concat([df1[['source', 'text', 'label']], df2[['source', 'text', 'label']]], ignore_index=True)

df['id'] = [uuid.uuid4().hex for _ in range(len(df))]
df.head()

Unnamed: 0,source,text,label,id
0,hate_speech,!!! RT @mayasolovely: As a woman you shouldn't...,2,834ab1a0113a4bca911c957d4888237e
1,hate_speech,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,bd7bf611bc894ccead0806e8aa37b4e7
2,hate_speech,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,ef6d509a073444d98873d1dd1947e730
3,hate_speech,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,5b40709b68d140bc97935db8705b7154
4,hate_speech,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,8bc1b42117c14f00bafc3c2a974031d7


#### Rebalance data for training

For training, we will select :
- All the 'toxic' comments (label = 1) #25000 rows
- All the 'hate_speech' comments (label = 2) #14500 rows
- 16.5% * 25000 randomly choose 'neither' comments from hate speech dataset
- 83.5% * 25000 randomly choose 'neither' comments from toxic comment dataset

In [None]:
# Instigate class distribution
df['label'].value_counts()

label
0    144776
1     24897
2     14681
Name: count, dtype: int64