# Baseline Preprocessing

## Data & Packages

In [2]:
import os
import pandas as pd
import numpy as np

In [7]:
# Build the dataset path relative to the parent of the current working
# directory (presumably the notebook is launched from a sub-folder of the
# project root — confirm before running from elsewhere).
project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, 'datasets')
data_file = os.path.join(data_dir, 'raw/merged_dataset.csv')

# Load the merged CSV into a DataFrame and preview the first rows.
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,source,text,label,id
0,hate_speech,!!! RT @mayasolovely: As a woman you shouldn't...,2,4ecc4591238c4855bd54ea0d584f3054
1,hate_speech,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,c682b650f3b24e6b94b36b89acd68e57
2,hate_speech,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,9c92c46021824d89b96b0bba2b2b5a83
3,hate_speech,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,c4ab2ea47a3e4e3bbbf530d273cc244f
4,hate_speech,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,23e3092360e54bca85a5b0336ed8cf8e


## Balancing

Data rebalancing: downsample class 0 (non-harmful comments).

In [9]:
# Rebalance the data by downsampling class 0 to at most N_CLASS0_SAMPLES rows;
# rows of every other class are kept unchanged.
N_CLASS0_SAMPLES = 30000
# Fixed seed so the drawn subset (and everything derived from it) is
# reproducible across "Restart Kernel -> Run All".
BALANCE_SEED = 42

class0 = df[df.label == 0]
df_balanced = pd.concat([
    # Cap n at the available row count: sample() raises ValueError when asked
    # for more rows than exist (without replacement).
    class0.sample(n=min(N_CLASS0_SAMPLES, len(class0)), random_state=BALANCE_SEED),
    df[df.label != 0],
])

# Show the resulting class distribution.
df_balanced.label.value_counts()

label
0    30000
1    24897
2    14681
Name: count, dtype: int64

## Text cleaning

In [10]:
# Work on an independent copy: a plain `df = df_balanced` only aliases the
# frame, so later column assignments on `df` would silently modify
# `df_balanced` as well.
df = df_balanced.copy()
df.head()

Unnamed: 0,source,text,label,id
124619,toxic_comment,"""\n\n Please do not vandalize pages, as you di...",0,1beffefdb07b4574aa0b78b0bd848efb
153778,toxic_comment,Donaire's nationality should be Filipino. It s...,0,3bfc437551384d6c926c16355504de95
156639,toxic_comment,"(Oops, forgot to sign Dave)",0,c1af40a4c56c43b9bbe2b6f9ce704d45
70376,toxic_comment,"""\n\n Pontic Greeks \n\nI removed the parapgra...",0,43bd6dbb06274abf842a408e843c9db7
183394,toxic_comment,"Fyslee, I wold advise that you drop the accusa...",0,f1cc20d0f1934398b9ff072c110263fb


In [16]:
# Fixed seed so the shuffled row order is reproducible between runs.
SHUFFLE_SEED = 42

# Build the cleaned text as one chained expression; the source column is not
# modified in place, so re-running this cell always starts from the same input.
text_clean = (
    df['text']
    # Lowercasing
    .str.lower()
    # Remove punctuation (everything that is neither a word character nor whitespace)
    .str.replace(r'[^\w\s]', '', regex=True)
    # Remove numbers
    .str.replace(r'\d+', '', regex=True)
    # Replace linebreaks -- and any other whitespace run (\r, tabs, the double
    # spaces left behind by the removals above) -- with a single space
    .str.replace(r'\s+', ' ', regex=True)
    # Strip leading and trailing whitespaces
    .str.strip()
)

# assign() returns a new frame, so a frame that `df` merely aliases
# (e.g. df_balanced) is left untouched.
df = df.assign(text=text_clean)

# Shuffle the rows and drop rows containing missing values.
df = df.sample(frac=1, random_state=SHUFFLE_SEED).dropna()

# Texts made only of punctuation/digits/whitespace are now empty strings,
# which dropna() does not catch; remove them explicitly.
df = df[df['text'] != '']

##### Tokenization

In [39]:
from nltk.tokenize import word_tokenize

# Split every cleaned text into a list of tokens and store the lists in a new
# 'tokens' column. Series.apply forwards preserve_line=True to word_tokenize
# for each element.
df['tokens'] = df['text'].apply(word_tokenize, preserve_line=True)