# Dataset Normalizer

In [46]:
import re, os, pandas as pd, pickle

### File Handling

In [47]:
# Resolve the raw dataset's location to an absolute path (a relative path is
# interpreted against the current working directory).
df_path = os.path.abspath("../dataset/mlthsc.csv")

# Read the CSV into a DataFrame, using its 'ID' column as the row index.
hate_speech = pd.read_csv(filepath_or_buffer=df_path, index_col='ID')

### Preprocessing

To simplify the text and remove inconsistencies, the dataset is cleaned according to the following criteria before it is fed to the model:
1. 	Conversion of all texts to lowercase
2. 	Removal of unimportant data (link, emoji, username, punctuation, hashtag, digit)
3. 	Removal of unnecessary white spaces in the text
4. 	Shortening elongated words to their standard form (e.g. “nooooooo” to “no”)
5. 	Correcting misspelled words

In [48]:
# Lowercase
# Use pandas' vectorised string method instead of a per-character regex
# callback: it lowercases every cased character (not only ASCII A-Z), avoids
# one Python callback per uppercase letter, and propagates missing values
# instead of raising a TypeError on them.
hate_speech['Text'] = hate_speech['Text'].str.lower()

In [49]:
# Removal of unimportant data (link, emoji, username, hashtag, punctuation, digit).
# The patterns are applied one after another, and the order matters:
#   * links and usernames are stripped while their punctuation ('://', '@')
#     is still present; after the punctuation pass they could no longer be
#     recognised;
#   * the hashtag marker is dropped before the punctuation pass, which would
#     otherwise already have removed every '#'. Only the marker is removed;
#     the tag's word is kept.
noise_patterns = (
    r'http[s]?://\S+',   # link
    r'[^\x00-\x7F]+',    # emoji -- NOTE(review): drops every non-ASCII character, accented letters included
    r'@\w+',             # username
    r'#',                # hashtag
    r'[^\w\s]|_',        # punctuation ('_' is a word character for \w, so it is matched explicitly)
    r'\d+',              # digit (named in the cleaning criteria; \w keeps digits, so they need their own pass)
)

for noise_pattern in noise_patterns:
    # Bind the current pattern as a default argument so the callback never
    # depends on the loop variable's later value.
    hate_speech['Text'] = hate_speech['Text'].apply(
        lambda text, pattern=noise_pattern: re.sub(pattern, '', text)
    )

In [50]:
# Removal of unnecessary white spaces
def _collapse_whitespace(text):
    """Squeeze every run of whitespace into one space and trim both ends."""
    squeezed = re.sub(r'\s+', ' ', text)
    return squeezed.strip()


hate_speech['Text'] = hate_speech['Text'].apply(_collapse_whitespace)

In [51]:
# Shortening the text into their standard format
def _shorten_elongated(text):
    """Reduce any word character repeated three or more times in a row to a single copy."""
    return re.sub(r'(\w)(\1{2,})', r'\1', text)


hate_speech['Text'] = hate_speech['Text'].apply(_shorten_elongated)

### Spelling Corrector

In [52]:
# Spelling correction, step 1: restore the pickled spelling-corrector object
# that the next cell applies to every hate speech text.
# NOTE(review): unpickling can execute arbitrary code -- only load a model
# file that comes from a trusted source.
with open('../model/spellchecker_model.pkl', mode='rb') as pickled_model:
    loaded_model = pickle.load(pickled_model)

In [53]:
# Use the loaded model to correct the spelling
def _correct_word(word):
    """Return the model's correction for `word`, or `word` itself if there is none.

    NOTE(review): presumably `loaded_model` is a pyspellchecker `SpellChecker`,
    whose `correction()` returns None when it has no candidate -- confirm.
    Joining a None into the sentence would raise a TypeError, so the original
    word is kept in that case.
    """
    suggestion = loaded_model.correction(word)
    return word if suggestion is None else suggestion


# Correct each whitespace-separated token and re-join the tokens with single spaces.
hate_speech['Text'] = hate_speech['Text'].apply(
    lambda text: ' '.join(_correct_word(word) for word in text.split())
)


### Export 

In [54]:
# Export to CSV file
# Write the cleaned frame (index included) next to the raw dataset.
hate_speech.to_csv(path_or_buf='../dataset/cleaned_mlthsc.csv')