# Baseline Preprocessing

## Data & Package

In [25]:
import os
import pandas as pd
import numpy as np
import re
import sys
sys.path.append("..")  # Adds the parent directory to the path

# Resolve paths from the parent of the current working directory, so the
# notebook has to be started from a folder directly below the project root.
project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, 'datasets')
data_file = os.path.join(data_dir, 'raw/merged_dataset.csv')

# Load the merged dataset and preview its first rows.
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,source,text,label,id
0,hate_speech,!!! RT @mayasolovely: As a woman you shouldn't...,2,4ecc4591238c4855bd54ea0d584f3054
1,hate_speech,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,c682b650f3b24e6b94b36b89acd68e57
2,hate_speech,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,9c92c46021824d89b96b0bba2b2b5a83
3,hate_speech,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,c4ab2ea47a3e4e3bbbf530d273cc244f
4,hate_speech,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,23e3092360e54bca85a5b0336ed8cf8e


## Balancing

Data rebalancing: downsample class 0 (non-harmful comments).

In [26]:
# Rebalance the data: keep 30000 random rows of class 0 and every row of the
# other classes. The fixed random_state makes the drawn rows identical on
# every run, so downstream results are reproducible.
df_balanced = pd.concat([
    df[df.label == 0].sample(n=30000, random_state=42),
    df[df.label != 0],
])
df_balanced.label.value_counts()

label
0    30000
1    24897
2    14681
Name: count, dtype: int64

## Text cleaning

# Shuffle the balanced frame, keep 10% of it, then keep 2% of that subset.
# A single seeded generator drives all three draws, so the resulting subset
# is the same every time this cell is run.
subset_rng = np.random.default_rng(42)
df = df_balanced.sample(frac=1, random_state=subset_rng).sample(frac=0.1, random_state=subset_rng)
df = df.sample(frac=0.02, random_state=subset_rng)
df.shape

In [4]:
#import re
#import pandas as pd
#from textblob import TextBlob
#from nltk.tokenize import word_tokenize
#from setup.utils_setup import slang_dict
#from nltk.stem import WordNetLemmatizer
#import nltk
#from nltk.corpus import stopwords
#from tqdm import tqdm
#
#tqdm.pandas()
#
## Download stopwords if you haven't already
#nltk.download('stopwords')
#
## Define the list of stopwords
#stop_words = list(set(stopwords.words('english')))
#
## Comprehensive text cleaning function
#def clean_text(text):
#    """
#    Clean text by performing a series of regex substitutions.
#    """
#    # Step 1: Remove retweet patterns (RT @user:)
#    text = re.sub(r'^RT @\w+: ', '', text)
#
#    # Step 2: Remove URLs, image formats (jpg, jpeg), and line breaks
#    text = re.sub(r'http\S+', ' ', text)
#    text = re.sub(r'\b\w*jpeg\w*\b|\b\w*jpg\w*\b', '', text)
#    text = re.sub(r'\n', ' ', text)
#
#    # Step 3: Replace mentions with a placeholder and remove punctuation
#    text = re.sub(r'@\w+', '<PERSON>', text)
#    text = re.sub(r'[^\w\s]', '', text)
#
#    # Step 4: Remove numbers, strip whitespaces, and remove repeating words
#    text = re.sub(r'\d+', '', text)
#    text = re.sub(r'\b(\w+)\b\s+\1\b', '', text)
#    text = text.strip().lower()
#    
#    # Step 5: Remove special characters and emojis
#    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
#    text = re.sub(r'[\x80-\xFF]', '', text)
#
#    return text
## Apply text cleaning function in one pass
#df['text'] = df['text'].apply(clean_text)
#
## Function to correct spelling and slang terms
#def correct_text(text, stop_words: set, slang_dict: dict = slang_dict):
#    """
#    Replace slang terms and correct spelling in the entire sentence.
#    """
#    # Step 0: Tokenize the text
#    tokens = text.split()
#    
#    # Step 1: Replace slang terms
#    tokens = [slang_dict.get(word, word) for word in tokens]
#
#    # Step 2: Remove stopwords from tokens
#    tokens = [word for word in tokens if word not in stop_words]
#    
#    # Replace text with the corrected tokens
#    text = ' '.join(tokens)
#    
#    # Step 3: Correct spelling using TextBlob on the entire sentence
#    corrected_text = str(TextBlob(text).correct())
#    
#    return corrected_text
#
## Apply lemmatization
#def lemma_text(tokens):
#    """
#    Lemmatize tokens using WordNet.
#    """
#    lemmatizer = WordNetLemmatizer()
#    return [lemmatizer.lemmatize(token) for token in tokens]
#
#
## Apply text cleaning function in one pass
#df['text'] = df['text'].apply(clean_text)
## Apply correction
#df['text'] = df['text'].progress_apply(lambda x: correct_text(x, stop_words=stop_words))
## Tokenize the text
#df['tokens'] = df['text'].apply(word_tokenize, preserve_line=True)
## Remove rows with empty token lists
#df = df[df['tokens'].map(len) > 0]
## Apply lemmatization
#df['tokens'] = df['tokens'].apply(lemma_text)


#### Batch Creation

In [4]:
# Preview the first rows of `df`.
df.head()

Unnamed: 0,source,text,label,id
615,hate_speech,"""You talk like a fag and your shit's retarded....",1,17e75cb79605490e9c2b732260626754
17648,hate_speech,RT @TheDiLLon1: Y'all not gonna trash Spirit a...,2,ba6732e519e24cf48c15bc4a261727a7
25373,toxic_comment,"""\n\n Another crap article \nParrots what prom...",1,14d43018458d460ea6b19daf64956b87
58898,toxic_comment,You have already been notified about it but di...,0,f3d1e9a8c4894bf180bc01e9a174c3e1
53559,toxic_comment,""":""""Racist"""" is a slur, and implying Jewish ow...",1,efe0901b93de438ab97e15b7ed5f3ea1


In [8]:
from src.preprocessing import Tokenizer

# Instantiate the project tokenizer from src.preprocessing.
# NOTE(review): batch_size presumably sets how many rows are cleaned per
# batch — confirm against the Tokenizer implementation.
tokenizer = Tokenizer(batch_size=10)

# Clean the text with tokenizer
# `df` is rebound to the result, so re-running this cell passes the already
# cleaned frame back into clean().
df = tokenizer.clean(df)
df.shape

Index(['source', 'text', 'label', 'id'], dtype='object')


100%|██████████| 10/10 [00:03<00:00,  2.70it/s]
100%|██████████| 10/10 [00:03<00:00,  3.19it/s]
100%|██████████| 10/10 [00:02<00:00,  3.85it/s]
100%|██████████| 10/10 [00:04<00:00,  2.31it/s]
100%|██████████| 10/10 [00:02<00:00,  3.62it/s]
100%|██████████| 10/10 [00:07<00:00,  1.39it/s]
100%|██████████| 10/10 [00:04<00:00,  2.23it/s]


(70, 4)

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import words, stopwords, wordnet, brown, gutenberg


# Make sure the NLTK resources used below are available locally.
nltk.download('wordnet', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('words', quiet=True)

# Materialise each word list as a set so membership tests are fast.
words_corpus = set(words.words())
wordnet_corpus = set(wordnet.words())
# Optional extra corpora, currently disabled:
#brown_corpus = set(brown.words())
#gutenberg_corpus = set(gutenberg.words())

# Union of the two active corpora, lowercased for case-insensitive matching.
combined_corpus = {
    entry.lower()
    for entry in words_corpus | wordnet_corpus  #| brown_corpus | gutenberg_corpus
}

print(f"Total unique words in combined corpus: {len(combined_corpus)}")
print(f"Total unique words in WordNet: {len(wordnet_corpus)}")
print(f"Total unique words in Word: {len(words_corpus)}")

# Shared lemmatizer and the demo sentence used by the example below.
lemmatizer = WordNetLemmatizer()

sentence = "caring about details dogs running bottles bottling hehzhz aha"
# Example function to mark unknown words
def replace_unknown_tokens(tokens, unknown_token="<UNK>", vocabulary=None):
    """Replace every token that is missing from the vocabulary.

    Args:
        tokens: Iterable of string tokens.
        unknown_token: Placeholder substituted for out-of-vocabulary tokens.
        vocabulary: Container of lowercase known words. When omitted, the
            notebook-level ``combined_corpus`` is used.

    Returns:
        A new list with one entry per input token: the token itself (original
        casing kept) when its lowercase form is in the vocabulary, otherwise
        ``unknown_token``.
    """
    # Compare against None so an explicitly passed empty vocabulary is honoured.
    if vocabulary is None:
        vocabulary = combined_corpus
    return [token if token.lower() in vocabulary else unknown_token for token in tokens]
# Tokenize the demo sentence, then lemmatize each token with the verb POS tag ('v').
tokens = word_tokenize(sentence, preserve_line=True)
lemmatized_tokens = [lemmatizer.lemmatize(token, 'v') for token in tokens]
# Mask out-of-vocabulary tokens through the helper rather than repeating its
# comprehension inline, so the demo exercises the function itself.
final_tokens = replace_unknown_tokens(lemmatized_tokens)

print(f"Original sentence: {sentence}")
print(f"Original tokens: {tokens}")
print(f"final tokens: {final_tokens}")

Total unique words in combined corpus: 331327
Total unique words in WordNet: 147306
Total unique words in Word: 323592
Original sentence: caring about details dogs running bottles bottling hehzhz aha
Original tokens: ['caring', 'about', 'details', 'dogs', 'running', 'bottles', 'bottling', 'hehzhz', 'aha']
final tokens: ['care', 'about', 'detail', 'dog', 'run', 'bottle', 'bottle', '<UNK>', 'aha']


#### Word frequency analysis (in sentences and in corpus)