# Baseline Preprocessing

## Data & Package

In [30]:
import os
import pandas as pd
import numpy as np
import re
import sys
sys.path.append("..")  # Adds the parent directory to the path

In [31]:
# The data is expected under <parent of the current working directory>/datasets
project_root = os.path.split(os.getcwd())[0]
data_dir = os.path.join(project_root, 'datasets')
data_file = os.path.join(project_root, 'datasets', 'raw/merged_dataset.csv')

# Read the merged CSV and preview its first rows
df = pd.read_csv(data_file)
df.head()

Unnamed: 0,source,text,label,id
0,hate_speech,!!! RT @mayasolovely: As a woman you shouldn't...,2,4ecc4591238c4855bd54ea0d584f3054
1,hate_speech,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,c682b650f3b24e6b94b36b89acd68e57
2,hate_speech,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,9c92c46021824d89b96b0bba2b2b5a83
3,hate_speech,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,c4ab2ea47a3e4e3bbbf530d273cc244f
4,hate_speech,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,23e3092360e54bca85a5b0336ed8cf8e


## Balancing

Data rebalancing: downsample class 0 (non-harmful comments) to 30,000 randomly selected rows.

In [32]:
# Rebalance the data: keep 30000 random rows of class 0 and every row of the
# other classes. The fixed random_state makes the drawn subset identical on
# every run, so downstream results are reproducible.
df_balanced = pd.concat([
    df[df.label == 0].sample(n=30000, random_state=42),
    df[df.label != 0],
])
df_balanced.label.value_counts()

label
0    30000
1    24897
2    14681
Name: count, dtype: int64

## Text cleaning

In [33]:
# Work on a 10% random subset of the balanced data.
# DataFrame.sample already returns the selected rows in random order, so a
# separate full shuffle (sample(frac=1)) beforehand is redundant; the fixed
# random_state makes the subset reproducible across runs.
df = df_balanced.sample(frac=0.1, random_state=42)
df.shape

(6958, 4)

In [35]:
import re
import pandas as pd
from textblob import TextBlob
from nltk.tokenize import word_tokenize
from utils import slang_dict

# Comprehensive text cleaning function
def clean_text(text):
    """
    Clean a raw comment/tweet with a fixed sequence of regex substitutions.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN from a missing CSV cell)
        is treated as empty.

    Returns
    -------
    str
        Lower-cased text without retweet markers, URLs, jpg/jpeg tokens,
        punctuation or digits, with immediately repeated words collapsed to
        one occurrence and whitespace runs collapsed to single spaces.

    Notes
    -----
    Mentions are first replaced by ``<PERSON>``; the punctuation pass then
    strips the angle brackets and lower-casing turns the result into the
    bare word ``person``.
    """
    # Missing values reach .apply() as float NaN; re.sub would raise on them.
    if not isinstance(text, str):
        return ''

    # Step 1: Remove retweet markers (RT @user:) wherever they occur.
    # Tweets often start with punctuation ("!!! RT @user: ...") or chain several
    # retweets, so the pattern is not anchored and the colon is optional.
    text = re.sub(r'\bRT @\w+:?\s*', '', text)

    # Step 2: Remove URLs, image formats (jpg, jpeg), and line breaks
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'\b\w*jpeg\w*\b|\b\w*jpg\w*\b', '', text)
    text = re.sub(r'\n', ' ', text)

    # Step 3: Replace mentions with a placeholder and remove punctuation
    text = re.sub(r'@\w+', '<PERSON>', text)
    text = re.sub(r'[^\w\s]', '', text)

    # Step 4: Remove numbers and lower-case before de-duplicating so that
    # "The the" is recognised as a repetition.
    text = re.sub(r'\d+', '', text)
    text = text.lower()

    # Collapse a run of the same word into one occurrence ("so so good" ->
    # "so good") instead of deleting every copy of the word.
    text = re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text)

    # Squeeze the whitespace left behind by the removals above.
    text = re.sub(r'\s+', ' ', text).strip()

    return text
# Run every raw text through clean_text and store the result back in 'text'
df['text'] = df['text'].map(clean_text)

# Function to correct spelling and slang terms
def correct_and_tokenize(text):
    """
    Expand slang, spell-correct the whole sentence, then tokenize it.

    Each whitespace-separated word is looked up in ``slang_dict`` (words
    without an entry are kept unchanged), the resulting sentence is passed
    through TextBlob's ``correct()`` in one call, and the corrected string
    is split into tokens with NLTK's ``word_tokenize``.

    Returns the list of tokens produced by ``word_tokenize``.
    """
    # Slang expansion, re-joined into one sentence so TextBlob runs once per text
    expanded = ' '.join(slang_dict.get(w, w) for w in text.split())

    # Spelling correction over the full sentence
    spell_checked = TextBlob(expanded).correct()

    # preserve_line=True: tokenize the text as a single line
    return word_tokenize(str(spell_checked), preserve_line=True)

# Build the 'tokens' column: slang replacement, spelling correction and tokenization per text
df['tokens'] = df['text'].map(correct_and_tokenize)

KeyboardInterrupt: 

##### Basic Text cleaning

In [None]:
## Function to remove the RT pattern
#def remove_rt(text: str) -> str:
#    # Use regex to substitute the pattern with an empty string
#    return re.sub(r'^RT @\w+: ', '', text)
## Apply the function to the text column
#df['text'] = df.text.apply(remove_rt)
#
## Replace http patterns
#df['text'] = df['text'].str.replace(r'http\S+', ' ', regex=True)
#
## Replace words that contains jpeg with empty string
#df['text'] = df['text'].str.replace(r'\b\w*jpeg\w*\b', '', regex=True)
#df['text'] = df['text'].str.replace(r'\b\w*jpg\w*\b', '', regex=True)
#
## Lowercasing
#df['text'] = df['text'].str.lower()
#
## Replace mentions (@)
#df['text'] = df['text'].str.replace(r'@\w+', '<PERSON>', regex=True)
#
## Remove punctuation
#df['text'] = df['text'].str.replace(r'[^\w\s]', '', regex=True)
#
## Remove numbers
#df['text'] = df['text'].str.replace(r'\d+', '', regex=True)
#
## Remove linebreaks characters
#df['text'] = df['text'].str.replace(r'\n', ' ', regex=True)
#
## Strip leading and trailing whitespaces
#df['text'] = df['text'].str.strip()

##### Tokenization

In [27]:
# Collapse words that repeat immediately into a single occurrence
# (e.g. "so so good" -> "so good"). Replacing the match with r'\1' keeps one
# copy; an empty replacement would delete every copy of the repeated word.
df['text'] = df['text'].str.replace(r'\b(\w+)(?:\s+\1\b)+', r'\1', regex=True)

In [28]:
import nltk
from nltk.tokenize import word_tokenize

# Split each text into word tokens, treating the whole text as one line
df['tokens'] = df['text'].map(lambda sentence: word_tokenize(sentence, preserve_line=True))

##### Advanced text cleaning

In [None]:
#from utils import slang_dict    
#from textblob import TextBlob
#
#def correct_flatten(tokens):
#    # Replace slang and correct in a single pass
#    slangless_tokens = [slang_dict.get(token, token) for token in tokens]
#    
#    # Join tokens to make a sentence for batch processing
#    phrase = ' '.join(slangless_tokens)
#    
#    # Correct the entire phrase to reduce TextBlob calls
#    corrected_phrase = str(TextBlob(phrase).correct())
#    
#    # Split the corrected phrase back into tokens
#    return corrected_phrase.split()
#df['tokens'] = df['tokens'].apply(correct_flatten)

# Correct misspellings and slang words
#def correct_flatten(tokens):
#    slangless_token = [slang_dict.get(token, token) for token in tokens]
#    corrected_token = [TextBlob(token).correct() for token in slangless_token]
#    return [item for sublist in corrected_token for item in (sublist if isinstance(sublist, list) else [sublist])]
#df['tokens'] = df['tokens'].apply(correct_flatten)



KeyboardInterrupt: 

#### 3-gram analysis

#### Word frequency analysis (in sentences and in corpus)

##### After embedding, check for internal cosine similarity