# Baseline Preprocessing

## Data & Package

In [1]:
import os
import pandas as pd
import numpy as np
import re
import sys
sys.path.append("..")  # Adds the parent directory to the path

project_root = os.path.dirname(os.getcwd())
data_dir = os.path.join(project_root, 'datasets')
data_file = os.path.join(data_dir, 'raw/dataset_full.csv')

df = pd.read_csv(data_file)
df.head()

Unnamed: 0,source,text,label,id
0,hate_speech,!!! RT @mayasolovely: As a woman you shouldn't...,2,4ecc4591238c4855bd54ea0d584f3054
1,hate_speech,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...,1,c682b650f3b24e6b94b36b89acd68e57
2,hate_speech,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...,1,9c92c46021824d89b96b0bba2b2b5a83
3,hate_speech,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...,1,c4ab2ea47a3e4e3bbbf530d273cc244f
4,hate_speech,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...,1,23e3092360e54bca85a5b0336ed8cf8e


In [37]:
import nltk
from nltk.corpus import stopwords, wordnet, words
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from utils.cleaning_items import slang_dict, REGEX_REMOVE, REGEX_REPLACE

# NLTK resources
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('words', quiet=True)
nltk.download('punkt_tab', quiet=True)
stop_words = stopwords.words('english')
combined_corpus = set(words.words()) | set(wordnet.words())
combined_corpus = {word.lower() for word in combined_corpus}

## Text cleaning

In [4]:
from symspellpy import SymSpell, Verbosity

# Load precompiled dictionary file or corpus
sym_spell = SymSpell(max_dictionary_edit_distance=2)
sym_spell.load_dictionary('frequency_dictionary_en_82_765.txt', 0, 1)

2024-11-16 18:14:57,594: E symspellpy.symspellpy] Dictionary file not found at frequency_dictionary_en_82_765.txt.


False

In [18]:
from symspellpy import SymSpell, Verbosity
import pkg_resources

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt"
)
# term_index is the column of the term and count_index is the
# column of the term frequency
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

True

In [34]:
# lookup suggestions for single-word input strings
input_term = "word"  # misspelling of "apostrophe"

# max edit distance per lookup
# (max_edit_distance_lookup <= max_dictionary_edit_distance)
suggestions = sym_spell.lookup(
    input_term, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)
    
    
suggestions[0].term

'word'

In [38]:
def clean_text(text: str) -> str:
    """
    Cleans the input text by applying several preprocessing steps.
    Args:
        text (str): The input text to be cleaned.
    Returns:
        str: The cleaned text.
    """
    # Apply REGEX_REMOVE and REGEX_REPLACE
    for pattern in REGEX_REMOVE:
        text = re.sub(pattern, "", text)
    for pattern, repl in REGEX_REPLACE.items():
        text = re.sub(pattern, repl, text)
            
    # Apply additionnal text cleaning steps
    text = re.sub(r'^RT @\w+: ', '', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'\b\w*jpeg\w*\b|\b\w*jpg\w*\b', '', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'@\w+', '<PERSON>', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'\b(\w+)\b\s+\1\b', '', text)
    text = text.strip().lower()
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)
    text = re.sub(r'[\x80-\xFF]', '', text)
    return text

In [None]:
from textblob import TextBlob

def correct_text_blob(text: str, stop_words, slang_dict: dict) -> str:
    """
    Corrects the given text by replacing slang words, removing stop words, and performing spelling correction.
    Args:
        text (str): The input text to be corrected.
        stop_words (set): A set of stop words to be removed from the text.
        slang_dict (dict): A dictionary where keys are slang words and values are their corresponding replacements.
    Returns:
        str: The corrected text after slang replacement, stop word removal, and spelling correction.
    """
    tokens = text.split()
    tokens = [slang_dict.get(word, word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens)
    corrected_text = str(TextBlob(text).correct())
    return corrected_text

In [35]:
def correct_text_noblob(text: str, stop_words, slang_dict: dict) -> str:
    """
    Corrects the given text by replacing slang words, removing stop words, and performing spelling correction.
    Args:
        text (str): The input text to be corrected.
        stop_words (set): A set of stop words to be removed from the text.
        slang_dict (dict): A dictionary where keys are slang words and values are their corresponding replacements.
    Returns:
        str: The corrected text after slang replacement, stop word removal, and spelling correction.
    """
    tokens = text.split()
    tokens = [slang_dict.get(word, word) for word in tokens]
    tokens = [word for word in tokens if word not in stop_words]
    tokens = [sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True)[0].term for word in tokens]
    text = ' '.join(tokens)
    corrected_text = str(TextBlob(text).correct())
    return corrected_text

In [53]:
test_df = df.sample(100)
test_df['text_cleaned'] = test_df['text'].apply(lambda x: clean_text(x))

In [54]:
import time
# test with Textblob
start_time = time.time()
test_df['text_corrected_textblob'] = test_df['text_cleaned'].apply(lambda x: correct_text_blob(x, stop_words, slang_dict))
end_time = time.time()
time_textblob = end_time - start_time

# test with symspell
start_time = time.time()
test_df['text_corrected_symspell'] = test_df['text_cleaned'].apply(lambda x: correct_text_noblob(x, stop_words, slang_dict))
end_time = time.time()
time_symspell = end_time - start_time

print(f"Time taken for TextBlob: {time_textblob:.2f} seconds")
print(f"Time taken for symspell: {time_symspell:.2f} seconds")

Time taken for TextBlob: 38.21 seconds
Time taken for symspell: 34.61 seconds


In [52]:
print(test_df[['text', 'text_cleaned', 'text_corrected_textblob', 'text_corrected_symspell']].iloc[0,3])
print(test_df[['text', 'text_cleaned', 'text_corrected_textblob', 'text_corrected_symspell']].iloc[0,1])

hello understand in considers bind nail build three official christian family organizations cos organization within in move cos section need
hello i understand that the inc considers the binhi kadiwa and buklod to be the three official christian family organizations but the cws is an organization within the inc we can move the cws to its own section if need be


## Lemmatizing tests

In [43]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')

def lemma_text(tokens: list) -> list:
    lemmatizer = WordNetLemmatizer()
    ls = [lemmatizer.lemmatize(token, 'v') for token in tokens]
    ls = [lemmatizer.lemmatize(token, 'n') for token in ls]
    ls = [lemmatizer.lemmatize(token, 'a') for token in ls]
    return ls

test_sentence = "I am running in the nicest park of the city texts"

# Tokenize the sentence
tokens = word_tokenize(test_sentence, preserve_line=True)
lemma_tokens = lemma_text(tokens)

In [44]:
lemma_tokens

['I', 'be', 'run', 'in', 'the', 'nice', 'park', 'of', 'the', 'city', 'text']