# Preprocess Kaggle's post dataset for model training

In [122]:
# Import necessary libraries
import re
import subprocess
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import spacy
from langdetect import detect

## Load the English spaCy model, downloading it first if it is not installed
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found.
    # Download with the interpreter that is running this code so the model
    # lands in the same environment; check=True makes a failed download
    # raise here instead of surfacing as a second, less obvious load error.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

In [123]:
## Read the raw posts and keep only the columns used below; "id" is kept so
## the dropped columns can be merged back in later
posts = pd.read_csv("data/raw/total_posts.csv").loc[:, ["id", "title", "selftext"]]

In [124]:
# Placeholder bodies that stand in for deleted or removed posts.
_PLACEHOLDER_TEXTS = frozenset({"[deleted]", "[removed]"})

# Reddit user mention such as "u/some_user" or "/u/some-user". The lookbehind
# keeps ordinary words that merely contain "u/" (e.g. "you/me", "menu/list")
# from being rewritten, and the character class includes "-" so hyphenated
# usernames are replaced in full.
_USERNAME_PATTERN = re.compile(r"(?<!\w)/?u/[\w-]+")


def preprocess_text(text):
    """
    Clean and lemmatize a single piece of English text.

    Steps:
    1. Treat missing values (None, NaN, any non-string), empty or
       whitespace-only strings, and the '[deleted]' / '[removed]' placeholders
       as having no content.
    2. Lowercase the text and detect its language with langdetect.
    3. Replace the '&gt;' HTML entity with a space and replace Reddit user
       mentions ('u/name', '/u/name') with the literal word 'username'.
    4. Run spaCy with the NER and parser components disabled.
    5. Keep alphabetic tokens plus '?' and '!', and drop URL-like tokens.
       Stop words are NOT removed.

    Parameters:
    text (str): The input text to preprocess. Non-string values are accepted
        and treated as empty.

    Returns:
    tuple: A pair (cleaned, lemmatized):
        - cleaned (str): The kept tokens joined by single spaces.
        - lemmatized (str): The lemmas of the same tokens joined by spaces.
        Returns (pd.NA, pd.NA) when the detected language is not English.
        Returns ("", "") when there is no content or when language detection
        or spaCy processing fails.
    """
    # Missing cells arrive as None / NaN / pd.NA; handle them explicitly
    # rather than relying on an exception further down.
    if not isinstance(text, str):
        return "", ""
    if not text.strip() or text in _PLACEHOLDER_TEXTS:
        return "", ""

    try:
        # Lowercase before language detection and tokenization.
        text = text.lower()

        # NOTE(review): langdetect documents its detection as non-deterministic
        # unless DetectorFactory.seed is set; consider seeding it once at
        # start-up if reproducible output matters.
        if detect(text) != 'en':
            return pd.NA, pd.NA

        # Replace specific HTML junk (e.g., '&gt;')
        text = text.replace('&gt;', ' ')

        # Substitute Reddit usernames with a placeholder word
        text = _USERNAME_PATTERN.sub("username", text)

        # NER and the parser are not needed for tokens/lemmas; skip them to save time.
        doc = nlp(text, disable=["ner", "parser"])

        kept_tokens = [
            token
            for token in doc
            if (token.is_alpha or token.text in ('?', '!')) and not token.like_url
        ]

        cleaned = ' '.join(token.text for token in kept_tokens)
        lemmatized = ' '.join(token.lemma_ for token in kept_tokens)
        return cleaned, lemmatized

    except Exception:
        # Deliberate best-effort: one text that cannot be processed (for
        # example, langdetect raises when the text has no usable features,
        # such as digits or emoji only) must not abort a whole-dataset run.
        # Such texts are reported as empty.
        return "", ""

In [125]:
# Run preprocess_text over the 'title' and 'selftext' columns. It returns a
# (cleaned, lemmatized) pair per value; the pairs are expanded into a
# two-column frame in one step instead of wrapping every row in its own
# pd.Series, which is much slower on large frames and fails on an empty one.
for column in ("title", "selftext"):
    new_columns = [f"{column}_cleaned", f"{column}_lemmatized"]
    pairs = posts[column].map(preprocess_text).tolist()
    posts[new_columns] = pd.DataFrame(pairs, index=posts.index, columns=new_columns)

# Blank out missing bodies and the deleted/removed markers in the raw
# 'selftext' column so that concatenating it with the title below does not
# produce NaN for title-only posts.
missing_body = posts["selftext"].isna() | posts["selftext"].isin(['[deleted]', '[removed]'])
posts["selftext"] = posts["selftext"].mask(missing_body, "")

# Combine title and body for the raw text and for each processed variant:
# 'title_and_text', 'title_and_text_cleaned', 'title_and_text_lemmatized'.
posts["title_and_text"] = posts["title"] + " " + posts["selftext"]
for suffix in ("cleaned", "lemmatized"):
    posts[f"title_and_text_{suffix}"] = (
        posts[f"title_{suffix}"] + " " + posts[f"selftext_{suffix}"]
    )

# Drop every row that still contains a missing value, then renumber the index.
# This removes posts whose title or body was detected as non-English
# (preprocess_text marks those with pd.NA) as well as rows with a missing
# 'id' or 'title'.
posts = posts.dropna().reset_index(drop=True)

# Save the preprocessed DataFrame, creating the output directory if needed.
# NOTE(review): empty strings are written as empty CSV fields, which
# pd.read_csv parses back as NaN by default; pass keep_default_na=False (or
# fill NaN) when loading this file if that distinction matters.
output_path = Path("data/preprocessed/total_posts.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
posts.to_csv(output_path, index=False)