In [122]:
import re
import subprocess
import sys

import numpy as np
import pandas as pd
import spacy
from langdetect import detect

## Load the English spaCy model, downloading it first if it is not installed.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package cannot be found; any
    # other failure (or a keyboard interrupt) should surface, not trigger a download.
    # Run the download with the kernel's own interpreter so the model is installed
    # into the environment this notebook is running in, and fail loudly
    # (check=True) if the download itself does not succeed.
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", "en_core_web_sm"],
        check=True,
    )
    nlp = spacy.load("en_core_web_sm")

In [123]:
## Read the raw posts and keep only the identifier and the two text columns;
## anything dropped here can be joined back later through `id`.
raw_posts_path = "data/raw/total_posts.csv"
text_columns = ["id", "title", "selftext"]
posts = pd.read_csv(raw_posts_path)[text_columns]

In [124]:
def preprocess_text(text):
    """
    Clean and lemmatize a single piece of English text.

    Steps:
    1. Missing values (None, NaN, pd.NA), any other non-string input, empty
       strings and the placeholders '[deleted]' / '[removed]' short-circuit
       to ("", "").
    2. The text is lower-cased and its language is detected; anything not
       detected as English ('en') yields (pd.NA, pd.NA).
    3. The HTML entity '&gt;' is replaced by a space and user mentions of the
       form 'u/<name>' are replaced by the placeholder word 'username'.
    4. The text is run through spaCy with the parser and NER disabled. Only
       alphabetic tokens and the punctuation marks '?' and '!' are kept;
       URL-like tokens are dropped. Stop words are NOT removed.

    Parameters:
    text (str): The input text to preprocess. Non-string values are accepted
        and treated as empty text.

    Returns:
    tuple: A pair (cleaned, lemmatized):
        - cleaned (str): the kept tokens joined by single spaces.
        - lemmatized (str): the lemmas of the same tokens joined by single spaces.
        (pd.NA, pd.NA) is returned when the detected language is not English.
        ("", "") is returned for empty/placeholder/non-string input and when
        language detection or spaCy processing raises.
    """
    # Handle "no text" explicitly instead of relying on an AttributeError/TypeError
    # being swallowed further down.
    if not isinstance(text, str) or text in ('[deleted]', '[removed]', ''):
        return "", ""

    try:
        # Lower-case before language detection
        text = text.lower()

        # Keep only English text, signal everything else with NA
        if detect(text) != 'en':
            return pd.NA, pd.NA

        # Remove HTML junk
        text = text.replace('&gt;', ' ')

        # Substitute usernames. The leading \b keeps the pattern from firing in
        # the middle of a word (e.g. "you/me" must not become "yousername").
        text = re.sub(r"\bu/\w+", "username", text)

        # Parser and named entity recognition are not needed here; disabling
        # them saves time.
        doc = nlp(text, disable=["ner", "parser"])

        kept_tokens = [
            token
            for token in doc
            if (token.is_alpha or token.text in ('?', '!')) and not token.like_url
        ]

        cleaned = ' '.join(token.text for token in kept_tokens)
        lemmatized = ' '.join(token.lemma_ for token in kept_tokens)

        return cleaned, lemmatized

    except Exception:
        # Deliberate best-effort: text the language detector cannot handle
        # (or that fails in spaCy) is treated as empty rather than aborting
        # the whole run. Empty strings, unlike NA, are not affected by dropna().
        return "", ""

In [125]:
# Clean and lemmatize the title and the body of every post.
# preprocess_text returns a (cleaned, lemmatized) pair per value; the pairs are
# expanded into two columns in one step instead of wrapping every row in its own
# pd.Series, which is very slow on a frame of this size.
for source_col in ("title", "selftext"):
    target_cols = [f"{source_col}_cleaned", f"{source_col}_lemmatized"]
    processed_pairs = posts[source_col].map(preprocess_text).tolist()
    posts[target_cols] = pd.DataFrame(processed_pairs, index=posts.index, columns=target_cols)

# Treat missing and '[deleted]' / '[removed]' bodies as empty text so that the
# raw concatenation below does not turn title-only posts into NaN.
selftext_is_empty = posts["selftext"].isna() | posts["selftext"].isin(['[deleted]', '[removed]'])
posts["selftext"] = posts["selftext"].mask(selftext_is_empty, "")

# Combined "title + body" column for the raw, cleaned and lemmatized variants.
for suffix in ("", "_cleaned", "_lemmatized"):
    posts[f"title_and_text{suffix}"] = posts[f"title{suffix}"] + " " + posts[f"selftext{suffix}"]

# Any remaining NA marks a row to discard: preprocess_text returns pd.NA for
# non-English text, and a missing title makes the raw concatenation NaN.
posts = posts.dropna().reset_index(drop=True)

In [126]:
## Persist the preprocessed posts; the positional index is left out of the file.
preprocessed_posts_path = "data/preprocessed/total_posts.csv"
posts.to_csv(preprocessed_posts_path, index=False)

In [127]:
# Preview the preprocessed frame; the rendering below is truncated to the first and last rows.
posts

Unnamed: 0,id,title,selftext,title_cleaned,title_lemmatized,selftext_cleaned,selftext_lemmatized,title_and_text,title_and_text_cleaned,title_and_text_lemmatized
0,x2smmw,“Cities without water” is our future. The lead...,,cities without water is our future the leaders...,city without water be our future the leader be...,,,“Cities without water” is our future. The lead...,cities without water is our future the leaders...,city without water be our future the leader be...
1,x2slxy,Sharks and climate change,Ok so I was having a conversation with my neig...,sharks and climate change,shark and climate change,ok so i was having a conversation with my neig...,ok so I be have a conversation with my neighbo...,Sharks and climate change Ok so I was having a...,sharks and climate change ok so i was having a...,shark and climate change ok so I be have a con...
2,x2slk5,How to start a climate change essay,,how to start a climate change essay,how to start a climate change essay,,,How to start a climate change essay,how to start a climate change essay,how to start a climate change essay
3,x2sbz6,Ever notice that when it’s hot a few days in J...,,ever notice that when it hot a few days in jul...,ever notice that when it hot a few day in july...,,,Ever notice that when it’s hot a few days in J...,ever notice that when it hot a few days in jul...,ever notice that when it hot a few day in july...
4,x2sa2a,How would you react if the next American presi...,,how would you react if the next american presi...,how would you react if the next american presi...,,,How would you react if the next American presi...,how would you react if the next american presi...,how would you react if the next american presi...
...,...,...,...,...,...,...,...,...,...,...
604738,aki85,Climate Change: Five Easy New Year’s Resolutions,,climate change five easy new year resolutions,climate change five easy new year resolution,,,Climate Change: Five Easy New Year’s Resolutions,climate change five easy new year resolutions,climate change five easy new year resolution
604739,akgtp,2009 climate change story of the year - Climat...,,climate change story of the year climategate,climate change story of the year climategate,,,2009 climate change story of the year - Climat...,climate change story of the year climategate,climate change story of the year climategate
604740,akgkb,Statistics: Scientific Consensus on Climate Ch...,,statistics scientific consensus on climate cha...,statistic scientific consensus on climate chan...,,,Statistics: Scientific Consensus on Climate Ch...,statistics scientific consensus on climate cha...,statistic scientific consensus on climate chan...
604741,akgbz,Ultimate irony: Snowstorm squelches “screaming...,,ultimate irony snowstorm squelches screaming c...,ultimate irony snowstorm squelch scream climat...,,,Ultimate irony: Snowstorm squelches “screaming...,ultimate irony snowstorm squelches screaming c...,ultimate irony snowstorm squelch scream climat...
