In [4]:
import pandas as pd
import os
import html
import re

In [5]:
def clean_post_only_data(file_path, subreddit_name, output_folder, max_rows=7000):
    # Manually define column names (no header in CSV)
    column_names = ["score", "date", "title", "author", "url", "post_text"]
    df = pd.read_csv(file_path, names=column_names, header=None)

    # Remove accidental duplicate header row if present
    if df.iloc[0].equals(pd.Series(column_names)):
        df = df.drop(index=0)

    # Clean author
    df["author"] = df["author"].astype(str).str.replace("u/", "", regex=False)

    # Decode HTML, remove weird characters
    def clean_text(text):
        text = html.unescape(str(text)).strip()
        text = text.encode("utf-8", "ignore").decode("utf-8", "ignore")
        return text

    df["title"] = df["title"].apply(clean_text)
    df["post_text"] = df["post_text"].apply(clean_text)

    # Drop null or bad rows
    df = df.dropna(subset=["date", "title", "author", "url", "post_text"])
    df = df[~df["post_text"].isin(["[removed]", "[deleted]"])]

    # Remove media-only posts
    df = df[~df["post_text"].str.lower().str.startswith(("http", "www"))]
    df = df[~df["post_text"].str.contains(r"\.jpg|\.png|youtube\.com|youtu\.be", case=False, na=False)]

    # Convert date
    df["date"] = pd.to_datetime(df["date"], errors="coerce", dayfirst=True)
    df = df.dropna(subset=["date"])

    # Add subreddit
    df["subreddit"] = subreddit_name

    # Sort and select top 7000
    df = df.sort_values("date", ascending=False).head(max_rows)

    # Export
    out_path = os.path.join(output_folder, f"cleaned_{subreddit_name}.csv")
    df.to_csv(out_path, index=False)
    print(f"âœ… Cleaned and saved: {out_path}")

In [6]:
raw_folder = "/Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Uncleaned Data"
output_folder = "/Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data"

for file in os.listdir(raw_folder):
    if file.endswith(".csv"):
        file_path = os.path.join(raw_folder, file)
        subreddit = file.replace("_submissions.csv", "").replace(".csv", "")
        clean_post_only_data(file_path, subreddit, output_folder)


âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_traumatoolbox.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_HealthAnxiety.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_AnxietyDepression.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_alcoholism.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_BPD.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_SanctionedSuicide.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cl

  df["date"] = pd.to_datetime(df["date"], errors="coerce", dayfirst=True)


âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_ADHD_Programmers.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_SuicideWatch.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_MentalHealthSupport.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_depression.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_offmychest.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data/cleaned_SuicideBereavement.csv
âœ… Cleaned and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cle

In [7]:
def filter_generic_posts(input_folder, output_folder_filtered, min_words=10):
    os.makedirs(output_folder_filtered, exist_ok=True)
    for file in os.listdir(input_folder):
        if file.endswith(".csv"):
            file_path = os.path.join(input_folder, file)
            df = pd.read_csv(file_path)

            # Drop missing or blank post_text
            df = df.dropna(subset=["post_text"])
            df["post_text"] = df["post_text"].astype(str).str.strip()
            df = df[df["post_text"] != ""]

            # Define patterns to filter
            generic_patterns = [
                r"\banyone\s+(want|up|here|around)\b",
                r"\bchat(ting)?\b",
                r"^hi\b", r"^hello\b", r"^hey\b",
                r"what'?s up", r"^yo\b",
                r"\bneed someone to talk\b",
                r"\bwho'?s (feeling|here)\b",
            ]
            combined_pattern = re.compile("|".join(generic_patterns), re.IGNORECASE)

            # Filter out generic posts
            df = df[~df["post_text"].str.contains(combined_pattern)]

            # Filter out very short posts
            df = df[df["post_text"].str.split().str.len() >= min_words]

            # Save final file
            out_path = os.path.join(output_folder_filtered, f"filtered_{file}")
            df.to_csv(out_path, index=False)
            print(f"ðŸ§¹ Filtered and saved: {out_path}")


In [8]:
input_cleaned = "/Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Cleaned Data"
output_filtered = "/Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data"

filter_generic_posts(input_cleaned, output_filtered)

  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_depression.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_SuicideBereavement.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_alcoholism.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_lonely.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_offmychest.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_MentalHealthSupport.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_mentalhealth.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_SuicideWatch.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_ADHDparenting.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_AnxietyDepression.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_ADHD_Programmers.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_ADHD.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_socialanxiety.csv
ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_SanctionedSuicide.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]
  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_traumatoolbox.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_depression_help.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_HealthAnxiety.csv
ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_dbtselfhelp.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]
  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_Postpartum_Depression.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_ptsd.csv


  df = df[~df["post_text"].str.contains(combined_pattern)]


ðŸ§¹ Filtered and saved: /Users/tusharbansal/Study (UoA)/Trimester 5/Research Project A/Reddit Data Gathering/Filtered Data/filtered_cleaned_BPD.csv
