# Unified QAnon vs Non-QAnon Processing Pipeline

**Purpose:** Load two datasets (QAnon and Non-QAnon), clean both identically, detect & remove bot-like authors, aggregate posts by author, filter the QAnon authors by flag-word hits, label and balance the two groups, and save a training-ready CSV. TF-IDF + PCA + KMeans are imported for exploratory clustering, but that analysis is not run in the cells shown here.

**Usage:** Edit the file paths in the `CONFIG` cell and run the notebook. The notebook is modular so you can run sections independently.

In [None]:
# CONFIG - edit paths and parameters here.
# Every name defined in this cell is read as a global by the cells below.
QANON_CSV = "/content/drive/MyDrive/Final/data_raw/Hashed_Q_Submissions_Raw_Combined.csv"         # path to the QAnon submissions CSV (change if needed)
NONQ_CSV = "/content/drive/MyDrive/Final/data_raw/Non_QAnon_Authors_raw.csv"     # path to the Non-QAnon submissions CSV (change if needed)
OUTPUT_DIR = "/content/drive/MyDrive/Final/data_inbetween/"        # directory that receives every intermediate and final CSV

# Processing parameters
MIN_FLAG_WORD_HITS = 4      # minimum number of flag-word occurrences in an author's aggregated text to keep that author (QAnon only)
BOT_POSTS_THRESHOLD = 600   # authors with MORE posts than this in the loaded data are treated as bot-like (total count, not per month)
BOT_MIN_SECONDS = 5         # an author whose two closest posts are at most this many seconds apart is treated as bot-like
RANDOM_STATE = 42           # seed passed to the sampling / shuffling in the balancing step

# Text columns expected in input CSVs - change to match your files
AUTHOR_COL = "author"
TITLE_COL = "title"
TEXT_COL = "text"
DATE_COL = "created_utc"    # optional; the rapid-posting bot check is skipped when this column is absent or entirely empty

# Flag words (example list) - extend/modify as needed.
# NOTE: "cabal" and "storm" are listed more than once; the repeats are redundant.
# NOTE(review): very short entries such as "q" and "plan" also occur inside
# ordinary words - confirm how the consumer of this list matches them.
QANON_FLAG_WORDS = [
    "qanon", "wwg1wga", "deep state", "storm", "cabal", "great awakening",
    "q", "save the children", "patriots", "trump train", "pedogate", "plan", "cabal", "trust the plan",
    "breadcrumbs", "covid1984", "disclosure", "forced penetration",
    "microchipped", "pizzagate", "red pill", "sheeple", "sovereignty", "storm", "maga", "cabal", "indictments", "adrenochrome", "satanic"
]

# Create output directory if not exists
# NOTE(review): OUTPUT_DIR lives under /content/drive, which is mounted in a
# later cell - confirm Drive is already mounted before this line runs.
import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Config set. QAnon CSV: {QANON_CSV}\nNon-Q CSV: {NONQ_CSV}\nOutputs: {OUTPUT_DIR}")

Config set. QAnon CSV: /content/drive/MyDrive/Final/data_raw/Hashed_Q_Submissions_Raw_Combined.csv
Non-Q CSV: /content/drive/MyDrive/Final/data_raw/Non_QAnon_Authors_raw.csv
Outputs: /content/drive/MyDrive/Final/data_inbetween/


In [None]:
# Mount Google Drive at /content/drive so the paths configured for the input
# CSVs and the output directory resolve (google.colab exists only on Colab).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Imports and helper functions
import re, os, gc
import pandas as pd
import numpy as np
from typing import List, Tuple, Optional, Dict

# sklearn for later
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score
from sklearn.model_selection import train_test_split

# simple text cleaning function
def clean_text(text: Optional[str]) -> str:
    """Normalise one piece of post text for downstream matching.

    Missing values and Reddit's "[removed]" placeholder become the empty
    string. Anything else is stringified, lower-cased, stripped of URLs and
    punctuation, and has its whitespace collapsed to single spaces.
    """
    if pd.isna(text):
        return ""

    lowered = str(text).lower()
    # A post that was removed from Reddit carries no usable content.
    if lowered == "[removed]":
        return ""

    # Applied in order: URLs, then every non-word / non-space character
    # (markdown, punctuation), then runs of whitespace. Each match is
    # replaced by a single space.
    for pattern in (r"http\S+|www\.\S+", r"[^\w\s]", r"\s+"):
        lowered = re.sub(pattern, " ", lowered)
    return lowered.strip()

def clean_dataframe(df: pd.DataFrame, text_cols: List[str]) -> pd.DataFrame:
    """Return a copy of `df` with cleaned text columns plus `combined_text_raw`.

    Args:
        df: Raw post-level frame; it is not modified.
        text_cols: Columns to normalise with `clean_text`. A listed column
            that is missing from `df` is created and filled with "".

    Returns:
        The cleaned copy. `combined_text_raw` holds the cleaned title
        (`TITLE_COL`) and text (`TEXT_COL`) joined by a single space.
    """
    df = df.copy()
    for c in text_cols:
        if c in df.columns:
            # Hand raw values to clean_text, which maps missing values to "".
            # Casting with astype(str) first would turn NaN into the literal
            # string "nan" and leak that token into every author's text.
            df[c] = df[c].map(clean_text)
        else:
            df[c] = ""

    def _text_series(col: str) -> pd.Series:
        # A column that does not exist contributes empty strings.
        if col not in df.columns:
            return pd.Series("", index=df.index, dtype=object)
        # Columns handled by the loop above are already clean; anything else
        # is cleaned here so NaN never reaches the concatenation.
        return df[col] if col in text_cols else df[col].map(clean_text)

    # create a unified text column used later; the final clean_text pass
    # trims the joining space when either side is empty
    combined = _text_series(TITLE_COL) + ' ' + _text_series(TEXT_COL)
    df['combined_text_raw'] = combined.map(clean_text)
    return df

def detect_bot_authors(df: pd.DataFrame,
                       author_col: str = None,
                       date_col: Optional[str] = None,
                       posts_threshold: int = 600,
                       min_seconds_between_posts: int = 5) -> pd.Series:
    """Flag bot-like authors with two simple heuristics.

    An author is flagged when either
      * they have strictly more than `posts_threshold` rows in `df`, or
      * their two closest posts are at most `min_seconds_between_posts`
        seconds apart (only checked when `date_col` exists and is not
        entirely missing).

    Args:
        df: Post-level frame with one row per post.
        author_col: Author column name; defaults to 'author'.
        date_col: Timestamp column name, parsed as epoch seconds; defaults
            to 'created_utc'.
        posts_threshold: Volume cut-off (exclusive).
        min_seconds_between_posts: Rapid-posting cut-off (inclusive).

    Returns:
        Boolean Series indexed by every unique value of `author_col`,
        True where the author is considered bot-like.
    """
    import warnings

    if author_col is None:
        author_col = 'author'
    if date_col is None:
        date_col = 'created_utc'

    # Heuristic 1: sheer posting volume.
    posts_per_author = df.groupby(author_col).size()
    flagged = set(posts_per_author[posts_per_author > posts_threshold].index.tolist())

    # Heuristic 2: smallest gap between two consecutive posts of one author.
    if date_col in df.columns and not df[date_col].isna().all():
        try:
            # Fresh positional index so the group-wise results below align
            # even when the caller's index has duplicate labels.
            stamps = df[[author_col, date_col]].reset_index(drop=True)
            stamps[date_col] = pd.to_datetime(stamps[date_col], unit='s', errors='coerce')
            # Sorting by time makes each author's rows chronological, so the
            # group-wise diff yields gaps between consecutive posts.
            stamps = stamps.dropna(subset=[date_col]).sort_values(date_col)
            gaps = stamps.groupby(author_col)[date_col].diff().dt.total_seconds()
            # Authors with fewer than two timestamps get NaN here and are
            # therefore never flagged by the comparison below.
            min_gap = gaps.groupby(stamps[author_col]).min()
            flagged.update(min_gap[min_gap <= min_seconds_between_posts].index.tolist())
        except Exception as exc:
            # Best effort: keep the volume-based result, but say that the
            # rapid-posting check did not run instead of failing silently.
            warnings.warn(
                f"Rapid-posting bot check skipped; could not process column "
                f"'{date_col}': {exc!r}",
                RuntimeWarning,
                stacklevel=2,
            )

    return pd.Series({a: (a in flagged) for a in df[author_col].unique()}, dtype=bool)


def aggregate_by_author(df: pd.DataFrame, author_col: str = None, text_col: str = 'combined_text_raw') -> pd.DataFrame:
    """Collapse post-level rows into one row per author.

    The result has the author column, a `text` column with the author's
    non-missing `text_col` values joined by single spaces, and `num_posts`
    with the number of rows the author had in `df`.
    """
    key = 'author' if author_col is None else author_col

    def _join_posts(posts):
        # Missing entries are skipped; everything else is stringified.
        return ' '.join(posts.dropna().astype(str))

    per_author = df.groupby(key)
    result = per_author[text_col].apply(_join_posts).reset_index()
    result = result.rename(columns={text_col: 'text'})
    # Both the joined text and the sizes come from the same grouping, so the
    # counts line up with the rows positionally.
    result['num_posts'] = per_author.size().values
    return result


def apply_flag_filter(df_authors: pd.DataFrame, flag_words: List[str], min_hits:int=4) -> pd.DataFrame:
    """Keep only authors whose aggregated text contains at least `min_hits` flag word occurrences.

    Flag words are matched case-insensitively as whole words or phrases, so
    a short entry such as "q" or "plan" is not counted inside longer words
    ("question", "planet").

    Args:
        df_authors: Author-level frame with a `text` column.
        flag_words: Words or phrases to count; blanks and duplicates are ignored.
        min_hits: Minimum number of occurrences required to keep an author.

    Returns:
        A copy of the qualifying rows with an added `flag_hits` column,
        sorted by `flag_hits` in descending order.
    """
    # Normalise once; a set drops repeated entries.
    words = {w.lower().strip() for w in flag_words if w and w.strip()}
    df = df_authors.copy()

    if not words:
        # An empty alternation would match the empty string at every
        # position; with nothing to look for there are simply no hits.
        df['flag_hits'] = 0
    else:
        # Longest alternatives first so a phrase wins over a shorter entry
        # that starts at the same position.
        alternatives = sorted(words, key=lambda w: (-len(w), w))
        # Lookarounds instead of \b so entries that begin or end with a
        # non-word character still match.
        pattern = re.compile(
            r"(?<!\w)(?:" + r"|".join(re.escape(w) for w in alternatives) + r")(?!\w)"
        )

        def count_flags(s):
            # Missing text (e.g. an empty cell read back from CSV) has no hits.
            if pd.isna(s):
                return 0
            return len(pattern.findall(str(s).lower()))

        df['flag_hits'] = df['text'].map(count_flags)

    return df[df['flag_hits'] >= min_hits].sort_values('flag_hits', ascending=False)

# balancing helper
def balance_and_shuffle(df_q: pd.DataFrame, df_n: pd.DataFrame, random_state:int=42) -> pd.DataFrame:
    """Downsample both groups to the size of the smaller one and shuffle.

    Each frame is sampled to the same number of rows, the samples are
    stacked (QAnon first), and the stacked frame is shuffled. The same
    `random_state` seeds every sampling call. The result has a fresh
    0..n-1 index.
    """
    per_group = min(len(df_q), len(df_n))
    samples = [
        frame.sample(n=per_group, random_state=random_state)
        for frame in (df_q, df_n)
    ]
    stacked = pd.concat(samples, ignore_index=True)
    shuffled = stacked.sample(frac=1, random_state=random_state)
    return shuffled.reset_index(drop=True)

# Confirm that the helper definitions in this cell ran to completion.
print('Helpers loaded.')

Helpers loaded.


In [None]:
# Load CSVs and apply initial cleaning
import os

def load_and_clean(path: str, text_cols: List[str]=None) -> pd.DataFrame:
    """Read the CSV at `path` and run `clean_dataframe` on it.

    `text_cols` defaults to the configured title and text columns. A
    missing file is reported on stdout and yields an empty DataFrame rather
    than an exception, so later cells can still run.
    """
    columns = [TITLE_COL, TEXT_COL] if text_cols is None else text_cols

    if os.path.exists(path):
        raw = pd.read_csv(path)
        print(f"Loaded {path} with shape {raw.shape}")
        cleaned = clean_dataframe(raw, columns)
        print(f"After cleaning: {cleaned.shape}")
        return cleaned

    print(f"File not found: {path}")
    return pd.DataFrame()

# Try to load both - if files do not exist, the returned DataFrames will be empty and the notebook will still be runnable.
# These two frames are the post-level inputs for every later cell.
df_qanon_raw = load_and_clean(QANON_CSV)
df_nonq_raw = load_and_clean(NONQ_CSV)

# Quick preview
# The first two rows are transposed so each column prints on its own line.
# Note: the loop variables `name` and `df` remain defined as notebook globals afterwards.
for name, df in [('QAnon', df_qanon_raw), ('Non-Qanon', df_nonq_raw)]:
    if df.empty:
        print(f"{name} dataframe is empty. Please place your CSV at the configured path.")
    else:
        print(f"{name} sample:\n", df.head(2).T)

# Save cleaned intermediate files
# Only non-empty frames are written; existing files with these names are overwritten.
if not df_qanon_raw.empty:
    df_qanon_raw.to_csv(os.path.join(OUTPUT_DIR, 'cleaned_qanon_raw.csv'), index=False)
if not df_nonq_raw.empty:
    df_nonq_raw.to_csv(os.path.join(OUTPUT_DIR, 'cleaned_nonq_raw.csv'), index=False)
print('Initial cleaned CSVs saved to output dir (if loaded).')

  df = pd.read_csv(path)


Loaded /content/drive/MyDrive/Final/data_raw/Hashed_Q_Submissions_Raw_Combined.csv with shape (2775263, 13)
After cleaning: (2775263, 14)
Loaded /content/drive/MyDrive/Final/data_raw/Non_QAnon_Authors_raw.csv with shape (134154, 14)
After cleaning: (134154, 14)
QAnon sample:
                                                                    0  \
subreddit                                             greatawakening   
id                                                            8xuv4i   
score                                                              1   
numReplies                                                        14   
author                      879f283b831c13474e219e88663d95b0763cca9b   
title              i ve been writing trump lives here on my 20 s ...   
text                                                             nan   
is_self                                                        False   
domain                                                     i.redd.it   
url

In [None]:
# Bot detection and author aggregation

def process_df_remove_bots_and_aggregate(df_raw: pd.DataFrame, label:int) -> pd.DataFrame:
    """Drop bot-like authors from `df_raw`, then build one labelled row per author.

    An empty input yields an empty DataFrame. Otherwise the configured bot
    thresholds decide which authors are removed, the remaining posts are
    aggregated per author, and every resulting row gets `label`.
    """
    if df_raw.empty:
        return pd.DataFrame()

    # Author -> bool flags from the configured heuristics.
    flags = detect_bot_authors(
        df_raw,
        author_col=AUTHOR_COL,
        date_col=DATE_COL,
        posts_threshold=BOT_POSTS_THRESHOLD,
        min_seconds_between_posts=BOT_MIN_SECONDS,
    )
    bot_authors = [author for author, is_bot in flags.items() if is_bot]
    print(f"Detected {len(bot_authors)} bot-like authors (will be removed).")

    # Keep only the posts written by authors that were not flagged.
    is_human = ~df_raw[AUTHOR_COL].isin(bot_authors)
    df_filtered = df_raw[is_human].copy()
    print(f"After removing bots: {df_filtered.shape}")

    per_author = aggregate_by_author(df_filtered, author_col=AUTHOR_COL, text_col='combined_text_raw')
    per_author['label'] = label
    return per_author

# Build the author-level frames: label 1 marks QAnon authors, label 0 marks Non-QAnon authors.
processed_qanon = process_df_remove_bots_and_aggregate(df_qanon_raw, label=1)
processed_nonq = process_df_remove_bots_and_aggregate(df_nonq_raw, label=0)

# Report how many authors remain in each group after bot removal.
print('Processed authors:')
print('QAnon authors:', len(processed_qanon))
print('Non-QAnon authors:', len(processed_nonq))

# Save
# Only non-empty frames are written to OUTPUT_DIR.
if not processed_qanon.empty:
    processed_qanon.to_csv(os.path.join(OUTPUT_DIR, 'processed_qanon_authors.csv'), index=False)
if not processed_nonq.empty:
    processed_nonq.to_csv(os.path.join(OUTPUT_DIR, 'processed_nonqanon_authors.csv'), index=False)
print('Processed author-level files saved (if data present).')

Detected 398 bot-like authors (will be removed).
After removing bots: (1214452, 14)
Detected 0 bot-like authors (will be removed).
After removing bots: (134154, 14)
Processed authors:
QAnon authors: 12786
Non-QAnon authors: 2714
Processed author-level files saved (if data present).


In [None]:
# Flag-word filtering (applied only to QAnon processed authors)
if processed_qanon.empty:
    print('No QAnon data to filter (processed_qanon is empty).')
    # Assign explicitly so downstream cells never pick up a stale
    # `filtered_qanon` left over from an earlier run of this notebook.
    filtered_qanon = processed_qanon
else:
    filtered_qanon = apply_flag_filter(processed_qanon, QANON_FLAG_WORDS, min_hits=MIN_FLAG_WORD_HITS)
    print(f"After flag-word filtering: {len(filtered_qanon)} authors kept (min_hits={MIN_FLAG_WORD_HITS}).")
    filtered_qanon.to_csv(os.path.join(OUTPUT_DIR, 'filtered_qanon_authors.csv'), index=False)

# Keep processed_nonq unchanged for now (no flag-word filter for the Non-QAnon group)
filtered_nonq = processed_nonq
print('Flag-word filtering step done.')

After flag-word filtering: 7173 authors kept (min_hits=4).
Flag-word filtering step done.


In [None]:
# Balance and shuffle to create final labeled dataset
if filtered_qanon.empty or filtered_nonq.empty:
    print('One of the groups is empty; skipping balancing step.')
    final_df = pd.DataFrame()
else:
    final_df = balance_and_shuffle(filtered_qanon, filtered_nonq, random_state=RANDOM_STATE)
    # keep only author, text, label and metadata
    # The author column is named after AUTHOR_COL by the aggregation step, so
    # use the configured name rather than a hard-coded 'author'.
    cols = [AUTHOR_COL, 'text', 'label', 'num_posts']
    # flag_hits exists only for the QAnon rows; Non-QAnon rows hold NaN there.
    if 'flag_hits' in final_df.columns:
        cols.append('flag_hits')
    final_df = final_df[cols]
    print(f"Final balanced dataset shape: {final_df.shape}")
    final_csv_path = os.path.join(OUTPUT_DIR, 'training_ready_balanced.csv')
    final_df.to_csv(final_csv_path, index=False)
    print(f"Saved balanced training CSV to: {final_csv_path}")

# If final_df is empty, give instructions
if final_df.empty:
    print('\nNo final dataset created. Make sure your input CSV files are present and the cleaning parameters fit your data.')
else:
    # `display` is provided by the notebook (IPython) environment.
    display(final_df.head())

Final balanced dataset shape: (5428, 5)
Saved balanced training CSV to: /content/drive/MyDrive/Final/data_inbetween/training_ready_balanced.csv


Unnamed: 0,author,text,label,num_posts,flag_hits
0,29be7cf1262c85ee8da744037511617ac8945701,this crack in the ice nan giveaway discount co...,1,271,25.0
1,MrsNoPants420,now that a va senator has died will they hold ...,0,87,
2,6748febe773c1758b207c72a75a1284a7c171298,just got my airpods today i must say this is o...,1,99,11.0
3,ea919a9ee76fedf7b3e77eac7ef930ad8af483af,what happens when you re an islamist who was b...,1,122,55.0
4,e6f9da0578c4c806da8f26772d147d1663bdc8cd,new to redditt here karma s a bitch i was worr...,1,91,20.0


In [None]:
# Count the posts behind the balanced dataset. The total depends on which
# authors were drawn when balancing, so it changes with RANDOM_STATE.
if final_df.empty:
    # The empty fallback frame has no 'num_posts' column to sum.
    print('No final dataset available; skipping post count.')
else:
    print(f"Total number of posts in the dataset: {final_df['num_posts'].sum()}")

Total number of posts in the dataset: 332027


## Notes

- If your CSVs have different column names or date formats, edit the `CONFIG` cell at the top.
- This notebook intentionally leaves intermediate file saves so you can inspect cleaned and processed data.
- Tweak `MIN_FLAG_WORD_HITS` and bot thresholds to fit your data.

---

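**Next steps (optional):** Train supervised classifiers (logistic regression, transformers) on `training_ready_balanced.csv` and evaluate. Use `num_posts` as an additional feature if desired. Be careful with `flag_hits`: it is computed only for QAnon authors and is empty for every Non-QAnon row, so using it as a feature leaks the label.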