## Reddit Sentiment Analysis Test

In [None]:
import dotenv, os
dotenv.load_dotenv()        # picks up .env in the working dir (or parent)
# Sanity-check: show only a short prefix of the client id. os.getenv returns
# None for an unset variable, so guard before slicing instead of crashing
# with a TypeError.
_client_id = os.getenv("REDDIT_CLIENT_ID")
print(_client_id[:4] if _client_id else "REDDIT_CLIENT_ID is not set")

In [None]:
# --- .env loader that works in notebooks ------------------------------------
from pathlib import Path
from dotenv import load_dotenv
import os, sys

def find_repo_root(start: Path, marker=".git") -> Path:
    """Return the nearest ancestor of *start* (inclusive) that contains a marker.

    Args:
        start: Directory to begin the upward search from; it is resolved to an
            absolute path first.
        marker: Name of the file/folder that identifies the root (e.g. '.git'
            or '.env'). May also be an iterable of names, in which case the
            first directory containing ANY of them is returned.

    Returns:
        The first directory, walking from *start* up to and including the
        filesystem root, that contains a marker.

    Raises:
        FileNotFoundError: if no directory on the way up contains a marker.
    """
    # Accept a single name or several names without changing the call style.
    markers = (marker,) if isinstance(marker, (str, Path)) else tuple(marker)

    origin = start.resolve()
    # `parents` ends with the filesystem root, so the root itself is checked too.
    for candidate in (origin, *origin.parents):
        if any((candidate / name).exists() for name in markers):
            return candidate
    raise FileNotFoundError(f"Repository root with {marker} not found from {start}")

# 1) locate repo root: the nearest ancestor folder that contains a .env file
repo_root = find_repo_root(Path.cwd(), ".env")

# 2) load environment variables from that .env
load_dotenv(repo_root / ".env")

# 3) add src/ to Python path (optional, if you’ll import from src/).
#    Skip it when already present so re-running this cell does not pile up
#    duplicate sys.path entries.
src_path = repo_root / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# 4) fetch secrets (raise fast if any missing)
REQUIRED = ["OPENAI_API_KEY", "NEWSAPI_KEY"]
CREDS = {k: os.getenv(k) for k in REQUIRED}
missing = [k for k, v in CREDS.items() if not v]
if missing:
    raise RuntimeError(f"Missing secrets in .env: {', '.join(missing)}")

# handy variables
OPENAI_KEY       = CREDS["OPENAI_API_KEY"]
NEWSAPI_KEY      = CREDS["NEWSAPI_KEY"]
# The Reddit values are not in REQUIRED, so each of these may be None.
REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET    = os.getenv("REDDIT_CLIENT_SECRET")
REDDIT_AGENT     = os.getenv("REDDIT_USER_AGENT")

print(f"✅  .env loaded from {repo_root}")

In [None]:
import os, datetime as dt, json, time, textwrap, math
from dotenv import load_dotenv
import pandas as pd
from tqdm.auto import tqdm
import openai, praw

# Map each praw.Reddit keyword argument to the environment variable holding it,
# then build the client from whatever values are currently set.
_reddit_env_vars = {
    "client_id": "REDDIT_CLIENT_ID",
    "client_secret": "REDDIT_CLIENT_SECRET",
    "user_agent": "REDDIT_USER_AGENT",
}
reddit = praw.Reddit(
    **{kwarg: os.getenv(var) for kwarg, var in _reddit_env_vars.items()}
)

# ----------  OpenAI prompt formatter ----------
# System message for the sentiment rater, written one prompt line per literal
# so that exact whitespace (including the trailing space on the second line)
# is visible in the source.
SYSTEM_PROMPT = "\n".join((
    "You are an analyst measuring public sentiment toward",
    "U.S./NATO military support for Ukraine. ",
    "Rate the text on a 5-point scale:",
    " -2 = strongly against Western aid,",
    " -1 = somewhat against,",
    "  0 = neutral/unclear,",
    " +1 = somewhat supportive,",
    " +2 = strongly supportive.",
    "Return ONLY the integer.",
))

def get_sentiment(text: str, model="gpt-4o-mini") -> int:
    """Ask the chat model to rate *text* on the scale defined in SYSTEM_PROMPT.

    Args:
        text: Post or comment text; only the first 4000 characters are sent.
        model: Name of the OpenAI chat model to query.

    Returns:
        The integer rating parsed from the model's reply, between -2 and +2.

    Raises:
        ValueError: if the reply is empty, is not an integer, or falls outside
            the -2..+2 scale.
    """
    resp = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": text[:4000]},   # keep under limit
        ],
        temperature=0,
        max_tokens=2,
    )
    # The message content may be None; treat that like an empty reply so the
    # failure surfaces as a descriptive ValueError rather than AttributeError.
    raw = (resp.choices[0].message.content or "").strip()
    try:
        score = int(raw)
    except ValueError as err:
        raise ValueError(f"Model reply is not an integer score: {raw!r}") from err
    # Reject off-scale numbers instead of letting them skew later statistics.
    if not -2 <= score <= 2:
        raise ValueError(f"Model reply is outside the -2..+2 scale: {raw!r}")
    return score


### Pull Small Trial of Posts & Comments

In [None]:
# -------- parameters you can tweak ----------
SUBREDDITS   = ["ukraine", "worldnews", "geopolitics"]
KEYWORDS     = ["nato", "western aid", "military support", "leopard", "patriot"]
# Naive datetimes expressed in UTC. The clock is read once so the window is
# exactly three days wide; datetime.utcnow() is deprecated since Python 3.12,
# hence the aware "now" with the tzinfo stripped.
END   = dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)
START = END - dt.timedelta(days=3)           # last 3 days
LIMIT_PER_SUB = 150                          # max search results requested per subreddit
# --------------------------------------------

# Confirm the Reddit settings are visible to this kernel. The client secret is
# never echoed (notebook output is saved with the file); only its presence and
# length are reported. The other values are shown via repr so stray whitespace
# or a missing value (None) is easy to spot.
for k in ("REDDIT_CLIENT_ID", "REDDIT_CLIENT_SECRET", "REDDIT_USER_AGENT"):
    value = os.getenv(k)
    if "SECRET" in k and value:
        shown = f"<set, {len(value)} chars>"
    else:
        shown = repr(value)
    print(f"{k} →", shown)

# Smoke test: fetch the three newest r/worldnews hits for one keyword.
for p in reddit.subreddit("worldnews").search(
        '"military support"', sort="new", limit=3):
    print(p.created_utc, p.title[:80])

def _naive_utc(epoch_seconds):
    """Convert a Unix timestamp to a naive datetime expressed in UTC.

    Yields the same values as the deprecated datetime.utcfromtimestamp().
    """
    return dt.datetime.fromtimestamp(epoch_seconds, dt.timezone.utc).replace(tzinfo=None)


# The query does not depend on the subreddit, so build it once up front.
query = " OR ".join(f'"{k}"' for k in KEYWORDS)

rows = []
for sub in SUBREDDITS:
    sr = reddit.subreddit(sub)
    for post in sr.search(query, sort="new", time_filter="all", limit=LIMIT_PER_SUB):
        created = _naive_utc(post.created_utc)
        # The search is not date-bounded, so filter to [START, END) here.
        if not (START <= created < END):
            continue
        rows.append(
            {"id": post.id,
             "created": created,
             "sub": sub,
             "type": "post",
             "text": post.title + "\n" + (post.selftext or "")}
        )
        # limit=0 drops the "load more comments" stubs without fetching them.
        post.comments.replace_more(limit=0)
        rows.extend(
            {"id": com.id,
             "created": _naive_utc(com.created_utc),
             "sub": sub,
             "type": "comment",
             "text": com.body}
            for com in post.comments.list()[:50]          # cap for demo
        )

# Explicit columns keep the frame's schema stable even when nothing matched.
df = pd.DataFrame(rows, columns=["id", "created", "sub", "type", "text"])
print(f"Collected {len(df):,} rows")

In [None]:
# Only the final expression of a cell is rendered automatically, so the
# preview is printed explicitly; the per-subreddit counts stay as the cell's
# displayed value.
print(df.head(3)[["created","sub","type","text"]])
df["sub"].value_counts()

In [None]:
from datetime import datetime, timedelta

from tqdm.auto import tqdm
import time, datetime as dt

ONE_MONTH = timedelta(days=31)

def month_windows(start, end, step=ONE_MONTH):
    """Yield consecutive (window_start, window_end) pairs covering [start, end).

    Windows are produced newest first, each at most *step* long. The oldest
    window is clipped at *start*, so no window reaches back before the
    requested range.

    Args:
        start: Lower bound of the overall range.
        end: Upper bound of the overall range.
        step: Width of each window; defaults to ONE_MONTH (31 days).

    Yields:
        (window_start, window_end) tuples; nothing when end <= start.

    Raises:
        ValueError: if *step* is not positive (the walk would never finish).
    """
    if step <= timedelta(0):
        raise ValueError("step must be a positive timedelta")
    upper = end
    while upper > start:
        lower = max(start, upper - step)   # clip the final (oldest) window
        yield (lower, upper)
        upper = lower

def collect_window(sub, query, t0, t1, limit=500):
    """Collect up to *limit* posts from one subreddit created within [t0, t1).

    Args:
        sub: Subreddit name.
        query: Search query string passed to the subreddit search.
        t0: Inclusive lower bound, a naive datetime compared against each
            post's creation time converted to naive UTC.
        t1: Exclusive upper bound, same convention as *t0*.
        limit: Maximum number of rows to return; a non-positive value returns
            an empty list without issuing any request.

    Returns:
        A list of dicts with keys "id", "created", "sub", "type" and "text".
    """
    if limit <= 0:
        return []

    # NOTE(review): the search call itself is not time-bounded; each call pages
    # from the newest result and posts outside [t0, t1) are discarded here.
    rows, after = [], None
    with tqdm(total=limit,
              desc=f"{sub:<15} {t0:%Y-%m}",
              leave=False,
              unit="row") as bar:
        while len(rows) < limit:
            batch = list(
                reddit.subreddit(sub).search(
                    query,
                    sort="new",
                    limit=100,
                    params={"after": after} if after else {}
                )
            )
            if not batch:
                break                      # no more results to page through
            for post in batch:
                # Same naive-UTC value as the deprecated utcfromtimestamp().
                ts = dt.datetime.fromtimestamp(
                    post.created_utc, dt.timezone.utc
                ).replace(tzinfo=None)
                if not (t0 <= ts < t1):
                    continue
                rows.append({
                    "id": post.id,
                    "created": ts,
                    "sub": sub,
                    "type": "post",
                    "text": post.title + "\n" + (post.selftext or "")
                })
                bar.update(1)
                if len(rows) >= limit:
                    break
            # Resume the next page after the last item of this batch.
            after = batch[-1].fullname
    return rows



In [None]:
SUBREDDITS = [
    "worldnews","geopolitics","news",
    "ukraine","RussiaUkraineWar2022",
    "UkrainianConflict","RussiaLago"
]
KEYWORDS = ["nato","western aid","military aid","f-16","leopard","patriot"]
query = " OR ".join(f'"{k}"' for k in KEYWORDS)

# Naive datetimes expressed in UTC; the clock is read once so the range is
# exactly 365 days (datetime.utcnow() is deprecated since Python 3.12).
END   = datetime.now(dt.timezone.utc).replace(tzinfo=None)
START = END - timedelta(days=365)

# The windows are identical for every subreddit, so compute them once.
windows = list(month_windows(START, END))

rows = []
for sub in tqdm(SUBREDDITS, desc="Subreddits"):
    for t0, t1 in tqdm(windows,
                       desc=f"{sub} months", leave=False):
        rows.extend(collect_window(sub, query, t0, t1))
        time.sleep(0.5)          # polite pause

# Explicit columns keep the CSV header intact even when nothing was collected.
df = pd.DataFrame(rows, columns=["id", "created", "sub", "type", "text"])
df.to_csv("reddit_raw_v1.csv", index=False)
print("Total rows:", len(df))

In [None]:
# None of these previews is the cell's final expression, so each one is
# printed explicitly; otherwise its value would be computed and discarded.
print(len(df))                    # how many rows survived
print(df.head(5))                 # first five
print(df.tail(5))                 # last five
print(df["sub"].value_counts())

# 1.  give the current DataFrame a permanent home
#     (to_parquet needs a parquet engine such as pyarrow; the CSV alternative
#     is df.to_csv("reddit_raw_v1.csv", index=False))
df.to_parquet("reddit_raw_v1.parquet")

# 2.  quick sanity-peek: read the file back and report its row count
import pandas as pd
peek = pd.read_parquet("reddit_raw_v1.parquet")
print(len(peek), "rows on disk")


### Grab Quick Review Set for Prompt Engineering

In [None]:
# ------------------------------------------------------------------
# Reload clean file so we start from the same baseline every session
# ------------------------------------------------------------------
import pandas as pd, pathlib

DATA_FNAME = "reddit_raw_v1.csv"          # or .parquet if you installed pyarrow

# Choose the reader from the file extension: CSV for ".csv", parquet otherwise.
_read_raw = pd.read_csv if DATA_FNAME.endswith(".csv") else pd.read_parquet
df = _read_raw(DATA_FNAME)

print("Loaded", len(df), "rows from disk")


In [None]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt
from pathlib import Path

# --- 1. make sure dates are real datetime ------------------------
# Unparseable values become NaT and those rows are then removed.
parsed_created = pd.to_datetime(df["created"], errors="coerce")
df["created"] = parsed_created
df = df.dropna(subset=["created"])

# plain string “YYYY-MM” for robust comparisons
month_key = df["created"].dt.strftime("%Y-%m")
df["ym"] = month_key

# quick overview
subs_seen = df["sub"].unique()
first_month = df["ym"].min()
last_month = df["ym"].max()
row_count = len(df)
print("unique subs :", subs_seen)
print("months span:", first_month, "→", last_month)
print("rows        :", row_count)

# ---- 2. balanced 50-row sampler ---------------------------------
# Stratify by (subreddit, month): every stratum contributes at least one row
# and otherwise a share of TARGET proportional to its size.
TARGET = 50
grouped = df.groupby(["sub", "ym"])
strata = grouped.size().reset_index(name="n")

rng      = np.random.default_rng(42)
indices  = []

N = len(df)                       # total rows available for sampling

# Iterate the groups directly instead of rebuilding a boolean mask over the
# whole frame for every stratum.
for _, grp in grouped:
    grp_idx = grp.index

    share = len(grp_idx) / N
    want  = max(1, int(np.ceil(TARGET * share)))
    want  = min(want, len(grp_idx))          # never ask for more than exists

    pick  = rng.choice(grp_idx, size=want, replace=False)
    indices.extend(pick)

print("collected", len(indices), "rows before trim")

# Rounding up per stratum usually overshoots TARGET, so trim back down. When
# fewer than TARGET rows exist, keep them all rather than letting
# sample() raise.
review_df = (df.loc[indices]
               .sample(n=min(TARGET, len(indices)), random_state=1)   # final trim/shuffle
               .reset_index(drop=True))

print("✅ review_df length =", len(review_df))

review_df[["created","sub","type","text"]].to_csv("reddit_review_set.csv", index=False)

print("✅ review set saved → reddit_review_set.csv")


# --- 3. quick bar-chart -----------------------------------------
# Horizontal bars of review-sample rows per subreddit, smallest count first.
rows_per_sub = review_df.groupby("sub").size().sort_values()
rows_per_sub.plot.barh(figsize=(6, 3), alpha=.7)
plt.title("Rows per subreddit in review sample")
plt.tight_layout()
plt.show()


### Quick Pipeline for Tagging Test

In [None]:
# --- Step 1: write review set for manual labeling -----------------
# Path of the review CSV to label by hand; this cell only names it and prints
# the labeling instructions.
REVIEW_FPATH = "reddit_review_set.csv"
labeling_hint = (
    "➡️  Open this file in Excel/Sheets, add a column 'label_human' (–2…+2),",
    "and save.  15–20 min of quick coding.",
)
print(*labeling_hint)