In [None]:
# --- repo bootstrap ---------------------------------------------------------
from pathlib import Path
from dotenv import load_dotenv
import os, sys
import subprocess, sys, importlib, os, re
from datetime import datetime
import truthbrush as tb

def repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that marks the repo root.

    A directory qualifies when it contains a ``.env`` file or a ``.git`` entry.

    Args:
        start: Directory to begin the upward search from. It is resolved to an
            absolute path before searching.

    Returns:
        The first qualifying directory, checking ``start`` itself first and then
        each ancestor in turn.

    Raises:
        RuntimeError: If no directory up to and including the filesystem root
            contains a marker.
    """
    here = start.resolve()
    # ``here.parents`` ends with the filesystem root, so the root itself is
    # probed as well (relevant when the repo is mounted at "/" in a container).
    for candidate in (here, *here.parents):
        if (candidate / ".env").exists() or (candidate / ".git").exists():
            return candidate
    raise RuntimeError("repo root not found")

# Resolve the repository root once; every path below is derived from it.
ROOT = repo_root(Path.cwd())
load_dotenv(ROOT / ".env")             # loads secrets

# Make optional helpers under src/ importable. Guarded so that re-running the
# cell does not append the same entry to sys.path again.
_src_dir = str(ROOT / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

DATA_DIR = ROOT / "data"
OUT_DIR  = ROOT / "outputs"
FIG_DIR  = OUT_DIR / "figs"
# parents=True: when outputs/ does not exist yet, mkdir(exist_ok=True) alone
# raises FileNotFoundError because the parent directory is missing.
FIG_DIR.mkdir(parents=True, exist_ok=True)

print("Repo root:", ROOT)

In [None]:
# ────────── UNIVERSAL PATCH CELL  (run once, very top of notebook) ──────────
import subprocess, sys, importlib, os, types
from pathlib import Path

# 1️⃣  make sure both python-dotenv and curl_cffi exist
def ensure(pkg, src=None):
    """Make sure module ``pkg`` can be imported, pip-installing it when it is missing.

    Args:
        pkg: Import name of the module (what would follow ``import``).
        src: pip requirement to install when the import fails. Defaults to
            ``pkg``; pass it when the distribution name differs from the
            import name.

    Raises:
        subprocess.CalledProcessError: If the pip command exits with a
            non-zero status.
    """
    try:
        importlib.import_module(pkg)
    except ModuleNotFoundError:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--quiet", src or pkg]
        )
        # The import system caches directory listings; refresh them so the
        # package installed a moment ago is visible to this running kernel.
        importlib.invalidate_caches()

# The "python-dotenv" distribution is imported as "dotenv". Probing the
# distribution name can never succeed as an import, which would re-run pip on
# every execution of this cell, so probe the import name and install by the
# distribution name.
ensure("dotenv", "python-dotenv")
ensure("curl_cffi")

# 2️⃣  reload .env (override=True guarantees fresh values)
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(usecwd=True), override=True)

# 3️⃣  import truthbrush and expose curl_cffi as an attribute of truthbrush.api,
#     so code in that module referring to the name `curl_cffi` can resolve it
import curl_cffi                     # noqa:  F401  (needed for side-effect)
import truthbrush.api as tb_api
tb_api.curl_cffi = curl_cffi         # hand it to truthbrush's module scope

import truthbrush as tb
print("✔ Patch cell finished – environment refreshed, curl_cffi wired\n")


In [None]:
import truthbrush as tb
from datetime import datetime, timezone
import random, time

# Install a single delay wrapper around Api._get. The saved `_get_base`
# attribute doubles as the "already patched" marker, so re-running this cell
# never stacks a second wrapper on top of the first.
if not hasattr(tb.api.Api, "_get_base"):
    tb.api.Api._get_base = tb.api.Api._get      # save original once

    def _polite_get(self, url, params=None):
        """Call the original ``_get`` and pause before handing back its response.

        When ``ratelimit_remaining`` is 10 or lower and ``ratelimit_reset`` is
        set, sleep until the reset time plus 1-3 s of random jitter; otherwise
        sleep a random 1.5-3 s. Both attributes are read from ``self``
        (presumably filled from the API's rate-limit headers — confirm in
        truthbrush).

        Args:
            url: Forwarded unchanged to the original ``_get``.
            params: Forwarded unchanged to the original ``_get``.

        Returns:
            Whatever the original ``_get`` returned.
        """
        resp = self._get_base(url, params)
        if (self.ratelimit_remaining is not None
                and self.ratelimit_remaining <= 10
                and self.ratelimit_reset):
            # Timezone-aware "now" in UTC; datetime.utcnow() is deprecated.
            # The subtraction needs ratelimit_reset to be timezone-aware too.
            now = datetime.now(timezone.utc)
            wait = max(
                0,
                (self.ratelimit_reset - now).total_seconds()
            ) + random.uniform(1, 3)
            print(f"📉 near limit – sleeping {wait:.1f}s")
            time.sleep(wait)
        else:
            time.sleep(random.uniform(1.5, 3.0))
        return resp

    tb.api.Api._get = _polite_get
    print("✓ polite-delay wrapper installed")
else:
    print("✓ wrapper already present – no re-patch")


In [None]:
import os

# Read the bearer token from the environment (the setup cells load .env)
# instead of committing a live credential to the notebook.
# NOTE(review): the token that used to be hard-coded on this line remains in
# version history and should be revoked.
TOKEN = os.getenv("TRUTHSOCIAL_TOKEN")
if not TOKEN:
    raise RuntimeError(
        "TRUTHSOCIAL_TOKEN is not set – add it to the repo's .env file and "
        "re-run the setup cells"
    )
api   = tb.Api(token=TOKEN)
api.auth_id = api.auth_id or ""
print("client ready")

# Smoke test: one lookup shows whether the token is accepted. Failures are
# printed rather than raised so the cell still finishes.
print("lookup test:")
try:
    print(api.lookup("realDonaldTrump")["id"][:8], "… lookup OK")
except Exception as e:
    print("lookup failed:", e)


In [None]:
# ───────────────── scrape Truth Social & save CSV ──────────────────────────
import os, re, csv, pathlib, random, time, json, sys
from datetime import datetime, timezone
from dateutil import parser as dt_parse
from tqdm.auto import tqdm

# ---------- config ---------------------------------------------------------
# Lower-case substrings; a post is kept when its lower-cased HTML contains any
# one of them.
KEYWORDS = [
    # countries / demonyms
    "russia",
    "russian",
    "ukraine",
    "ukrainian",
    "ru-uk",
    # leaders and seats of power
    "putin",
    "zelensky",
    "zelenskyy",
    "kremlin",
    # places
    "kyiv",
    "crimea",
    "donbas",
    "mariupol",
    "kherson",
    "luhansk",
    "dnipro",
    "odessa",
    # generic conflict terms
    "invasion",
    "war",
]

# Account hints that are resolved to canonical handles before scraping.
SEED_HANDLES = [
    "realDonaldTrump",
    "TeamTrump",
    "TrumpWarRoom",
    "WhiteHouse",
    "PressSec",
]

def keyword_hit(html: str, keywords=None) -> bool:
    """Tell whether ``html`` contains any keyword, ignoring the case of ``html``.

    Matching is plain substring containment, so a short keyword also matches
    inside longer words (for example ``"war"`` matches ``"award"``).

    Args:
        html: Raw post content. ``None`` or an empty string yields ``False``.
        keywords: Optional iterable of lower-case substrings to look for.
            Defaults to the module-level ``KEYWORDS`` list.

    Returns:
        ``True`` if at least one keyword occurs in the lower-cased text.
    """
    if not html:
        return False
    haystack = html.lower()  # lower-case once instead of once per keyword
    terms = KEYWORDS if keywords is None else keywords
    return any(term in haystack for term in terms)

# ---------- resolve handles ------------------------------------------------
def canonical_handle(hint: str) -> str | None:
    try:
        info = api.lookup(user_handle=hint.lstrip("@"))
        return info.get("acct")
    except Exception:
        try:
            page = next(api.search("accounts", hint, limit=1))
            return page["accounts"][0]["acct"] if page["accounts"] else None
        except Exception:
            return None

# Resolve every seed hint; filter(None, ...) drops the ones that came back
# empty or unresolved.
handles = list(filter(None, map(canonical_handle, SEED_HANDLES)))
print("Scanning:", handles)

# ---------- scrape with live progress --------------------------------------
import itertools                      # local import: only this cell needs it

_TAG_RE = re.compile(r"<[^>]+>")      # strips HTML tags; compiled once

hits = []
try:
    for h in handles:
        print(f"\n↳ pulling @{h}")

        # Pull the first post right away so a dead or empty account is
        # reported before the progress bar starts.
        gen = api.pull_statuses(username=h, replies=False, verbose=False)
        print("  … sending first request")
        try:
            first_post = next(gen)
            print("  ✓ first post received")
        except StopIteration:
            print("  ⚠️ no posts found for this account")
            continue

        # Put the already-consumed first post back in front of the stream.
        stream = itertools.chain([first_post], gen)

        matched = 0
        # Context manager: the bar is closed even when the loop is interrupted.
        with tqdm(stream, unit="post", desc=f"{h}", leave=True) as pbar:
            for post in pbar:
                if not (post and post.get("content") and keyword_hit(post["content"])):
                    continue
                matched += 1
                hits.append(
                    {
                        "created_at": post["created_at"],
                        "account"   : h,
                        "id"        : post["id"],
                        "text"      : _TAG_RE.sub("", post["content"]).strip(),
                    }
                )
                # Refresh the label only when the match count has just reached
                # a multiple of 25, not on every non-matching post in between.
                if matched % 25 == 0:
                    pbar.set_description(f"{h}  hits:{matched}")
            pbar.set_description(f"{h}  hits:{matched}")   # final tally
        print(f"✓ @{h}: {matched} matches collected")

except KeyboardInterrupt:
    print("\n⏹ Interrupted by user – proceeding with what we have …")

print(f"\n✓ total matches: {len(hits)}")

# ---------- preview top 5 ---------------------------------------------------
# Show timestamp, account and the first 120 characters of the first five hits.
for preview in hits[:5]:
    stamp = dt_parse.isoparse(preview["created_at"])
    print(f"[{stamp:%Y-%m-%d %H:%M}] @{preview['account']} → {preview['text'][:120]}…")

# ---------- save CSV --------------------------------------------------------
# Use the nearest folder at or above the working directory that holds .git or
# .env. If there is none, stay in the working directory instead of drifting up
# to the filesystem root and writing to /outputs.
_start = pathlib.Path.cwd()
ROOT = next(
    (d for d in (_start, *_start.parents)
     if (d / ".git").exists() or (d / ".env").exists()),
    _start,
)

out_path = ROOT / "outputs" / "trump_ru_uk_truths.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)

# newline="" lets the csv module control line endings itself.
with out_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["created_at", "account", "id", "text"])
    writer.writeheader()
    writer.writerows(hits)

print("📄 CSV written →", out_path.relative_to(ROOT))

In [None]:
# ─────────── merge & dedup Truth-Social match files ────────────
import pandas as pd, hashlib, pathlib, textwrap

# Use the nearest folder at or above the working directory that holds .git or
# .env. If there is none, stay in the working directory instead of drifting up
# to the filesystem root.
_start = pathlib.Path.cwd()
ROOT = next(
    (d for d in (_start, *_start.parents)
     if (d / ".git").exists() or (d / ".env").exists()),
    _start,
)

# Inputs to merge; files that do not exist are reported and skipped.
CSV_FILES = [
    ROOT / "outputs" / "trump_ru_uk_truths.csv",
    ROOT / "outputs" / "new_truth_scrape_matches.csv",
]

frames = []
for p in CSV_FILES:
    if p.exists():
        df = pd.read_csv(p, dtype=str)       # keep ids as str
        frames.append(df)
        print(f"✓ loaded {p.name:<30} rows={len(df):>5}")
    else:
        print(f"⚠️  {p} not found")

if not frames:
    raise FileNotFoundError("No CSVs found – check paths above.")

df_all = pd.concat(frames, ignore_index=True)

# -------- exact-ID dedup ----------------------------------------
before   = len(df_all)
# pandas treats missing values as equal when it looks for duplicates, so
# drop_duplicates(subset="id") would also collapse every row without an id
# into a single row. Only rows that actually carry an id are dropped here.
repeated_id = df_all["id"].notna() & df_all.duplicated(subset="id")
df_all   = df_all[~repeated_id]

# -------- fallback fuzzy key  -----------------------------------
# Sometimes Truthbrush returns slightly different IDs for the same content.
# Build a cheap content hash (first 120 chars lower-cased, account-specific).
def row_key(r):
    """Build a content-based duplicate key for one row.

    The key is the MD5 hex digest of the account name followed by the first
    120 characters of the text, lower-cased. A non-string account or text
    (for example NaN from an empty CSV cell) is treated as an empty string.

    Args:
        r: Mapping-like row with ``"account"`` and ``"text"`` entries.

    Returns:
        A 32-character hexadecimal digest string.
    """
    account = r["account"] if isinstance(r["account"], str) else ""
    snippet = (r["text"][:120] if isinstance(r["text"], str) else "").lower()
    return hashlib.md5((account + snippet).encode()).hexdigest()

# Row count after the exact-ID pass. It has to be captured here, before the
# content-key pass below shrinks the frame further.
after_exact  = len(df_all)

# Content-key pass: drop rows whose account + text-prefix hash repeats.
df_all["dup_key"] = df_all.apply(row_key, axis=1)
df_all = df_all.drop_duplicates(subset="dup_key").drop(columns="dup_key")

after_final  = len(df_all)

print(f"""
rows in concat   : {before}
unique post IDs  : {after_exact}
unique after key : {after_final}
duplicates dropped: {before - after_final}
""".strip())

# save
out_path = ROOT / "outputs" / "truth_matches_merged.csv"
df_all.to_csv(out_path, index=False)
print("💾 merged CSV →", out_path.relative_to(ROOT))

# peek a few rows
print("\n▶ sample rows")
for _, r in df_all.head(5).iterrows():
    # The CSVs are read with dtype=str, which still leaves NaN for empty
    # cells; treat those as empty text instead of calling .replace on a float.
    preview_text = r["text"] if isinstance(r["text"], str) else ""
    print(textwrap.shorten(preview_text.replace("\n", " "), width=100, placeholder="…"))


In [None]:
# ───── monthly conflict-keyword Truths (all accounts) ─────
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

csv_path = ROOT / "outputs" / "truth_matches_merged.csv"
df = pd.read_csv(csv_path)

if df.empty:
    print("⚠️  DataFrame is empty – check the CSV path or merge step.")
else:
    # Parse explicitly as UTC so mixed offsets still give one datetime dtype;
    # values that cannot be parsed become NaT instead of breaking `.dt`.
    df["created_at"] = pd.to_datetime(df["created_at"], utc=True, errors="coerce")
    unparsed = int(df["created_at"].isna().sum())
    if unparsed:
        print(f"⚠️  {unparsed} rows have an unparseable created_at and are left out of the chart")

    # bucket by month; the timezone is removed first because to_period()
    # warns that it discards timezone information on tz-aware values
    df["month"] = df["created_at"].dt.tz_localize(None).dt.to_period("M").astype(str)
    counts = df[df["created_at"].notna()].groupby("month").size()

    if counts.empty:
        print("⚠️  No rows with a usable created_at – nothing to plot.")
    else:
        fig, ax = plt.subplots(figsize=(10, 4))
        counts.plot(kind="bar", ax=ax)
        ax.set_title("Conflict-related Truths per month")
        ax.set_ylabel("Post count")
        plt.setp(ax.get_xticklabels(), rotation=70, ha="right")
        fig.tight_layout()
        plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# ------------------------------------------------------------------- paths
outputs_dir = ROOT / "outputs"
vol_csv = outputs_dir / "truth_matches_merged.csv"
esc_csv = outputs_dir / "headline_scores_partial.csv"

# ------------------------------------------------------------------- volume
# One row per matched post; `day` is the calendar date of `created_at`.
vol = pd.read_csv(vol_csv, parse_dates=["created_at"])
vol = vol.assign(day=vol["created_at"].dt.date)

# Number of posts per day as a one-column frame named `post_count`.
vol_daily = vol.groupby("day").size().to_frame(name="post_count")

# ------------------------------------------------------------------- escalation (updated for new CSV format)
# Now the CSV has columns: 'date', 'source', 'title', 'score'
# The source/title columns suggest one row per headline, i.e. several rows can
# share a date (assumption — confirm against the file). Everything below is
# therefore reduced to one value per calendar day first, so that
#   * the rolling window spans days rather than rows, and
#   * the later join on the date index cannot multiply rows.
esc = pd.read_csv(esc_csv, parse_dates=["date"])
esc["date"] = esc["date"].dt.normalize()        # drop any time-of-day part
esc = esc.set_index("date").sort_index()        # "7D" windows need a sorted index

# If there is already a 'roll7' column, use it; otherwise compute a 7-day rolling average of 'score'
if "roll7" in esc.columns:
    # Mean per day: leaves the values unchanged when each day has one row.
    esc_7d = esc["roll7"].groupby(level=0).mean().rename("escalation_7d")
else:
    daily_score = esc["score"].groupby(level=0).mean()
    esc_7d = (
        daily_score
            .rolling("7D", min_periods=1)       # calendar window, tolerant of missing days
            .mean()
            .rename("escalation_7d")
    )

# ------------------------------------------------------------------- merge & plot
# Outer join keeps every day present in either series; gaps become 0.
df = vol_daily.join(esc_7d, how="outer").fillna(0)

fig, ax1 = plt.subplots(figsize=(11, 4))

# Left axis: daily post volume as bars.
ax1.bar(df.index, df["post_count"], width=1, color="#4a90e2", alpha=0.55,
        label="Trump RU/UA daily count")
ax1.set_ylabel("daily post count", color="#4a90e2")
ax1.tick_params(axis="y", labelcolor="#4a90e2")

# Right axis (sharing x): escalation line.
ax2 = ax1.twinx()
ax2.plot(df.index, df["escalation_7d"], color="crimson", linewidth=2,
         label="7-day mean escalation")
ax2.set_ylabel("escalation index (0-10)", color="crimson")
ax2.tick_params(axis="y", labelcolor="crimson")

plt.title("Trump attention to RU/UA vs. conflict escalation, 2022-2025")

# One shared legend: collect the entries of both axes, left axis first.
legend_handles, legend_labels = [], []
for axis in (ax1, ax2):
    axis_handles, axis_labels = axis.get_legend_handles_labels()
    legend_handles.extend(axis_handles)
    legend_labels.extend(axis_labels)

ax1.legend(legend_handles, legend_labels, loc="upper left")

plt.tight_layout()
plt.show()


In [None]:
# ╔══════════════════════════════════════════════════════════════════╗
# ║  Build GOLD-SET CANDIDATE sample for Truth-Social posts         ║
# ╚══════════════════════════════════════════════════════════════════╝
import pandas as pd, numpy as np
from pathlib import Path

# Locate the repo root by its .git/.env marker, as the cells that wrote the
# merged CSV do, so this cell reads from the same outputs/ folder. If no
# marker is found, fall back to the parent of the working directory.
_here     = Path.cwd().resolve()
ROOT      = next(
    (d for d in (_here, *_here.parents)
     if (d / ".git").exists() or (d / ".env").exists()),
    _here.parent,
)
ALL_CSV   = ROOT / "outputs" / "truth_matches_merged.csv"
GOLD_CSV  = ROOT / "outputs" / "truth_gold_candidates.csv"

N_SAMPLES = 500                          # total sample size
TIME_BINS = 8                            # split full range into equal bins

# ── load & sanity check ─────────────────────────────────────────────
df = pd.read_csv(ALL_CSV, parse_dates=["created_at"])
if df.empty:
    raise RuntimeError("💥 merged Truth CSV seems empty – aborting")

# ── create time-bins (equal-length) ─────────────────────────────────
df = df.sort_values("created_at")

# Integer timestamps via astype (Series.view is deprecated). Missing
# timestamps are masked to NaN so they affect neither the range nor a bin.
stamp = df["created_at"].astype("int64").where(df["created_at"].notna())
first_stamp, last_stamp = stamp.min(), stamp.max()

# assign a bin label 0 … TIME_BINS-1
if first_stamp == last_stamp:
    # A single distinct timestamp gives identical bin edges, which pd.cut
    # rejects; everything with a timestamp goes into bin 0 instead.
    df["time_bin"] = np.where(stamp.notna(), 0.0, np.nan)
else:
    # Edges come from the same integer series that is binned, so both sides
    # use the same unit.
    cut_edges = np.linspace(first_stamp, last_stamp, TIME_BINS + 1)
    df["time_bin"] = pd.cut(stamp, bins=cut_edges, labels=False, include_lowest=True)

# ── stratified sample: time_bin × account ──────────────────────────
samples = []
# Split N_SAMPLES exactly across the bins: every bin gets the floor share and
# the first `extra` bins get one more. A ceil() quota per bin would add up to
# more than N_SAMPLES (8 × 63 = 504 for 500 samples).
base_quota, extra = divmod(N_SAMPLES, TIME_BINS)

for bin_id, grp in df.groupby("time_bin"):
    per_bin = base_quota + (1 if int(bin_id) < extra else 0)
    # target size in this bin (a sparse bin contributes all it has)
    n = min(per_bin, len(grp))
    # proportional by account count
    acc_counts = grp["account"].value_counts(normalize=True)
    wants = (acc_counts * n).round().astype(int)
    if wants.empty:
        # no usable account values in this bin – nothing to allocate
        continue

    # adjust rounding drift on the currently largest allocation
    while wants.sum() < n:
        wants.loc[wants.idxmax()] += 1
    while wants.sum() > n:
        wants.loc[wants.idxmax()] -= 1

    # sample inside each account slice
    for acc, k in wants.items():
        slice_ = grp[grp["account"] == acc]
        k = min(k, len(slice_))          # never ask for more rows than exist
        samples.append(slice_.sample(k, random_state=42))

gold_df = (pd.concat(samples)
             .sort_values("created_at")
             .reset_index(drop=True))

gold_df.to_csv(GOLD_CSV, index=False)
print(f"🎯 gold-set candidates written → {GOLD_CSV.relative_to(ROOT)}   "
      f"(rows={len(gold_df)})")

# quick peek
print("\n▶ random preview")
# sample() raises when asked for more rows than exist, so cap the preview size.
for _, r in gold_df.sample(min(5, len(gold_df)), random_state=1).iterrows():
    # an empty CSV cell is read back as NaN, which cannot be sliced
    preview_text = r.text if isinstance(r.text, str) else ""
    print(f"[{r.created_at:%Y-%m-%d}] @{r.account}: {preview_text[:110]}…")


In [None]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  FILTER Truth Social posts for Ukraine-Russia war relevance           ║
# ╚══════════════════════════════════════════════════════════════════════╝
from pathlib import Path
import json, time, pandas as pd, tqdm, re
import anthropic
from dotenv import load_dotenv
import os
import requests
import logging

load_dotenv()

# Configuration
# Locate the repo root by its .git/.env marker, as the cells that wrote the
# merged CSV do, so this cell reads from the same outputs/ folder. If no
# marker is found, fall back to the parent of the working directory.
_here = Path.cwd().resolve()
ROOT = next(
    (d for d in (_here, *_here.parents)
     if (d / ".git").exists() or (d / ".env").exists()),
    _here.parent,
)
INPUT_CSV = ROOT / "outputs" / "truth_matches_merged.csv"
OUTPUT_CSV = ROOT / "outputs" / "truth_ukraine_filtered.csv"
MODEL = "claude-3-5-haiku-20241022"  # Cheap and fast for filtering

# Filtering prompt
# Sent as the system prompt of every batch request. It asks for a single-digit
# answer (1 = about the Russia-Ukraine war, 0 = not); the result-parsing step
# accepts only the replies "0" and "1".
FILTER_PROMPT = """You are filtering social media posts to identify which ones are about the Russia-Ukraine war.

A post is RELEVANT (score 1) if it:
- Directly mentions the Russia-Ukraine war, conflict, or invasion
- Discusses military actions between Russia and Ukraine
- Mentions Ukrainian or Russian leaders in context of the war
- Discusses weapons, aid, or sanctions related to the conflict
- References peace talks, negotiations, or ceasefire between Russia and Ukraine

A post is NOT RELEVANT (score 0) if it:
- Only mentions Russia or Ukraine in passing without war context
- Discusses other conflicts (Israel, Syria, etc.) without Ukraine connection
- Is about domestic US politics without Ukraine war connection
- Mentions "invasion" referring to immigration/borders, not Ukraine
- Uses war metaphors for non-Ukraine topics

Respond with ONLY a single digit: 1 for relevant, 0 for not relevant.

Examples:
"Putin is bombing Ukrainian cities again" → 1
"Biden's border invasion must stop" → 0
"We need to send more weapons to Ukraine" → 1
"Trump will end all wars including Ukraine" → 1
"China tariffs are destroying our economy" → 0"""

# Read the merged posts that are about to be classified.
df = pd.read_csv(INPUT_CSV, parse_dates=["created_at"])
post_total = len(df)
print(f"📊 Processing {post_total} Truth Social posts for Ukraine relevance")

# API client; the key is read from the environment.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Raise the log threshold of the HTTP layer and the SDK to WARNING.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("anthropic").setLevel(logging.WARNING)

# Running number per row; it becomes the request's custom_id and is used to
# map scores back onto the frame.
df = df.assign(batch_idx=range(post_total))

# Build one batch request per post that has non-blank text. Posts without
# text are skipped and therefore never receive a score.
requests_list = []
for _, record in df.iterrows():
    post_text = record.get("text")
    if pd.isna(post_text) or not str(post_text).strip():
        continue

    user_message = {"role": "user", "content": str(post_text)[:1000]}  # first 1000 chars only
    requests_list.append(
        {
            "custom_id": str(record["batch_idx"]),
            "params": {
                "model": MODEL,
                "max_tokens": 5,
                "temperature": 0,
                "system": FILTER_PROMPT,
                "messages": [user_message],
            },
        }
    )

print(f"📝 Prepared {len(requests_list)} requests for filtering")

# Submit all requests as a single batch.
batch = client.messages.batches.create(requests=requests_list)
print(f"🚀 Launched batch {batch.id}")

# Poll every 5 seconds until the batch reports "ended"; the bar shows how many
# requests have reached a final state (succeeded, errored, canceled, expired).
bar = tqdm.tqdm(total=len(requests_list), desc="Filtering", unit="post")
start_time = time.time()
while True:
    batch_status = client.messages.batches.retrieve(batch.id)
    counts = batch_status.request_counts
    completed = sum((counts.succeeded, counts.errored, counts.canceled, counts.expired))
    bar.n = completed
    bar.refresh()

    if batch_status.processing_status == "ended":
        bar.close()
        break

    time.sleep(5)

elapsed_time = time.time() - start_time
print(f"✅ Filtering complete in {elapsed_time/60:.1f} minutes")

# Parse results
relevance_scores = {}   # batch_idx -> 0 or 1
errors = []             # human-readable notes for requests that gave no score

# Read the results through the SDK rather than fetching results_url by hand:
# if the results cannot be retrieved the call raises, instead of silently
# leaving every post unclassified and writing an empty output file.
print(f"📥 Fetching results from batch {batch.id}")
for entry in client.messages.batches.results(batch.id):
    custom_id = entry.custom_id

    if entry.result.type != "succeeded":
        errors.append(f"Request {custom_id} failed ({entry.result.type})")
        continue

    try:
        idx = int(custom_id)
        # Extract the response text of the first content block
        message_content = entry.result.message.content[0].text.strip()
    except (ValueError, TypeError, IndexError, AttributeError) as e:
        errors.append(f"Error parsing result {custom_id}: {e}")
        continue

    # Parse the score (should be just "0" or "1")
    if message_content in ("0", "1"):
        relevance_scores[idx] = int(message_content)
    else:
        errors.append(f"Invalid response for {custom_id}: {message_content}")

# Map scores back to dataframe; rows without a score stay NaN
df["is_ukraine_relevant"] = df["batch_idx"].map(relevance_scores)

# Filter to only relevant posts
df_relevant = df[df["is_ukraine_relevant"] == 1].copy()
df_irrelevant = df[df["is_ukraine_relevant"] == 0].copy()
# Rows that received no score: blank text (never submitted), failed requests,
# or replies other than "0"/"1".
n_unclassified = int(df["is_ukraine_relevant"].isna().sum())

# Save filtered results
df_relevant.drop(columns=["batch_idx", "is_ukraine_relevant"]).to_csv(OUTPUT_CSV, index=False)

# Summary statistics
total_posts = len(df)
pct_base = max(total_posts, 1)   # avoids division by zero on an empty input
print(f"\n📊 Filtering Results:")
print(f"   Total posts: {total_posts}")
print(f"   Ukraine-relevant: {len(df_relevant)} ({len(df_relevant)/pct_base*100:.1f}%)")
print(f"   Not relevant: {len(df_irrelevant)} ({len(df_irrelevant)/pct_base*100:.1f}%)")
print(f"   Unclassified (blank, failed or invalid reply): {n_unclassified}")

if errors:
    print(f"\n⚠️  Errors encountered: {len(errors)}")
    for error in errors[:5]:
        print(f"   - {error}")

# Show examples of filtered out posts
print("\n🔍 Examples of posts filtered OUT as not Ukraine-related:")
for _, row in df_irrelevant.head(5).iterrows():
    print(f"   - {row['text'][:100]}...")

print(f"\n💾 Filtered data saved to: {OUTPUT_CSV}")