In [None]:
# --- repo bootstrap ---------------------------------------------------------
from pathlib import Path
from dotenv import load_dotenv
import os, sys
import subprocess, sys, importlib, os, re
from datetime import datetime
import truthbrush as tb
import sqlite3

def repo_root(start: Path) -> Path:
    """Return the nearest ancestor of ``start`` (inclusive) that marks the repo root.

    A directory qualifies when it contains a ``.env`` file or a ``.git`` entry.

    Args:
        start: Directory from which the upward search begins.

    Returns:
        The resolved path of the first qualifying directory.

    Raises:
        RuntimeError: If no directory, up to and including the filesystem
            root, contains either marker.
    """
    origin = start.resolve()
    # `Path.parents` ends with the filesystem root, so the root directory
    # itself is inspected as well.
    for candidate in (origin, *origin.parents):
        if (candidate / ".env").exists() or (candidate / ".git").exists():
            return candidate
    raise RuntimeError("repo root not found")

# Locate the repository and load its secrets before anything else runs.
ROOT = repo_root(Path.cwd())
load_dotenv(ROOT / ".env")             # loads secrets

# Expose optional helper modules under src/ without stacking a duplicate
# sys.path entry every time this cell is re-run.
_src_dir = str(ROOT / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

DATA_DIR = ROOT / "data"
OUT_DIR  = ROOT / "outputs"
FIG_DIR  = OUT_DIR / "figs"
# parents=True: on a fresh checkout outputs/ may not exist yet, and a plain
# mkdir() of outputs/figs would raise FileNotFoundError.
FIG_DIR.mkdir(parents=True, exist_ok=True)

print("Repo root:", ROOT)

In [None]:
# ────────── UNIVERSAL PATCH CELL  (run once, very top of notebook) ──────────
import subprocess, sys, importlib, os, types
from pathlib import Path

# 1️⃣  make sure both python-dotenv and curl_cffi exist
def ensure(pkg, src=None):
    """Import ``pkg``; if it is missing, pip-install it for the running interpreter.

    Args:
        pkg: Importable module name. This can differ from the name of the
            distribution that provides it.
        src: Requirement string handed to pip. Defaults to ``pkg``.

    Raises:
        subprocess.CalledProcessError: If the pip invocation exits non-zero.
    """
    try:
        importlib.import_module(pkg)
    except ModuleNotFoundError:
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "--quiet", src or pkg]
        )
        # The import system caches directory listings; refresh them so the
        # package installed a moment ago can be imported in this session.
        importlib.invalidate_caches()

# The python-dotenv distribution is imported as `dotenv`.  Probing for the
# module name "python-dotenv" can never succeed, which made pip run on every
# execution of this cell; pass the import name and the pip name separately.
ensure("dotenv", "python-dotenv")
ensure("curl_cffi")

# 2️⃣  reload .env (override=True guarantees fresh values)
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(usecwd=True), override=True)

# 3️⃣  import truthbrush and inject curl_cffi so NameError can’t happen
import curl_cffi
import truthbrush.api as tb_api
tb_api.curl_cffi = curl_cffi         # hand it to truthbrush’s module scope

import truthbrush as tb
print("✔ Patch cell finished – environment refreshed, curl_cffi wired\n")


In [None]:
import truthbrush as tb
from datetime import datetime, timezone
import random, time

# Install a SINGLE rate-limit-aware wrapper around Api._get.  The presence of
# `_get_base` marks the class as already patched, so re-running this cell
# never wraps the wrapper.
if not hasattr(tb.api.Api, "_get_base"):
    tb.api.Api._get_base = tb.api.Api._get      # save original once

    def _polite_get(self, url, params=None):
        """Call the original ``_get`` and pause before handing back its response.

        When the client reports 10 or fewer remaining requests and has a
        reset time, sleep until that reset plus 1-3 s of jitter; otherwise
        sleep a random 1.5-3 s.

        Args:
            url: Forwarded unchanged to the original ``_get``.
            params: Forwarded unchanged to the original ``_get``.

        Returns:
            Whatever the original ``_get`` returned.
        """
        resp = self._get_base(url, params)
        if (self.ratelimit_remaining is not None
                and self.ratelimit_remaining <= 10
                and self.ratelimit_reset):
            # datetime.now(timezone.utc) yields the same aware "now" as the
            # deprecated utcnow().replace(tzinfo=...) idiom.
            # NOTE(review): assumes ratelimit_reset is a tz-aware datetime – confirm.
            seconds_to_reset = (
                self.ratelimit_reset - datetime.now(timezone.utc)
            ).total_seconds()
            wait = max(0, seconds_to_reset) + random.uniform(1, 3)
            print(f"📉 near limit – sleeping {wait:.1f}s")
            time.sleep(wait)
        else:
            time.sleep(random.uniform(1.5, 3.0))
        return resp

    tb.api.Api._get = _polite_get
    print("✓ polite-delay wrapper installed")
else:
    print("✓ wrapper already present – no re-patch")


In [None]:
# Read the Truth Social token from the environment (e.g. the repo's .env)
# instead of embedding it in the notebook.
# SECURITY: a literal token used to live in this cell; consider it leaked
# and revoke/rotate it.
import os

TOKEN = os.environ.get("TRUTHSOCIAL_TOKEN")
if not TOKEN:
    raise RuntimeError(
        "TRUTHSOCIAL_TOKEN is not set – add it to .env and re-run the setup cells"
    )

api   = tb.Api(token=TOKEN)
api.auth_id = api.auth_id or ""      # replace a falsy auth_id with an empty string
print("client ready")

# Smoke test: one lookup confirms the token is accepted.
print("lookup test:")
try:
    print(api.lookup("realDonaldTrump")["id"][:8], "… lookup OK")
except Exception as e:
    print("lookup failed:", e)


In [None]:
# ───────────────── scrape Truth Social & save CSV ──────────────────────────
import os, re, csv, pathlib, random, time, json, sys
from datetime import datetime, timezone
from dateutil import parser as dt_parse
from tqdm.auto import tqdm

# ---------- config ---------------------------------------------------------
# Lower-case substrings that mark a post as conflict-related.
KEYWORDS = [
    "russia",
    "russian",
    "ukraine",
    "ukrainian",
    "ru-uk",
    "putin",
    "zelensky",
    "zelenskyy",
    "kremlin",
    "kyiv",
    "crimea",
    "donbas",
    "mariupol",
    "kherson",
    "luhansk",
    "dnipro",
    "odessa",
    "invasion",
    "war",
]
# Account hints that are resolved to canonical handles before scraping.
SEED_HANDLES = [
    "realDonaldTrump",
    "TeamTrump",
    "TrumpWarRoom",
    "WhiteHouse",
    "PressSec",
]


def keyword_hit(html: str) -> bool:
    """Tell whether any ``KEYWORDS`` entry occurs in ``html``, ignoring case.

    This is a plain substring search on the raw markup, so a short keyword
    such as "war" also matches inside longer words like "award".
    """
    haystack = html.lower()
    for needle in KEYWORDS:
        if needle in haystack:
            return True
    return False

# ---------- resolve handles ------------------------------------------------
def canonical_handle(hint: str) -> str | None:
    """Resolve an account hint to the ``acct`` value reported by the API.

    A direct lookup is tried first. Only if that lookup raises is the
    account search consulted, using the first account on its first page.
    Any failure of the fallback yields ``None``.
    """
    try:
        return api.lookup(user_handle=hint.lstrip("@")).get("acct")
    except Exception:
        pass  # lookup failed – fall back to the account search below

    try:
        first_page = next(api.search("accounts", hint, limit=1))
        found = first_page["accounts"]
        if found:
            return found[0]["acct"]
        return None
    except Exception:
        return None

from itertools import chain

handles = [h for h in (canonical_handle(x) for x in SEED_HANDLES) if h]
print("Scanning:", handles)

# ---------- scrape with live progress --------------------------------------
# `hits` accumulates one dict per matching post across all accounts.
hits = []
try:
    for h in handles:
        print(f"\n↳ pulling @{h}")
        matched = 0
        try:
            # Pull the first post eagerly so a dead account or connection
            # shows up before the progress bar starts.
            gen = api.pull_statuses(username=h, replies=False, verbose=False)
            print("  … sending first request")
            try:
                first_post = next(gen)
                print("  ✓ first post received")
            except StopIteration:
                print("  ⚠️ no posts found for this account")
                continue

            # chain() puts the already-consumed first post back in front.
            pbar = tqdm(chain([first_post], gen), unit="post", desc=f"{h}", leave=True)
            try:
                for post in pbar:
                    if post and post.get("content") and keyword_hit(post["content"]):
                        matched += 1
                        hits.append(
                            {
                                "created_at": post["created_at"],
                                "account"   : h,
                                "id"        : post["id"],
                                "text"      : re.sub(r"<[^>]+>", "", post["content"]).strip(),
                            }
                        )
                        # Refresh the label only when a new multiple of 25
                        # matches is reached, not on every scanned post.
                        if matched % 25 == 0:
                            pbar.set_description(f"{h}  hits:{matched}")
            finally:
                pbar.close()
        except Exception as exc:
            # One failing account must not discard what was already
            # collected; report it and carry on with the next handle.
            print(f"  ⚠️ @{h} failed after {matched} matches: {exc}")
            continue
        print(f"✓ @{h}: {matched} matches collected")

except KeyboardInterrupt:
    print("\n⏹ Interrupted by user – proceeding with what we have …")

print(f"\n✓ total matches: {len(hits)}")

# ---------- preview top 5 ---------------------------------------------------
for p in hits[:5]:
    ts = dt_parse.isoparse(p["created_at"]).strftime("%Y-%m-%d %H:%M")
    print(f"[{ts}] @{p['account']} → {p['text'][:120]}…")

# ---------- save CSV --------------------------------------------------------
# Walk upwards from the working directory to the first folder holding .git
# or .env (stops at the filesystem root if neither is found).
ROOT = pathlib.Path.cwd()
while ROOT.parent != ROOT and not (ROOT / ".git").exists() and not (ROOT / ".env").exists():
    ROOT = ROOT.parent

out_path = ROOT / "outputs" / "trump_ru_uk_truths.csv"
out_path.parent.mkdir(parents=True, exist_ok=True)

with out_path.open("w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(f, fieldnames=["created_at", "account", "id", "text"])
    writer.writeheader()
    writer.writerows(hits)

print("📄 CSV written →", out_path.relative_to(ROOT))

In [None]:
# ─────────── merge & dedup Truth-Social match files ────────────
import pandas as pd, hashlib, pathlib, textwrap

# Walk upwards from the working directory to the first folder holding .git
# or .env (stops at the filesystem root if neither is found).
ROOT = pathlib.Path.cwd()
while ROOT.parent != ROOT and not (ROOT/".git").exists() and not (ROOT/".env").exists():
    ROOT = ROOT.parent

CSV_FILES = [
    ROOT / "outputs" / "trump_ru_uk_truths.csv",
    ROOT / "outputs" / "new_truth_scrape_matches.csv",
]

frames = []
for p in CSV_FILES:
    if p.exists():
        df = pd.read_csv(p, dtype=str)       # keep ids as str
        frames.append(df)
        print(f"✓ loaded {p.name:<30} rows={len(df):>5}")
    else:
        print(f"⚠️  {p} not found")

if not frames:
    raise FileNotFoundError("No CSVs found – check paths above.")

df_all = pd.concat(frames, ignore_index=True)

# -------- exact-ID dedup ----------------------------------------
before      = len(df_all)
df_all      = df_all.drop_duplicates(subset="id")
# Capture the count right here so the report below shows the effect of the
# ID pass separately from the content-hash pass.
after_exact = len(df_all)

# -------- fallback fuzzy key  -----------------------------------
# Sometimes Truthbrush returns slightly different IDs for the same content.
# Build a cheap content hash (first 120 chars lower-cased, account-specific).
def row_key(r):
    """Return an MD5 hex digest of the row's account plus a lower-cased 120-char text prefix.

    Missing (non-string) ``account`` or ``text`` values count as empty
    strings, so rows with blank cells do not raise.
    """
    account = r["account"] if isinstance(r["account"], str) else ""
    snippet = (r["text"][:120] if isinstance(r["text"], str) else "").lower()
    return hashlib.md5((account + snippet).encode()).hexdigest()

# A list comprehension always yields one key per row, including for an empty frame.
df_all["dup_key"] = [row_key(r) for _, r in df_all.iterrows()]
df_all = df_all.drop_duplicates(subset="dup_key").drop(columns="dup_key")

after_final  = len(df_all)

print(f"""
rows in concat   : {before}
unique post IDs  : {after_exact}
unique after key : {after_final}
duplicates dropped: {before - after_final}
""".strip())

# save
out_path = ROOT / "outputs" / "truth_matches_merged.csv"
df_all.to_csv(out_path, index=False)
print("💾 merged CSV →", out_path.relative_to(ROOT))

# peek a few rows
print("\n▶ sample rows")
for _, r in df_all.head(5).iterrows():
    preview = r["text"] if isinstance(r["text"], str) else ""   # blank cells load as NaN
    print(textwrap.shorten(preview.replace("\n"," "), width=100, placeholder="…"))


In [None]:
# ───── monthly conflict-keyword Truths (all accounts) ─────
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Read the merged matches and chart how many fall in each calendar month.
csv_path = ROOT / "outputs" / "truth_matches_merged.csv"
df = pd.read_csv(csv_path, parse_dates=["created_at"])

if not df.empty:
    # Month label (YYYY-MM) per post; kept on `df` as a column.
    df["month"] = df["created_at"].dt.to_period("M").astype(str)
    counts = df.groupby("month").size()

    fig, ax = plt.subplots(figsize=(10,4))
    counts.plot(kind="bar", ax=ax)
    ax.set_title("Conflict-related Truths per month")
    ax.set_ylabel("Post count")
    plt.xticks(rotation=70, ha="right")
    plt.tight_layout()
    plt.show()
else:
    print("⚠️  DataFrame is empty – check the CSV path or merge step.")

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# ------------------------------------------------------------------- paths
vol_csv = ROOT / "outputs" / "truth_matches_merged.csv"
esc_csv = ROOT / "outputs" / "headline_scores_partial.csv"


def _utc_day(values):
    """Parse timestamps, convert them to UTC and truncate to a tz-naive midnight.

    Both inputs are put through this so they share one DatetimeIndex type.
    A ``datetime.date`` index does not align with Timestamps in a join.
    """
    return pd.to_datetime(values, utc=True).dt.tz_localize(None).dt.normalize()


# ------------------------------------------------------------------- volume
vol = (
    pd.read_csv(vol_csv, parse_dates=["created_at"])
      .assign(day=lambda d: _utc_day(d["created_at"]))
)

vol_daily = (
    vol.groupby("day")
       .size()
       .rename("post_count")
       .to_frame()
)

# ------------------------------------------------------------------- escalation
# The CSV has columns 'date', 'source', 'title', 'score' – i.e. possibly many
# rows per day.  Collapse to one value per day first so that (a) the rolling
# window spans 7 calendar days rather than 7 headlines and (b) the join below
# cannot duplicate post counts.
esc = pd.read_csv(esc_csv).assign(date=lambda d: _utc_day(d["date"]))

# Prefer a precomputed 'roll7' column when present; otherwise derive the
# 7-day mean from 'score'.
value_col = "roll7" if "roll7" in esc.columns else "score"
esc_daily = esc.groupby("date")[value_col].mean().sort_index()

if value_col == "roll7":
    esc_7d = esc_daily.rename("escalation_7d")
else:
    esc_7d = (
        esc_daily
            .rolling("7D", min_periods=1)   # time-based window on the daily index
            .mean()
            .rename("escalation_7d")
    )

# ------------------------------------------------------------------- merge & plot
df = vol_daily.join(esc_7d, how="outer")
# A day without posts really has zero posts; a day without headlines has no
# escalation reading, so that column is left as NaN instead of being zeroed.
df["post_count"] = df["post_count"].fillna(0)
esc_line = df["escalation_7d"].dropna()

fig, ax1 = plt.subplots(figsize=(11, 4))
ax1.bar(
    df.index,
    df["post_count"],
    width=1,
    color="#4a90e2",
    alpha=0.55,
    label="Trump RU/UA daily count"
)
ax1.set_ylabel("daily post count", color="#4a90e2")
ax1.tick_params(axis="y", labelcolor="#4a90e2")

ax2 = ax1.twinx()
ax2.plot(
    esc_line.index,
    esc_line,
    color="crimson",
    linewidth=2,
    label="7-day mean escalation"
)
ax2.set_ylabel("escalation index (0-10)", color="crimson")
ax2.tick_params(axis="y", labelcolor="crimson")

plt.title("Trump attention to RU/UA vs. conflict escalation, 2022-2025")

# One combined legend for both y-axes.
handles1, labels1 = ax1.get_legend_handles_labels()
handles2, labels2 = ax2.get_legend_handles_labels()
all_handles = handles1 + handles2
all_labels = labels1 + labels2

ax1.legend(all_handles, all_labels, loc="upper left")

plt.tight_layout()
plt.show()


In [None]:
# ╔══════════════════════════════════════════════════════════════════╗
# ║  Build GOLD-SET CANDIDATE sample for Truth-Social posts         ║
# ╚══════════════════════════════════════════════════════════════════╝
import pandas as pd, numpy as np
from pathlib import Path

# Find the repo root the same way as the cell that wrote the merged CSV
# (first ancestor containing .git or .env), not by assuming the notebook
# sits exactly one level below it.
ROOT = Path.cwd().resolve()
while ROOT.parent != ROOT and not (ROOT / ".git").exists() and not (ROOT / ".env").exists():
    ROOT = ROOT.parent
ALL_CSV   = ROOT / "outputs" / "truth_matches_merged.csv"
GOLD_CSV  = ROOT / "outputs" / "truth_gold_candidates.csv"

N_SAMPLES = 500                          # target sample size; per-bin ceil() can slightly exceed it
TIME_BINS = 8                            # split full range into equal bins

# ── load & sanity check ─────────────────────────────────────────────
df = pd.read_csv(ALL_CSV, parse_dates=["created_at"])
if df.empty:
    raise RuntimeError("💥 merged Truth CSV seems empty – aborting")

# Rows without a timestamp cannot be placed in a time bin.
df = df.dropna(subset=["created_at"]).sort_values("created_at")
if df.empty:
    raise RuntimeError("💥 no rows with a usable created_at – aborting")

# ── create time-bins (equal-length) ─────────────────────────────────
# Bin on seconds elapsed since the first post.  This avoids the deprecated
# Series.view("int64") and does not depend on the datetime storage unit.
elapsed_s = (df["created_at"] - df["created_at"].min()).dt.total_seconds()
span_s = elapsed_s.max()
if span_s > 0:
    cut_edges = np.linspace(0.0, span_s, TIME_BINS + 1)
    # assign a bin label 0 … TIME_BINS-1
    df["time_bin"] = pd.cut(elapsed_s, bins=cut_edges, labels=False, include_lowest=True)
else:
    # All posts share one timestamp: pd.cut would reject the degenerate edges.
    df["time_bin"] = 0

# ── stratified sample: time_bin × account ──────────────────────────
samples = []
per_bin = int(np.ceil(N_SAMPLES / TIME_BINS))

for bin_id, grp in df.groupby("time_bin"):
    # target size in this bin
    n = min(per_bin, len(grp))
    # proportional by account count
    acc_counts = grp["account"].value_counts(normalize=True)
    wants = (acc_counts * n).round().astype(int)

    # adjust rounding drift by nudging the largest allocation
    while wants.sum() < n:
        wants.loc[wants.idxmax()] += 1
    while wants.sum() > n:
        wants.loc[wants.idxmax()] -= 1

    # sample inside each account slice
    for acc, k in wants.items():
        slice_ = grp[grp["account"] == acc]
        k = min(k, len(slice_))
        samples.append(slice_.sample(k, random_state=42))

gold_df = (pd.concat(samples)
             .sort_values("created_at")
             .reset_index(drop=True))

gold_df.to_csv(GOLD_CSV, index=False)
print(f"🎯 gold-set candidates written → {GOLD_CSV.relative_to(ROOT)}   "
      f"(rows={len(gold_df)})")

# quick peek (never ask for more rows than exist)
print("\n▶ random preview")
for _, r in gold_df.sample(min(5, len(gold_df)), random_state=1).iterrows():
    preview = r["text"] if isinstance(r["text"], str) else ""   # blank cells load as NaN
    print(f"[{r.created_at:%Y-%m-%d}] @{r.account}: {preview[:110]}…")


In [None]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  FILTER Truth Social posts for Ukraine-Russia war relevance           ║
# ╚══════════════════════════════════════════════════════════════════════╝
from pathlib import Path
import json, time, pandas as pd, tqdm, re
import anthropic
from dotenv import load_dotenv
import os
import requests
import logging

load_dotenv()

# Configuration
# Find the repo root the same way as the cell that wrote the merged CSV
# (first ancestor containing .git or .env), not by assuming the notebook
# sits exactly one level below it.
ROOT = Path.cwd().resolve()
while ROOT.parent != ROOT and not (ROOT / ".git").exists() and not (ROOT / ".env").exists():
    ROOT = ROOT.parent
INPUT_CSV = ROOT / "outputs" / "truth_matches_merged.csv"
OUTPUT_CSV = ROOT / "outputs" / "truth_ukraine_filtered.csv"
MODEL = "claude-3-5-haiku-20241022"  # Cheap and fast for filtering

# Filtering prompt: sent as the `system` message of every batch request.
# It instructs the model to answer with a single digit, and the result
# parser in this cell accepts only the exact strings "0" and "1" — keep the
# two in sync when editing the wording.
FILTER_PROMPT = """You are filtering social media posts to identify which ones are about the Russia-Ukraine war.

A post is RELEVANT (score 1) if it:
- Directly mentions the Russia-Ukraine war, conflict, or invasion
- Discusses military actions between Russia and Ukraine
- Mentions Ukrainian or Russian leaders in context of the war
- Discusses weapons, aid, or sanctions related to the conflict
- References peace talks, negotiations, or ceasefire between Russia and Ukraine

A post is NOT RELEVANT (score 0) if it:
- Only mentions Russia or Ukraine in passing without war context
- Discusses other conflicts (Israel, Syria, etc.) without Ukraine connection
- Is about domestic US politics without Ukraine war connection
- Mentions "invasion" referring to immigration/borders, not Ukraine
- Uses war metaphors for non-Ukraine topics

Respond with ONLY a single digit: 1 for relevant, 0 for not relevant.

Examples:
"Putin is bombing Ukrainian cities again" → 1
"Biden's border invasion must stop" → 0
"We need to send more weapons to Ukraine" → 1
"Trump will end all wars including Ukraine" → 1
"China tariffs are destroying our economy" → 0"""

# Read the merged posts that are about to be classified.
df = pd.read_csv(INPUT_CSV, parse_dates=["created_at"])
print(f"📊 Processing {len(df)} Truth Social posts for Ukraine relevance")

# API client; the key comes from the environment.
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Silence per-request chatter from the HTTP stack.
logging.getLogger("httpx").setLevel(logging.WARNING)
logging.getLogger("anthropic").setLevel(logging.WARNING)

# Positional id per row; doubles as the batch `custom_id`.
df["batch_idx"] = range(len(df))

# One request per post with non-blank text; the text is cut to 1000 chars.
post_texts = df["text"] if "text" in df.columns else [None] * len(df)
requests_list = [
    {
        "custom_id": str(batch_idx),
        "params": {
            "model": MODEL,
            "max_tokens": 5,
            "temperature": 0,
            "system": FILTER_PROMPT,
            "messages": [
                {"role": "user", "content": str(post_text)[:1000]}
            ]
        }
    }
    for batch_idx, post_text in zip(df["batch_idx"], post_texts)
    if not pd.isna(post_text) and str(post_text).strip() != ""
]

print(f"📝 Prepared {len(requests_list)} requests for filtering")

# Submit every request as one message batch.
batch = client.messages.batches.create(requests=requests_list)
print(f"🚀 Launched batch {batch.id}")

# Poll every 5 s, mirroring the number of finished requests on a progress bar.
bar = tqdm.tqdm(total=len(requests_list), desc="Filtering", unit="post")
start_time = time.time()
batch_ended = False
while not batch_ended:
    batch_status = client.messages.batches.retrieve(batch.id)
    counts = batch_status.request_counts
    # A request is finished once it is in any terminal state.
    completed = counts.succeeded + counts.errored + counts.canceled + counts.expired
    bar.n = completed
    bar.refresh()

    batch_ended = batch_status.processing_status == "ended"
    if not batch_ended:
        time.sleep(5)
bar.close()

elapsed_time = time.time() - start_time
print(f"✅ Filtering complete in {elapsed_time/60:.1f} minutes")

# Parse results
relevance_scores = {}   # batch_idx -> 0/1 verdict
errors = []             # human-readable notes about unusable results

# Retrieve results
batch_final = client.messages.batches.retrieve(batch.id)

# Fail loudly when results cannot be obtained.  Continuing with an empty
# score table would let the save step write an empty filtered CSV.
if not batch_final.results_url:
    raise RuntimeError(f"Batch {batch.id} ended without a results_url")

print(f"📥 Fetching results from batch {batch.id}")

headers = {
    "x-api-key": os.getenv("ANTHROPIC_API_KEY"),
    "anthropic-version": "2023-06-01"
}

# `with` releases the streamed connection; the timeout keeps a stalled
# download from hanging the notebook indefinitely.
with requests.get(batch_final.results_url, headers=headers, stream=True, timeout=60) as response:
    if response.status_code != 200:
        raise RuntimeError(
            f"Could not download results for batch {batch.id}: HTTP {response.status_code}"
        )

    # The body is consumed line by line, one JSON document per line.
    for line in response.iter_lines():
        if not line:
            continue

        try:
            result = json.loads(line)
            custom_id = result.get("custom_id")

            if custom_id is None:
                continue

            idx = int(custom_id)

            if result.get("result", {}).get("type") != "succeeded":
                errors.append(f"Request {custom_id} failed")
                continue

            # Extract the response
            message_content = result["result"]["message"]["content"][0]["text"].strip()

            # Parse the score (should be just "0" or "1")
            if message_content in ["0", "1"]:
                relevance_scores[idx] = int(message_content)
            else:
                errors.append(f"Invalid response for {custom_id}: {message_content}")

        except Exception as e:
            # A single malformed line should not abort the whole import.
            errors.append(f"Error parsing result: {e}")
            continue

# Attach each verdict to its row; rows without a verdict stay NaN.
df["is_ukraine_relevant"] = df["batch_idx"].map(relevance_scores)

# Split by verdict.
relevant_mask = df["is_ukraine_relevant"] == 1
irrelevant_mask = df["is_ukraine_relevant"] == 0
df_relevant = df.loc[relevant_mask].copy()
df_irrelevant = df.loc[irrelevant_mask].copy()

# Persist only the relevant posts, without the bookkeeping columns.
df_relevant.drop(columns=["batch_idx", "is_ukraine_relevant"]).to_csv(OUTPUT_CSV, index=False)

# Summary statistics
n_total = len(df)
n_relevant = len(df_relevant)
n_irrelevant = len(df_irrelevant)
print(f"\n📊 Filtering Results:")
print(f"   Total posts: {n_total}")
print(f"   Ukraine-relevant: {n_relevant} ({n_relevant/n_total*100:.1f}%)")
print(f"   Not relevant: {n_irrelevant} ({n_irrelevant/n_total*100:.1f}%)")
print(f"   Failed to classify: {len(errors)}")

if errors:
    print(f"\n⚠️  Errors encountered: {len(errors)}")
    for error in errors[:5]:
        print(f"   - {error}")

# Spot-check what was rejected.
print("\n🔍 Examples of posts filtered OUT as not Ukraine-related:")
for _, row in df_irrelevant.head(5).iterrows():
    print(f"   - {row['text'][:100]}...")

print(f"\n💾 Filtered data saved to: {OUTPUT_CSV}")

## First Pass New Scraper

In [None]:
# ───────────────── Comprehensive Truth Social Ukraine/Russia Scraper ────────
import os, re, csv, pathlib, random, time, json, sys
from datetime import datetime, timezone, timedelta
from dateutil import parser as dt_parse
from tqdm.auto import tqdm
import sqlite3
from collections import deque

# Use the already instantiated api object from your previous cell

# ---------- Date Range Configuration ---------------------------------------
# Inclusive UTC collection window.  It opens a few days before the invasion
# of 24 Feb 2022.
START_DATE = datetime(2022, 2, 20, tzinfo=timezone.utc)
END_DATE = datetime(2025, 6, 5, 23, 59, 59, tzinfo=timezone.utc)

collection_span = END_DATE - START_DATE
print(f"Collection period: {START_DATE.date()} to {END_DATE.date()}")
print(f"Total days: {collection_span.days}")

# ---------- Enhanced Keywords Configuration --------------------------------
# Terms used by keyword_hit() in this cell, which lower-cases the post text
# and tests each entry as a plain substring.  Consequently the short entries
# ("war", "gas", "oil", "smo", …) also match inside longer, unrelated words.
# NOTE: this rebinds the KEYWORDS name defined by the earlier scrape cell.
KEYWORDS = [
    # Core terms (English plus Cyrillic spellings)
    "russia", "russian", "russians", "россия", "русский",
    "ukraine", "ukrainian", "ukrainians", "україна", "український", 
    "putin", "путин", "zelensky", "zelenskyy", "зеленський",
    
    # Conflict terms
    "invasion", "war", "conflict", "attack", "offensive", "defensive",
    "sanctions", "embargo", "military", "troops", "soldiers", "army",
    
    # Geographic locations
    "kremlin", "kyiv", "kiev", "moscow", "crimea", "донбас", "donbas", 
    "donetsk", "luhansk", "lugansk", "mariupol", "kherson", "херсон",
    "zaporizhzhia", "odessa", "одесса", "dnipro", "kharkiv", "харьков",
    "bakhmut", "severodonetsk", "melitopol", "berdyansk",
    
    # Military/political terms
    "nato", "нато", "wagner", "вагнер", "azov", "азов", "referendum",
    "annexation", "liberation", "occupation", "peacekeeping", "ceasefire",
    "himars", "javelin", "patriot", "abrams", "leopard", "f16",
    
    # Narrative indicators
    "denazification", "demilitarization", "special operation", 
    "специальная операция", "smo", "proxy war", "biolabs", "bioweapons",
    
    # Support indicators (hashtag-style, written without the '#')
    "standwithukraine", "standwithrussia", "slavaukraini", "слава",
    "istandwithputin", "istandwithzelensky", "stopthewar", "nowar",
    
    # Economic terms
    "gas", "oil", "grain", "ruble", "hryvnia", "swift", "nordstream",
    "gazprom", "rosneft", "grain deal", "black sea"
]

# ---------- VERIFIED Truth Social Accounts ---------------------------------
# Candidate handles to collect from.  Despite the name, several entries are
# flagged in their own comments as unconfirmed ("check exact handle",
# "verify handle", "if exists").
# NOTE(review): presumably these lists are passed to
# collect_from_accounts_comprehensive(), which resolves every handle through
# verify_account() before pulling posts – confirm at the call site.
VERIFIED_ACCOUNTS = [
    # Core political figures (confirmed active)
    "realDonaldTrump",      # 3.8M followers
    "DevinNunes",           # CEO of Truth Social
    
    # Politicians known to be on Truth Social
    "DanScavino",           # Trump's deputy chief of staff
    "KariLake",             # AZ gubernatorial candidate
    "JDVance1",             # Ohio Senator (check exact handle)
    "MarcoPolo",            # Marco Rubio (verify handle)
    "LeadRight",            # Confirmed Truth Social account
    
    # Media personalities (research shows these exist)
    "DonaldJTrumpJr",       # Confirmed account
    "EricTrump",            # Confirmed account
    "TuckerCarlson",        # If he has account
    "DineshDSouza",         # Conservative commentator
    
    # News organizations
    "BreitbartNews",        # Confirmed to have bot account
    "RSBNetwork",           # Right Side Broadcasting
    "OANN",                 # One America News (if exists)
    
    # Other verified accounts that discuss geopolitics
    "truthsupport",         # Official Truth Social account
    "BidenHQ",              # Biden campaign trolling account (confirmed)
]

# Additional accounts to try (may or may not exist)
POSSIBLE_ACCOUNTS = [
    "Warroom", "WarRoomPandemic", "RealGenFlynn", "SebGorka",
    "CharlieKirk", "JackPosobiec", "CandaceOwens", "LauraLoomer",
    "MattGaetz", "MTG", "MarjorieTaylorGreene", "RandPaul",
    "TedCruz", "JoshHawley", "RonDeSantis", "TulsiGabbard"
]

# ---------- Database Setup with Date Indexing ------------------------------
def setup_database(db_path="truth_social_ukraine_data.db"):
    """Open the SQLite store, creating it if needed, and ensure the schema exists.

    Args:
        db_path: Location of the database file. The default is the original
            file name, resolved against the current working directory.

    Returns:
        An open ``sqlite3.Connection`` with the ``posts`` table and its two
        indexes (``created_date``, ``account``) in place and committed.
    """
    conn = sqlite3.connect(pathlib.Path(db_path))
    cursor = conn.cursor()

    # One row per post; `id` is the primary key.
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS posts (
            id TEXT PRIMARY KEY,
            created_at TEXT,
            created_date TEXT,  -- For easier date queries
            account TEXT,
            account_id TEXT,
            text TEXT,
            url TEXT,
            in_reply_to_id TEXT,
            reblog_of_id TEXT,
            favourites_count INTEGER,
            reblogs_count INTEGER,
            replies_count INTEGER,
            language TEXT,
            visibility TEXT,
            collection_method TEXT,
            collected_at TEXT,
            keywords_matched TEXT
        )
    ''')

    # Create index for date-based queries
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_created_date ON posts(created_date)
    ''')

    # Create index for per-account queries
    cursor.execute('''
        CREATE INDEX IF NOT EXISTS idx_account ON posts(account)
    ''')

    conn.commit()
    return conn

# ---------- Helper Functions -----------------------------------------------
def is_within_date_range(post_date_str):
    """Tell whether an ISO-8601 timestamp lies inside [START_DATE, END_DATE].

    Timestamps without a UTC offset are interpreted as UTC.

    Args:
        post_date_str: Timestamp string of a post.

    Returns:
        True when the timestamp parses and falls inside the inclusive
        window; False otherwise, including for unparseable input.
    """
    try:
        post_date = dt_parse.isoparse(post_date_str)
        if post_date.tzinfo is None:
            post_date = post_date.replace(tzinfo=timezone.utc)
        return START_DATE <= post_date <= END_DATE
    except Exception:
        # `except Exception` instead of a bare except: bad input still maps
        # to False, but KeyboardInterrupt/SystemExit are no longer swallowed.
        return False

def keyword_hit(text: str) -> list:
    """Collect every ``KEYWORDS`` entry that occurs in ``text``, ignoring case.

    Matching is a plain substring test. The result keeps the order of
    ``KEYWORDS`` and is empty when ``text`` is falsy or nothing matches.
    """
    if not text:
        return []
    haystack = text.lower()
    found = []
    for keyword in KEYWORDS:
        if keyword.lower() in haystack:
            found.append(keyword)
    return found

def clean_html(html: str) -> str:
    """Convert an HTML fragment to plain text.

    Tags are replaced by spaces, character references are decoded exactly
    once, and runs of whitespace are collapsed to single spaces.

    Args:
        html: Markup to clean; ``None`` or an empty string is allowed.

    Returns:
        The cleaned text, or ``""`` for falsy input.
    """
    # Imported by name because the parameter `html` shadows the module.
    from html import unescape

    if not html:
        return ""
    text = re.sub(r'<[^>]+>', ' ', html)
    # A single unescape pass decodes every named and numeric reference, and
    # cannot double-decode: "&amp;lt;" stays the literal text "&lt;".
    text = unescape(text)
    return ' '.join(text.split())

def process_post(post, collection_method, conn):
    """Store one post if it is in the date window and matches a keyword.

    Args:
        post: Status dict as returned by the API; anything else is ignored.
        collection_method: Free-text label saved with the row.
        conn: Open SQLite connection whose ``posts`` table already exists.

    Returns:
        True when a new row was inserted. False when the post was skipped
        (not a dict, no id, out of range, no keyword) or already stored.
    """
    if not post or not isinstance(post, dict):
        return False

    # Without an id the row cannot be deduplicated: SQLite accepts NULL in a
    # TEXT PRIMARY KEY, and each NULL counts as distinct.
    post_id = post.get('id')
    if post_id is None:
        return False

    # Check date range first
    created_at = post.get('created_at', '')
    if not is_within_date_range(created_at):
        return False

    text = clean_html(post.get('content', ''))
    matched_keywords = keyword_hit(text)

    if not matched_keywords:
        return False

    cursor = conn.cursor()
    # `or {}` also covers an explicit None value, not just a missing key.
    account = post.get('account') or {}
    reblog = post.get('reblog') or {}

    # Extract date for easier querying (first 10 chars of the raw timestamp)
    created_date = created_at[:10] if created_at else None

    data = {
        'id': post_id,
        'created_at': created_at,
        'created_date': created_date,
        'account': account.get('acct', 'unknown'),
        'account_id': account.get('id', 'unknown'),
        'text': text,
        'url': post.get('url', ''),
        'in_reply_to_id': post.get('in_reply_to_id'),
        'reblog_of_id': reblog.get('id'),
        'favourites_count': post.get('favourites_count', 0),
        'reblogs_count': post.get('reblogs_count', 0),
        'replies_count': post.get('replies_count', 0),
        'language': post.get('language'),
        'visibility': post.get('visibility'),
        'collection_method': collection_method,
        'collected_at': datetime.now(timezone.utc).isoformat(),
        'keywords_matched': ','.join(matched_keywords[:10])  # Limit to first 10
    }

    # Columns are named explicitly so the statement does not rely on the
    # table's column order.  OR IGNORE skips ids that are already stored.
    cursor.execute('''
        INSERT OR IGNORE INTO posts (
            id, created_at, created_date, account, account_id, text, url,
            in_reply_to_id, reblog_of_id, favourites_count,
            reblogs_count, replies_count, language, visibility,
            collection_method, collected_at, keywords_matched
        ) VALUES (
            :id, :created_at, :created_date, :account, :account_id, :text, :url,
            :in_reply_to_id, :reblog_of_id, :favourites_count,
            :reblogs_count, :replies_count, :language, :visibility,
            :collection_method, :collected_at, :keywords_matched
        )
    ''', data)

    conn.commit()
    return cursor.rowcount > 0

# ---------- Verify Account Exists ------------------------------------------
def verify_account(handle):
    """Resolve ``handle`` to the ``acct`` value reported by the API.

    A direct lookup is tried first; if it raises or returns nothing usable,
    the first page of an account search is consulted instead.

    Args:
        handle: Account name, with or without a leading "@".

    Returns:
        The resolved ``acct`` string, or None when neither route finds one.
    """
    try:
        user_info = api.lookup(user_handle=handle.lstrip("@"))
        if user_info and user_info.get('acct'):
            return user_info.get('acct')
    except Exception:
        # Deliberate best effort: any lookup failure falls through to search.
        # (Not a bare except, so Ctrl-C still interrupts verification.)
        pass

    # Try search as fallback – only the first page is needed, so take one
    # item from the generator instead of draining it.
    try:
        first_page = next(iter(api.search("accounts", handle, limit=1)), None)
        if first_page and first_page.get('accounts'):
            return first_page['accounts'][0].get('acct')
    except Exception:
        pass

    return None

# ---------- Collection Functions -------------------------------------------
def collect_via_search_deep(query, conn, max_pages=50, max_old_posts=100):
    """Page through status search results for ``query`` and store matching posts.

    Each post is handed to ``process_post``. Paging stops when the search
    yields no more statuses, when more than ``max_old_posts`` posts dated
    before START_DATE have been seen in a row, after ``max_pages`` pages, or
    on the first exception (which is printed rather than raised).

    Args:
        query: Search string.
        conn: Open SQLite connection passed through to ``process_post``.
        max_pages: Maximum number of result pages to read.
        max_old_posts: Length of the run of too-old posts that ends the search.

    Returns:
        Number of posts newly stored by this call.
    """
    collected = 0
    # None until a post with a parseable date is seen, so that the summary
    # never reports a date span that was not actually observed.
    oldest_date_seen = None
    newest_date_seen = None

    def _span_text():
        """Format the observed date span, or return None if nothing was dated."""
        if oldest_date_seen is None:
            return None
        return (f"{oldest_date_seen.strftime('%Y-%m-%d')} "
                f"to {newest_date_seen.strftime('%Y-%m-%d')}")

    try:
        print(f"\n🔍 Deep searching: '{query}'")

        # Use Truthbrush search with high limit
        search_gen = api.search("statuses", query, limit=40)

        with tqdm(desc=f"Search: {query}", unit="posts") as pbar:
            page_count = 0
            consecutive_old_posts = 0

            for page in search_gen:
                if not page or 'statuses' not in page:
                    break

                statuses = page.get('statuses', [])
                if not statuses:
                    break

                page_count += 1

                for post in statuses:
                    post_date_str = post.get('created_at', '')

                    # Track date range
                    try:
                        post_date = dt_parse.isoparse(post_date_str)
                        if post_date.tzinfo is None:
                            post_date = post_date.replace(tzinfo=timezone.utc)

                        if oldest_date_seen is None or post_date < oldest_date_seen:
                            oldest_date_seen = post_date
                        if newest_date_seen is None or post_date > newest_date_seen:
                            newest_date_seen = post_date

                        # If post is before our start date, increment counter
                        if post_date < START_DATE:
                            consecutive_old_posts += 1
                        else:
                            consecutive_old_posts = 0

                    except Exception:
                        # Undated or malformed timestamp: leave the trackers
                        # alone; process_post rejects such posts itself.
                        pass

                    # Process post
                    if process_post(post, f"search:{query}", conn):
                        collected += 1
                        pbar.update(1)

                # Update progress with date info
                if page_count % 5 == 0 and _span_text() is not None:
                    pbar.set_description(f"Search: {query} [{_span_text()}]")

                # Stop if we've gone too far back in time
                if consecutive_old_posts > max_old_posts:
                    print(f"    Reached posts before {START_DATE.date()}, stopping")
                    break

                # Rate limiting
                time.sleep(random.uniform(1, 3))

                # Stop after many pages to avoid getting blocked
                if page_count >= max_pages:
                    print(f"    Reached page limit ({max_pages})")
                    break

    except Exception as e:
        print(f"  Search error: {str(e)[:100]}...")

    span = _span_text()
    if span is not None:
        print(f"  ✓ Collected {collected} posts spanning {span}")
    else:
        print(f"  ✓ Collected {collected} posts (no dated posts seen)")

    return collected

def collect_from_accounts_comprehensive(account_list, conn):
    """Collect posts from every verifiable account in ``account_list``.

    Each handle is first resolved through ``verify_account``; handles that do
    not resolve are reported and skipped. For every resolved account the
    stream returned by ``api.pull_statuses`` is walked and each post is
    offered to ``process_post``. The walk for one account ends when more than
    50 posts older than ``START_DATE`` have been seen, when 2000 posts have
    been checked, or when the stream is exhausted.

    Args:
        account_list: Iterable of account handles to verify and scrape.
        conn: Database connection passed through to ``process_post``.

    Returns:
        int: Number of posts for which ``process_post`` returned a truthy
        value, summed over all accounts.
    """
    total_collected = 0

    # First, verify which accounts actually exist
    print("\n🔍 Verifying accounts...")
    valid_accounts = []

    for handle in account_list:
        verified = verify_account(handle)
        if verified:
            valid_accounts.append(verified)
            print(f"  ✓ {handle} → @{verified}")
        else:
            print(f"  ✗ {handle} not found")
        time.sleep(random.uniform(0.5, 1))

    print(f"\nFound {len(valid_accounts)} valid accounts")

    # Collect from each valid account
    for idx, handle in enumerate(valid_accounts, 1):
        try:
            print(f"\n[{idx}/{len(valid_accounts)}] Collecting from @{handle}")

            # Pull ALL statuses from account
            gen = api.pull_statuses(username=handle, replies=False, verbose=False)

            collected = 0
            posts_checked = 0
            # None until a post with a parseable timestamp is seen, so the
            # summary never reports a made-up (inverted) range.
            oldest_date = None
            newest_date = None
            posts_before_range = 0

            with tqdm(desc=f"@{handle}", unit="posts") as pbar:
                for post in gen:
                    posts_checked += 1

                    # Check if we've gone before our date range
                    post_date_str = post.get('created_at', '')
                    try:
                        post_date = dt_parse.isoparse(post_date_str)
                        if post_date.tzinfo is None:
                            post_date = post_date.replace(tzinfo=timezone.utc)

                        if oldest_date is None or post_date < oldest_date:
                            oldest_date = post_date
                        if newest_date is None or post_date > newest_date:
                            newest_date = post_date

                        if post_date < START_DATE:
                            posts_before_range += 1
                            # If we've seen 50 posts before our range, stop
                            if posts_before_range > 50:
                                print(f"    Reached posts before {START_DATE.date()}")
                                break
                    except Exception:
                        # Best effort: an unparseable timestamp must not stop the
                        # scrape. `Exception` (not a bare except) lets Ctrl-C and
                        # SystemExit propagate to the caller.
                        pass

                    # Process post
                    if process_post(post, f"account:{handle}", conn):
                        collected += 1
                        total_collected += 1
                        pbar.update(1)

                    # Every 100 posts: refresh the progress label and pause briefly
                    if posts_checked % 100 == 0:
                        pbar.set_description(
                            f"@{handle} [checked:{posts_checked} collected:{collected}]"
                        )
                        time.sleep(random.uniform(1, 2))

                    # Stop if we've checked too many (to avoid blocks)
                    if posts_checked >= 2000:
                        print(f"    Reached post limit (2000)")
                        break

            print(f"  ✓ Collected {collected} matching posts from {posts_checked} checked")
            if oldest_date is None:
                print("    Date range: no dated posts seen")
            else:
                print(f"    Date range: {oldest_date.strftime('%Y-%m-%d')} to {newest_date.strftime('%Y-%m-%d')}")

            # Longer pause between accounts
            time.sleep(random.uniform(5, 10))

        except Exception as e:
            print(f"  ⚠️ Error with @{handle}: {str(e)[:100]}...")
            continue

    return total_collected

# ---------- Main Collection Function ---------------------------------------
def _print_collection_summary(conn, total_collected, elapsed_minutes):
    """Print run totals plus date and per-month coverage of the ``posts`` table."""
    print(f"\n{'='*60}")
    print("COLLECTION COMPLETE")
    print(f"{'='*60}")
    print(f"Total posts collected: {total_collected:,}")
    print(f"Time taken: {elapsed_minutes:.1f} minutes ({elapsed_minutes/60:.1f} hours)")
    # Same zero-elapsed guard as the per-query running rate
    average_rate = total_collected / elapsed_minutes if elapsed_minutes > 0 else 0
    print(f"Average rate: {average_rate:.1f} posts/minute")

    # Analyze date coverage
    cursor = conn.cursor()
    cursor.execute('''
        SELECT
            MIN(created_at) as earliest,
            MAX(created_at) as latest,
            COUNT(DISTINCT created_date) as unique_days,
            COUNT(DISTINCT account) as unique_accounts,
            COUNT(*) as total_posts
        FROM posts
    ''')

    result = cursor.fetchone()
    if result and result[0]:
        print("\nDate coverage:")
        print(f"  Earliest post: {result[0][:10]}")
        print(f"  Latest post: {result[1][:10]}")
        print(f"  Unique days: {result[2]}")
        print(f"  Unique accounts: {result[3]}")

        # Posts per month
        cursor.execute('''
            SELECT
                SUBSTR(created_date, 1, 7) as month,
                COUNT(*) as post_count
            FROM posts
            GROUP BY SUBSTR(created_date, 1, 7)
            ORDER BY month
        ''')

        print("\nPosts by month:")
        for month, count in cursor.fetchall():
            print(f"  {month}: {count:,} posts")


def run_comprehensive_collection():
    """Run the three-phase collection, then report and export the results.

    Phase 1 runs ``collect_via_search_deep`` for a fixed list of queries,
    phase 2 runs ``collect_from_accounts_comprehensive`` over the union of
    ``VERIFIED_ACCOUNTS`` and ``POSSIBLE_ACCOUNTS``, and phase 3 takes up to
    100 posts from ``api.trending()``. A KeyboardInterrupt during any phase
    is caught so the summary and CSV export still happen. The database
    connection is closed even if reporting or export fails.

    Returns:
        None
    """
    conn = setup_database()
    total_collected = 0
    start_time = time.time()

    print(f"\n{'='*60}")
    print("Truth Social Ukraine/Russia Comprehensive Collection")
    print(f"{'='*60}")
    print(f"Date range: {START_DATE.date()} to {END_DATE.date()}")
    print(f"Keywords: {len(KEYWORDS)}")
    print(f"Target accounts: {len(VERIFIED_ACCOUNTS) + len(POSSIBLE_ACCOUNTS)}")
    print(f"{'='*60}\n")

    try:
        # Phase 1: Deep search across all keywords
        print("📡 PHASE 1: Deep historical search")
        print("Searching through entire war timeline...\n")

        search_queries = [
            # Primary searches
            "ukraine", "russia", "putin", "zelensky", "war ukraine",
            "ukraine invasion", "special operation", "ukraine russia",

            # Geopolitical searches
            "nato ukraine", "ukraine aid", "sanctions russia", "grain deal",
            "nordstream", "crimea", "donbas", "mariupol",

            # Narrative searches
            "denazification", "biolabs ukraine", "proxy war ukraine",
            "ukraine corruption", "zelensky dictator", "putin war"
        ]

        search_collected = 0
        for idx, query in enumerate(search_queries, 1):
            print(f"[{idx}/{len(search_queries)}] Query: '{query}'")
            collected = collect_via_search_deep(query, conn)
            search_collected += collected
            total_collected += collected

            # Show running stats
            elapsed = (time.time() - start_time) / 60
            rate = total_collected / elapsed if elapsed > 0 else 0
            print(f"  📊 Running total: {total_collected} posts "
                  f"({elapsed:.1f} min, {rate:.1f} posts/min)\n")

            # Pause between searches
            if idx < len(search_queries):
                sleep_time = random.uniform(10, 20)
                print(f"  💤 Pausing {sleep_time:.0f}s...\n")
                time.sleep(sleep_time)

        print(f"\n✓ Phase 1 complete: {search_collected} posts from searches")

        # Phase 2: Comprehensive account collection
        print("\n📡 PHASE 2: Account-based collection")
        print("Collecting from verified accounts...\n")

        # Combine and deduplicate accounts
        all_accounts = list(set(VERIFIED_ACCOUNTS + POSSIBLE_ACCOUNTS))

        account_collected = collect_from_accounts_comprehensive(all_accounts, conn)
        total_collected += account_collected

        print(f"\n✓ Phase 2 complete: {account_collected} posts from accounts")

        # Phase 3: Timeline scanning (brief)
        print("\n📡 PHASE 3: Recent timeline scan")

        try:
            print("Scanning recent public posts...")
            timeline_gen = api.trending()  # Get trending posts
            timeline_collected = 0

            for post in timeline_gen:
                if isinstance(post, dict) and 'content' in post:
                    if process_post(post, "trending", conn):
                        timeline_collected += 1
                        total_collected += 1

                if timeline_collected >= 100:  # Just get recent ones
                    break

            print(f"✓ Phase 3 complete: {timeline_collected} trending posts")

        except Exception as e:
            print(f"Timeline error: {str(e)[:100]}...")

    except KeyboardInterrupt:
        print("\n\n⏹ Collection interrupted - saving data...")

    finally:
        # Report and export whatever was gathered. The inner finally ensures
        # the connection is released even when the summary or export raises.
        try:
            elapsed_total = (time.time() - start_time) / 60
            _print_collection_summary(conn, total_collected, elapsed_total)

            # Export to CSV
            export_to_csv(conn)
        finally:
            conn.close()

def export_to_csv(conn):
    """Dump the whole ``posts`` table, newest first, to a timestamped CSV.

    The file is written under ``<current working directory>/outputs/``
    (created if missing). The header row is taken from the query's column
    names, so it always matches the columns that are written.

    Args:
        conn: Open sqlite3 connection whose database has a ``posts`` table.

    Returns:
        pathlib.Path: Location of the CSV that was written.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM posts ORDER BY created_at DESC')

    # Named base_dir so it is not mistaken for a notebook-level ROOT
    base_dir = pathlib.Path.cwd()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = base_dir / "outputs" / f"truth_social_ukraine_comprehensive_{timestamp}.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow([desc[0] for desc in cursor.description])
        # Iterate the cursor so rows stream to disk instead of being
        # collected into one in-memory list first.
        writer.writerows(cursor)

    print(f"\n📄 CSV exported to: {out_path}")
    return out_path

# Run the collection: starts run_comprehensive_collection() when this code is executed
# as the main module; the guard keeps an import of this code from launching a scrape.
if __name__ == "__main__":
    run_comprehensive_collection()

## Second Pass with Account Discovery

In [None]:
# ─────────── Complete Ukraine/Russia Account Discovery & Historical Scraper ───────────
import os, re, csv, pathlib, random, time, json, sys
from datetime import datetime, timezone
from dateutil import parser as dt_parse
from tqdm.auto import tqdm
import sqlite3

# ========== TUNABLE PARAMETERS FOR SPEED/DEPTH ==========
# Adjust these to trade speed against thoroughness. Larger counts and smaller
# delays mean more requests in less time, which raises the risk of being blocked.
# NOTE(review): START_DATE / END_DATE are bound here as plain strings. Functions
# earlier in this notebook call .date() / .strftime() on names with the same
# spelling, so they presumably expect datetime objects — confirm before mixing
# cells in one kernel.

# Search parameters
SEARCH_PAGES_PER_QUERY = 30    # Upper bound on search passes per query in discover_all_accounts
SEARCH_LIMIT_PER_PAGE = 70      # Passed as limit= to api.search (original note: "max 100" — TODO confirm against the API)
SEARCH_DELAY = 0.5              # Base seconds between passes; actual sleep is 0.5x-1.5x of this. Lower is faster but riskier

# Reply collection
COLLECT_REPLIES = True          # Set False to skip reply collection (faster)
MAX_REPLIES_PER_POST = 500      # Only this many comments per post are examined
REPLY_DELAY = 0.3               # Base seconds between reply fetches; actual sleep is 0.5x-1.5x of this

# Historical scraping
MAX_POSTS_PER_ACCOUNT = 5000    # Cap on saved (keyword-matching) posts per account, not on posts inspected
ACCOUNT_SCRAPE_DELAY = 1        # Base seconds between accounts; actual sleep is 0.5x-1.5x of this
BATCH_PROCESS_SIZE = 500        # Running count is printed when the saved-post count is a multiple of this

# Date range (ISO "YYYY-MM-DD" strings, compared as text with each post's created_at)
START_DATE = "2022-02-20"       # Beginning of data collection
END_DATE = "2025-06-05"         # End of data collection

# ========== KEYWORDS ==========
# Matched by matches_keywords as case-insensitive SUBSTRINGS, so short entries
# such as "war" or "smo" also match inside longer words.
KEYWORDS = [
    "ukraine", "ukrainian", "україна", "український", "kyiv", "kiev",
    "russia", "russian", "россия", "русский", "moscow", "kremlin",
    "putin", "путин", "zelensky", "zelenskyy", "зеленський",
    "war", "invasion", "conflict", "special operation", "smo",
    "nato", "sanctions", "donbas", "crimea", "mariupol", "bakhmut"
]

# Query strings sent one at a time to api.search during account discovery
SEARCH_QUERIES = [
    "ukraine", "russia", "putin", "zelensky", "ukraine war",
    "special operation", "ukraine invasion", "nato ukraine",
    "sanctions russia", "ukraine aid", "crimea", "donbas",
    "ukraine conflict", "russia invasion", "ukrainian"
]

# ========== DATABASE SETUP ==========
def setup_database():
    """Open ``ukraine_complete_scrape.db`` (relative to the working directory) and ensure its schema.

    Both tables are created only when absent, so calling this again on an
    existing database leaves its data untouched.

    Returns:
        sqlite3.Connection: Open connection with ``posts`` and ``accounts`` present.
    """
    # One row per collected post, keyed by post id
    posts_ddl = '''
        CREATE TABLE IF NOT EXISTS posts (
            id TEXT PRIMARY KEY,
            created_at TEXT,
            account TEXT,
            text TEXT,
            in_reply_to_id TEXT,
            collection_method TEXT,
            keywords_matched TEXT
        )
    '''

    # One row per discovered account, keyed by handle
    accounts_ddl = '''
        CREATE TABLE IF NOT EXISTS accounts (
            account TEXT PRIMARY KEY,
            discovered_from TEXT,
            discovery_date TEXT,
            is_active_poster INTEGER,
            is_commenter INTEGER,
            total_posts INTEGER DEFAULT 0
        )
    '''

    connection = sqlite3.connect(pathlib.Path("ukraine_complete_scrape.db"))
    db_cursor = connection.cursor()
    for ddl in (posts_ddl, accounts_ddl):
        db_cursor.execute(ddl)

    connection.commit()
    return connection

# ========== HELPER FUNCTIONS ==========
def clean_text(html):
    """Turn an HTML fragment into single-spaced plain text.

    Tags are replaced by spaces, HTML character references are decoded once
    (named ones such as ``&quot;`` / ``&nbsp;`` and numeric ones such as
    ``&#39;``), and runs of whitespace are collapsed.

    Args:
        html: HTML string; ``None`` or an empty string is accepted.

    Returns:
        str: The plain text, or ``""`` for falsy input.
    """
    if not html:
        return ""
    # Imported locally because the parameter name `html` hides the stdlib
    # module of the same name inside this function.
    from html import unescape

    # Strip tags before decoding so literal "&lt;b&gt;" text is not mistaken for a tag.
    text = re.sub(r'<[^>]+>', ' ', html)
    # A single decoding pass: "&amp;lt;" becomes "&lt;", not "<".
    text = unescape(text)
    return ' '.join(text.split())

def matches_keywords(text):
    """Return the entries of ``KEYWORDS`` that occur in ``text``.

    Matching is a case-insensitive substring test, so a keyword also matches
    when it appears inside a longer word. Results keep ``KEYWORDS`` order and
    the original spelling of each keyword.

    Args:
        text: String to scan; falsy values yield an empty list.

    Returns:
        list: Matching keywords (possibly empty).
    """
    if not text:
        return []

    haystack = text.lower()
    hits = []
    for keyword in KEYWORDS:
        if keyword.lower() in haystack:
            hits.append(keyword)
    return hits

def save_account(conn, account_name, source, is_poster=False, is_commenter=False):
    """Insert or refresh a discovered account row and commit.

    ``discovered_from`` and ``discovery_date`` are overwritten with the latest
    sighting. The two role flags are sticky: once an account has been seen as
    a poster or commenter the flag stays 1, and a later sighting can turn a 0
    into a 1. (The previous COALESCE froze each flag at whatever value was
    stored first, so an account first seen as a mention could never be marked
    as a poster.)

    NOTE: ``INSERT OR REPLACE`` rewrites the row, so ``total_posts`` returns
    to its column default for an account that is saved again.

    Args:
        conn: Open sqlite3 connection with an ``accounts`` table.
        account_name: Account handle used as the primary key.
        source: Free-text label describing where the account was found.
        is_poster: True when the account authored a collected post.
        is_commenter: True when the account authored a collected reply.

    Returns:
        None
    """
    cursor = conn.cursor()
    cursor.execute('''
        INSERT OR REPLACE INTO accounts (account, discovered_from, discovery_date, is_active_poster, is_commenter)
        VALUES (?, ?, ?,
                MAX(COALESCE((SELECT is_active_poster FROM accounts WHERE account = ?), 0), ?),
                MAX(COALESCE((SELECT is_commenter FROM accounts WHERE account = ?), 0), ?))
    ''', (account_name, source, datetime.now().isoformat(),
          account_name, int(is_poster), account_name, int(is_commenter)))
    conn.commit()

def save_post(conn, post, method):
    """Store one post in the ``posts`` table unless its id is already present.

    The post's HTML ``content`` is reduced to plain text with ``clean_text``
    and up to five matching keywords are recorded alongside it.

    Args:
        conn: Open sqlite3 connection with a ``posts`` table.
        post: Mapping describing the post (reads ``id``, ``created_at``,
            ``account``, ``content`` and ``in_reply_to_id``).
        method: Label stored in ``collection_method``.

    Returns:
        bool: True when a new row was inserted; False when the id already
        existed or the post could not be stored.
    """
    try:
        cursor = conn.cursor()
        text = clean_text(post.get('content', ''))
        keywords = ','.join(matches_keywords(text)[:5])
        # An explicit null account must not make the whole post unsaveable
        account = post.get('account') or {}

        cursor.execute('''
            INSERT OR IGNORE INTO posts 
            (id, created_at, account, text, in_reply_to_id, collection_method, keywords_matched)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        ''', (post.get('id'), post.get('created_at'),
              account.get('acct'), text,
              post.get('in_reply_to_id'), method, keywords))
        conn.commit()
        return cursor.rowcount > 0
    except Exception:
        # Best effort: one bad post must not stop a long scrape. Catching
        # Exception rather than everything lets Ctrl-C reach the callers that
        # handle KeyboardInterrupt.
        return False

# ========== PHASE 1: DISCOVERY ==========
def discover_all_accounts(conn):
    """Discover accounts from keyword searches and, optionally, from replies.

    For every entry in ``SEARCH_QUERIES`` the search results are walked: each
    post is stored with ``save_post``, its author is recorded as a poster and
    every account it mentions is recorded as well. When ``COLLECT_REPLIES`` is
    set, comments are then pulled for up to 100 of the posts that reported
    replies, and their authors are recorded as commenters.

    Args:
        conn: Open sqlite3 connection passed to ``save_account`` / ``save_post``.

    Returns:
        list: Unique account handles discovered (order unspecified).
    """
    discovered = set()
    posts_with_replies = []
    reply_targets_seen = set()   # post ids already queued, so replies are fetched once per post
    reply_post_cap = 100         # replies are pulled for at most this many posts

    print("="*60)
    print("PHASE 1: ACCOUNT DISCOVERY")
    print("="*60)

    # Search discovery
    for query_idx, query in enumerate(SEARCH_QUERIES, 1):
        print(f"\n[{query_idx}/{len(SEARCH_QUERIES)}] Searching: '{query}'")
        posts_found = 0
        accounts_found = 0
        seen_post_ids = set()

        try:
            total_pages = 0

            for _ in range(SEARCH_PAGES_PER_QUERY):
                # Materialise the generator so an empty result can be detected
                results = list(api.search("statuses", query, limit=SEARCH_LIMIT_PER_PAGE))
                if not results:
                    break

                new_posts = 0
                for page in results:
                    if not page or 'statuses' not in page:
                        continue

                    statuses = page.get('statuses', [])
                    if not statuses:
                        continue

                    total_pages += 1

                    for post in statuses:
                        post_id = post.get('id')
                        if post_id is not None:
                            if post_id in seen_post_ids:
                                continue  # already handled in an earlier pass
                            seen_post_ids.add(post_id)
                        new_posts += 1
                        posts_found += 1

                        # Get poster account (None when the post carries no usable account)
                        poster_acct = (post.get('account') or {}).get('acct')
                        if poster_acct:
                            discovered.add(poster_acct)
                            save_account(conn, poster_acct, f"search:{query}", is_poster=True)
                            accounts_found += 1

                        # Save post
                        save_post(conn, post, f"search:{query}")

                        # Track posts that might have replies, paired with this
                        # post's own author rather than a leftover from an earlier post
                        if ((post.get('replies_count') or 0) > 0
                                and post_id is not None
                                and post_id not in reply_targets_seen):
                            reply_targets_seen.add(post_id)
                            posts_with_replies.append((post_id, poster_acct))

                        # Get mentioned accounts
                        for mention in post.get('mentions') or []:
                            if mention.get('acct'):
                                discovered.add(mention['acct'])
                                save_account(conn, mention['acct'], f"mention_in:{query}")

                # The search is issued with identical arguments on every pass,
                # so a pass that surfaced nothing new would only be repeated by
                # further passes. A single page means there is nothing to page.
                if new_posts == 0 or total_pages <= 1:
                    break

                time.sleep(random.uniform(SEARCH_DELAY * 0.5, SEARCH_DELAY * 1.5))

        except Exception as e:
            print(f"  Search error: {str(e)[:50]}")
        print(f"  ✓ Found {posts_found} posts, {accounts_found} poster accounts")
        time.sleep(random.uniform(2, 5))

    print(f"\n📊 Total discovered from search: {len(discovered)} accounts")

    # Reply discovery
    reply_accounts = 0
    if COLLECT_REPLIES and posts_with_replies:
        reply_targets = posts_with_replies[:reply_post_cap]
        print(f"\n🔍 Collecting replies from {len(posts_with_replies)} posts...")

        with tqdm(total=len(reply_targets), desc="Getting replies") as pbar:
            for idx, (post_id, original_poster) in enumerate(reply_targets):
                try:
                    # Use pull_comments to get replies
                    comments = list(api.pull_comments(post_id=post_id))

                    for comment in comments[:MAX_REPLIES_PER_POST]:
                        # Each comment is a status/post object
                        commenter = (comment.get('account') or {}).get('acct')
                        if commenter:
                            if commenter not in discovered:
                                discovered.add(commenter)
                                save_account(conn, commenter, f"reply_to:{original_poster}", is_commenter=True)
                                reply_accounts += 1

                            # Save comment as post
                            save_post(conn, comment, f"reply_to:{post_id}")

                    time.sleep(random.uniform(REPLY_DELAY * 0.5, REPLY_DELAY * 1.5))

                except Exception as e:
                    # Print error for first few to debug
                    if idx < 3:
                        print(f"\n  Comment error for {post_id}: {str(e)[:50]}")
                finally:
                    # Advance for failures too, so the bar reflects posts attempted
                    pbar.update(1)

        print(f"  ✓ Found {reply_accounts} additional commenting accounts")

    print(f"\n✅ TOTAL UNIQUE ACCOUNTS DISCOVERED: {len(discovered)}")
    return list(discovered)

# ========== PHASE 2: HISTORICAL SCRAPING ==========
def scrape_historical_data(account_list, conn):
    """Scrape keyword-matching posts from each account within the date window.

    For every handle the account is looked up, its statuses are streamed with
    ``api.pull_statuses`` and posts dated from ``START_DATE`` to ``END_DATE``
    (both days inclusive) whose text matches a keyword are stored via
    ``save_post``. Streaming for an account stops at the first post dated
    before ``START_DATE`` or once ``MAX_POSTS_PER_ACCOUNT`` posts were saved.

    Args:
        account_list: Sequence of account handles (a leading ``@`` is tolerated).
        conn: Open sqlite3 connection with ``posts`` and ``accounts`` tables.

    Returns:
        int: Number of posts newly saved across all accounts.
    """
    print("\n" + "="*60)
    print("PHASE 2: HISTORICAL DATA COLLECTION")
    print("="*60)
    print(f"Scraping {len(account_list)} accounts back to {START_DATE}\n")

    total_posts = 0
    accounts_scraped = 0

    for idx, handle in enumerate(account_list, 1):
        try:
            # Clean the handle - remove @ and any whitespace
            handle_clean = handle.strip().lstrip('@')

            # Print the header first so a failure tag below is attributable to an account
            print(f"[{idx}/{len(account_list)}] @{handle_clean}", end='', flush=True)

            # Verify account exists using lookup
            try:
                user_info = api.lookup(user_handle=handle_clean)
            except Exception:
                print(" [LOOKUP ERROR]")
                continue
            if not user_info:
                print(" [NOT FOUND]")
                continue

            # Pull statuses using clean handle
            gen = api.pull_statuses(username=handle_clean, replies=False, verbose=False)

            account_posts = 0
            for post in gen:
                # Date check on the "YYYY-MM-DD" prefix only. Comparing the full
                # timestamp against a bare date would sort every post made on
                # END_DATE after END_DATE and drop that whole day.
                created_day = (post.get('created_at') or '')[:10]
                if not created_day:
                    continue  # undated post: skip it rather than abandon the account
                # NOTE(review): stopping at the first older post assumes the
                # stream is newest-first — confirm for pinned posts.
                if created_day < START_DATE:
                    break
                if created_day > END_DATE:
                    continue

                # Quick keyword check
                text = clean_text(post.get('content', ''))
                if matches_keywords(text):
                    if save_post(conn, post, f"historical:{handle}"):
                        account_posts += 1
                        total_posts += 1

                        # Progress indicator, only when the saved count actually
                        # reaches a multiple (never at 0 or on an unsaved post)
                        if account_posts % BATCH_PROCESS_SIZE == 0:
                            print(f" {account_posts}", end='', flush=True)

                # Stop if hit limit
                if account_posts >= MAX_POSTS_PER_ACCOUNT:
                    break

            print(f" → {account_posts} posts")
            accounts_scraped += 1

            # Update account post count (keyed by the handle exactly as it was discovered)
            cursor = conn.cursor()
            cursor.execute('UPDATE accounts SET total_posts = ? WHERE account = ?',
                           (account_posts, handle))
            conn.commit()

            time.sleep(random.uniform(ACCOUNT_SCRAPE_DELAY * 0.5, ACCOUNT_SCRAPE_DELAY * 1.5))

        except KeyboardInterrupt:
            print("\n⏹ Interrupted by user")
            break
        except Exception as e:
            print(f" ERROR: {str(e)[:30]}")

    print(f"\n✅ Scraped {accounts_scraped} accounts, collected {total_posts} posts")
    return total_posts

# ========== EXPORT FUNCTIONS ==========
def export_accounts_csv(conn):
    """Write the ``accounts`` table, most prolific first, to a timestamped CSV.

    The file lands in ``<current working directory>/outputs/`` (created when
    missing). The ``is_active_poster`` column is written under the header
    ``is_poster``.

    Args:
        conn: Open sqlite3 connection with an ``accounts`` table.

    Returns:
        pathlib.Path: Location of the CSV that was written.
    """
    header = ['account', 'discovered_from', 'discovery_date',
              'is_poster', 'is_commenter', 'total_posts']

    account_cursor = conn.cursor()
    account_cursor.execute('''
        SELECT account, discovered_from, discovery_date, 
               is_active_poster, is_commenter, total_posts
        FROM accounts
        ORDER BY total_posts DESC
    ''')

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    destination = pathlib.Path.cwd() / "outputs" / f"ukraine_accounts_{stamp}.csv"
    destination.parent.mkdir(parents=True, exist_ok=True)

    with destination.open("w", newline="", encoding="utf-8") as handle:
        sheet = csv.writer(handle)
        sheet.writerow(header)
        account_rows = account_cursor.fetchall()
        sheet.writerows(account_rows)

    print(f"\n📄 Accounts CSV: {destination}")
    return destination

def export_posts_csv(conn):
    """Write every row of ``posts``, newest first, to a timestamped CSV.

    The file lands in ``<current working directory>/outputs/`` (created when
    missing). The header row is read from the query result, so it always lines
    up with what ``SELECT *`` returns, including for a database file whose
    ``posts`` table has more or fewer columns than the schema created here.

    Args:
        conn: Open sqlite3 connection with a ``posts`` table.

    Returns:
        pathlib.Path: Location of the CSV that was written.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM posts ORDER BY created_at DESC')

    base_dir = pathlib.Path.cwd()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = base_dir / "outputs" / f"ukraine_posts_{timestamp}.csv"
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        # Column names come from the cursor, not a hand-maintained list
        writer.writerow([column[0] for column in cursor.description])
        # Stream rows from the cursor instead of loading the whole table
        writer.writerows(cursor)

    print(f"📄 Posts CSV: {out_path}")
    return out_path

# ========== MAIN EXECUTION ==========
def run_complete_scrape():
    """Run discovery, export the account list, scrape history, then report and export posts.

    The database connection opened by ``setup_database`` is closed on the way
    out whether or not a step raises.

    Returns:
        None
    """
    started = time.time()
    db = setup_database()

    try:
        # Phase 1: find accounts, then write the account list straight away
        accounts = discover_all_accounts(db)
        export_accounts_csv(db)

        # Phase 2: only worth running when discovery produced something
        if accounts:
            scrape_historical_data(accounts, db)

        # Totals come from the database, not from the phase return values
        account_total = db.cursor().execute('SELECT COUNT(DISTINCT account) FROM accounts').fetchone()[0]
        post_total = db.cursor().execute('SELECT COUNT(*) FROM posts').fetchone()[0]
        minutes = (time.time() - started) / 60

        rule = '=' * 60
        print(f"\n{rule}")
        print("COLLECTION COMPLETE")
        print(rule)
        print(f"Total accounts found: {account_total}")
        print(f"Total posts collected: {post_total}")
        print(f"Time taken: {minutes:.1f} minutes")
        print(f"Rate: {post_total/minutes:.1f} posts/minute")

        # Final artefact: the posts themselves
        export_posts_csv(db)

    finally:
        db.close()

# Run the scraper: starts run_complete_scrape() when this code is executed as the
# main module; the guard keeps an import of this code from launching a scrape.
if __name__ == "__main__":
    run_complete_scrape()

In [None]:
# ─────────── Historical Scraper with Smart Rate Limiting ───────────
import pandas as pd
import os, re, csv, pathlib, random, time, json, sys
from datetime import datetime, timezone
from dateutil import parser as dt_parse
from tqdm.auto import tqdm
import sqlite3
import truthbrush as tb

# ========== INITIALIZE API FIRST ==========
api = tb.api.Api()
print("✓ API initialized")

# ========== INSTALL SMART RATE LIMITER ==========
# Wraps Api._get at most once per kernel. `_get_base` doubles as the
# "already patched" marker: if anything earlier in the session has set it,
# this wrapper is NOT installed and the existing one stays active.
if not hasattr(tb.api.Api, "_get_base"):
    tb.api.Api._get_base = tb.api.Api._get

    def _polite_get(self, url, params=None):
        """Call the original ``_get`` and pace the next request.

        After each call:
        * a response with a ``text`` attribute containing "Error 1015" causes
          a 60 s sleep and ``None`` is returned instead of the response;
        * with ``ratelimit_remaining`` at 10 or below and ``ratelimit_reset``
          set, sleep until the reset time plus 1-3 s of jitter;
        * otherwise sleep 0.3-0.5 s.

        Exceptions from the original call are re-raised; ones whose message
        mentions "1015" or "rate limit" are preceded by a 60 s sleep.
        """
        try:
            resp = self._get_base(url, params)

            # Check if we got rate limited by Cloudflare
            if resp and hasattr(resp, 'text') and 'Error 1015' in str(resp.text):
                print("\n🚫 Cloudflare rate limit detected! Backing off...")
                time.sleep(60)  # Wait 1 minute
                return None

            # Long wait only when close to the API rate limit
            if (self.ratelimit_remaining is not None
                    and self.ratelimit_remaining <= 10
                    and self.ratelimit_reset):
                # datetime.now(timezone.utc) is the aware "now" that
                # utcnow().replace(tzinfo=utc) produced; utcnow() is deprecated.
                # NOTE(review): assumes ratelimit_reset is a timezone-aware datetime — confirm.
                wait = max(
                    0,
                    (self.ratelimit_reset -
                     datetime.now(timezone.utc)).total_seconds()
                ) + random.uniform(1, 3)
                print(f"\n📉 Near API rate limit - sleeping {wait:.1f}s")
                time.sleep(wait)
            else:
                # Otherwise a short pause on every request to avoid Cloudflare triggers
                time.sleep(random.uniform(0.3, 0.5))
            return resp
        except Exception as e:
            if "1015" in str(e) or "rate limit" in str(e).lower():
                print("\n🚫 Rate limited - waiting 60 seconds...")
                time.sleep(60)
            raise

    tb.api.Api._get = _polite_get
    print("✓ Smart rate limiter installed")
else:
    print("✓ Rate limiter already installed")

# ========== CONFIGURATION ==========
MAX_POSTS_PER_ACCOUNT = 5000    # Per-account post limit
ACCOUNT_SCRAPE_DELAY = 0.1      # Minimal delay between accounts
BATCH_SIZE = 100                # Batch database commits

# Date range (ISO "YYYY-MM-DD" strings)
START_DATE = "2022-02-20"       
END_DATE = "2025-06-05"         

# Keywords (lower-cased into KEYWORD_PATTERNS and matched as substrings)
KEYWORDS = [
    "ukraine", "ukrainian", "україна", "український", "kyiv", "kiev",
    "russia", "russian", "россия", "русский", "moscow", "kremlin",
    "putin", "путин", "zelensky", "zelenskyy", "зеленський",
    "war", "invasion", "conflict", "special operation", "smo",
    "nato", "sanctions", "donbas", "crimea", "mariupol", "bakhmut"
]

# Path to your existing accounts CSV.
# Set the UKRAINE_ACCOUNTS_CSV environment variable (for example in .env) to
# point at a different file. Without it the original machine-specific path is
# used, so existing runs behave exactly as before.
ACCOUNTS_CSV = os.environ.get(
    "UKRAINE_ACCOUNTS_CSV",
    "/Users/willbeeson/Projects/Courses/GOV20/ukraine-final-project/notebooks/outputs/ukraine_accounts_20250605_141035.csv",
)

# High-value accounts (scraped in addition to those listed in ACCOUNTS_CSV)
HIGH_VALUE_ACCOUNTS = [
    "realDonaldTrump", "DevinNunes", "DonaldJTrumpJr", "EricTrump",
    "TuckerCarlson", "DineshDSouza", "JackPosobiec", "WarRoom",
    "BreitbartNews", "OANN", "KariLake", "MattGaetz"
]

# ========== OPTIMIZED HELPER FUNCTIONS ==========
def setup_database():
    """Open ``ukraine_historical_scrape.db`` (relative to the working directory) and ensure its schema.

    The connection is switched to WAL journaling with ``synchronous=NORMAL``
    before the ``posts`` and ``scrape_status`` tables are created if absent.

    Returns:
        sqlite3.Connection: Open, configured connection.
    """
    # One row per collected post, keyed by post id
    posts_ddl = '''
        CREATE TABLE IF NOT EXISTS posts (
            id TEXT PRIMARY KEY,
            created_at TEXT,
            account TEXT,
            text TEXT,
            in_reply_to_id TEXT,
            collection_method TEXT,
            keywords_matched TEXT,
            scraped_at TEXT
        )
    '''

    # One row per account, recording how far its scrape got
    status_ddl = '''
        CREATE TABLE IF NOT EXISTS scrape_status (
            account TEXT PRIMARY KEY,
            posts_found INTEGER,
            posts_collected INTEGER,
            status TEXT,
            last_updated TEXT
        )
    '''

    connection = sqlite3.connect(pathlib.Path("ukraine_historical_scrape.db"))
    # Write-speed settings, applied before any table work
    for pragma in ("PRAGMA journal_mode=WAL", "PRAGMA synchronous=NORMAL"):
        connection.execute(pragma)

    db_cursor = connection.cursor()
    for ddl in (posts_ddl, status_ddl):
        db_cursor.execute(ddl)

    connection.commit()
    return connection

def clean_text(html):
    """Turn an HTML fragment into single-spaced plain text.

    Tags are replaced by spaces, HTML character references are decoded once
    (named ones such as ``&quot;`` / ``&nbsp;`` and numeric ones such as
    ``&#39;``), and runs of whitespace are collapsed.

    Args:
        html: HTML string; ``None`` or an empty string is accepted.

    Returns:
        str: The plain text, or ``""`` for falsy input.
    """
    if not html:
        return ""
    # Imported locally because the parameter name `html` hides the stdlib
    # module of the same name inside this function.
    from html import unescape

    # Strip tags before decoding so literal "&lt;b&gt;" text is not mistaken for a tag.
    text = re.sub(r'<[^>]+>', ' ', html)
    # A single decoding pass: "&amp;lt;" becomes "&lt;", not "<".
    text = unescape(text)
    return ' '.join(text.split())

# Lower-cased copies of KEYWORDS, built once so matching does not lower-case
# them again on every call. These are plain strings used for substring tests;
# nothing is regex-compiled.
KEYWORD_PATTERNS = list(map(str.lower, KEYWORDS))

def matches_keywords(text):
    """Return the lower-cased keywords that occur in ``text``.

    Matching is a case-insensitive substring test, so a keyword also matches
    when it appears inside a longer word. Results follow ``KEYWORD_PATTERNS``
    order.

    Args:
        text: String to scan; falsy values yield an empty list.

    Returns:
        list: Matching entries of ``KEYWORD_PATTERNS`` (possibly empty).
    """
    if not text:
        return []

    haystack = text.lower()
    return list(filter(lambda pattern: pattern in haystack, KEYWORD_PATTERNS))

def save_posts_batch(conn, posts_batch):
    """Insert a batch of posts with one ``executemany`` call and one commit.

    Posts whose id is already stored are ignored by the database.

    Args:
        conn: Open sqlite3 connection with a ``posts`` table.
        posts_batch: Sequence of ``(post, method)`` pairs, where ``post`` is a
            mapping describing the post and ``method`` is the label stored in
            ``collection_method``.

    Returns:
        int: 0 for an empty batch, otherwise the cursor's ``rowcount`` after
        the batched insert.
    """
    if not posts_batch:
        return 0

    def _as_row(post, method):
        """Build the parameter tuple for one post."""
        body = clean_text(post.get('content', ''))
        matched = ','.join(matches_keywords(body)[:5])
        return (
            post.get('id'),
            post.get('created_at'),
            post.get('account', {}).get('acct'),
            body,
            post.get('in_reply_to_id'),
            method,
            matched,
            datetime.now().isoformat(),
        )

    batch_cursor = conn.cursor()
    rows = [_as_row(post, method) for post, method in posts_batch]

    batch_cursor.executemany('''
        INSERT OR IGNORE INTO posts 
        (id, created_at, account, text, in_reply_to_id, collection_method, keywords_matched, scraped_at)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    ''', rows)

    conn.commit()
    return batch_cursor.rowcount

def update_scrape_status(conn, account, posts_found, posts_collected, status):
    """Record (or overwrite) the scrape progress row for one account and commit.

    Args:
        conn: Open sqlite3 connection with a ``scrape_status`` table.
        account: Account handle used as the primary key.
        posts_found: Count stored in ``posts_found``.
        posts_collected: Count stored in ``posts_collected``.
        status: Free-text state label stored in ``status``.

    Returns:
        None
    """
    stamped_row = (account, posts_found, posts_collected, status, datetime.now().isoformat())
    status_cursor = conn.cursor()
    status_cursor.execute('''
        INSERT OR REPLACE INTO scrape_status 
        (account, posts_found, posts_collected, status, last_updated)
        VALUES (?, ?, ?, ?, ?)
    ''', stamped_row)
    conn.commit()

# ========== MAIN SCRAPING FUNCTION ==========
def scrape_accounts_from_csv():
    """Load accounts from CSV and scrape their historical posts.

    Steps:
      1. Read the ``account`` column of ACCOUNTS_CSV and merge in
         HIGH_VALUE_ACCOUNTS.
      2. Open the database via setup_database() and skip every handle whose
         scrape_status row is already "completed" (resume support).
      3. For each remaining handle, look the account up, iterate its statuses,
         and store posts dated within [START_DATE, END_DATE] whose cleaned
         text matches a keyword. Saves happen in batches of BATCH_SIZE.
      4. Record a per-account outcome with update_scrape_status().
      5. Print summary statistics, export both tables to CSV, and close the
         connection.

    Ctrl-C stops the loop after flushing the pending batch; any other error is
    recorded against the account and the loop moves on.

    Returns:
        None.
    """
    from itertools import chain

    # Load existing accounts
    print(f"Loading accounts from: {ACCOUNTS_CSV}")
    accounts_df = pd.read_csv(ACCOUNTS_CSV)
    existing_accounts = accounts_df['account'].tolist()

    # Add high-value accounts
    all_accounts = list(set(existing_accounts + HIGH_VALUE_ACCOUNTS))

    print(f"Total accounts to scrape: {len(all_accounts)}")
    print(f"  - From CSV: {len(existing_accounts)}")
    print(f"  - High-value additions: {len(set(HIGH_VALUE_ACCOUNTS) - set(existing_accounts))}")

    # Setup database
    conn = setup_database()
    print("✓ Database ready")

    # Check if we've already scraped some accounts. The status value is bound
    # as a parameter: a double-quoted "completed" inside the SQL text is parsed
    # by SQLite as an identifier first and only falls back to a string literal.
    cursor = conn.cursor()
    cursor.execute('SELECT account FROM scrape_status WHERE status = ?', ("completed",))
    already_scraped = {row[0] for row in cursor.fetchall()}

    if already_scraped:
        print(f"\nResuming: {len(already_scraped)} accounts already scraped")

    # Normalise handles BEFORE filtering. scrape_status stores the cleaned
    # handle, so comparing raw CSV values ("@name", padded names) against it
    # would re-scrape finished accounts. Non-string cells (e.g. NaN from blank
    # CSV rows) and duplicates after normalisation are dropped here.
    accounts_to_scrape = []
    queued = set()
    for raw_account in all_accounts:
        if not isinstance(raw_account, str):
            continue
        handle = raw_account.strip().lstrip('@')
        if not handle or handle in already_scraped or handle in queued:
            continue
        queued.add(handle)
        accounts_to_scrape.append(handle)

    print(f"\n{'='*60}")
    print(f"HISTORICAL DATA COLLECTION")
    print(f"{'='*60}")
    print(f"Accounts to scrape: {len(accounts_to_scrape)}")
    print(f"Date range: {START_DATE} to {END_DATE}")
    print(f"Max posts per account: {MAX_POSTS_PER_ACCOUNT}")
    print(f"{'='*60}\n")

    # Global stats
    global_start_time = time.time()
    total_posts_all_accounts = 0
    successful_accounts = 0

    for idx, handle in enumerate(accounts_to_scrape, 1):
        # Per-account state is bound before the try block so the exception
        # handlers below never see names left over from a previous account.
        posts_found = 0
        posts_collected = 0
        posts_batch = []
        pbar = None

        try:
            # Show what we're doing
            print(f"[{idx}/{len(accounts_to_scrape)}] @{handle}", end='', flush=True)

            # Verify account
            try:
                user_info = api.lookup(user_handle=handle)
            except Exception:
                print(" [LOOKUP ERROR]")
                update_scrape_status(conn, handle, 0, 0, "lookup_error")
                continue
            if not user_info:
                print(" [NOT FOUND]")
                update_scrape_status(conn, handle, 0, 0, "not_found")
                continue

            # Space for progress bar
            print("")

            # Pull statuses
            print(f"  → Sending first request...", end='', flush=True)
            gen = api.pull_statuses(username=handle, replies=False, verbose=False)

            # Fetch the first post eagerly to verify the generator works
            try:
                first_post = next(gen)
                print(" ✓ Connected")
            except StopIteration:
                print(" ⚠️ No posts found")
                update_scrape_status(conn, handle, 0, 0, "no_posts")
                continue

            # Put the first post back in front of the remaining stream
            gen = chain([first_post], gen)

            posts_before_date = 0

            # Account rate monitoring
            account_start = time.time()

            pbar = tqdm(desc=f"  @{handle}", unit=" posts", position=0, leave=False)

            for post in gen:
                posts_found += 1
                pbar.update(1)

                # Date check (plain string comparison against the configured
                # bounds). A missing/None timestamp is treated as "".
                created_at = post.get('created_at') or ''
                if created_at < START_DATE:
                    posts_before_date += 1
                    # Stop only after more than 50 too-old posts, so a few
                    # old entries in the stream do not end the scan early.
                    if posts_before_date > 50:
                        pbar.set_description(f"  @{handle} - Reached old posts")
                        break
                    continue

                if created_at > END_DATE:
                    continue

                # Keyword check
                text = clean_text(post.get('content', ''))
                if text and matches_keywords(text):
                    posts_batch.append((post, f"historical:{handle}"))
                    posts_collected += 1
                    total_posts_all_accounts += 1

                    # Batch save
                    if len(posts_batch) >= BATCH_SIZE:
                        save_posts_batch(conn, posts_batch)
                        posts_batch = []
                        pbar.set_description(f"  @{handle} - {posts_collected} collected")

                # Update progress with detailed info every 50 scanned posts
                if posts_found % 50 == 0:
                    elapsed = time.time() - account_start
                    rate = posts_found / elapsed if elapsed > 0 else 0

                    global_elapsed = time.time() - global_start_time
                    global_rate = total_posts_all_accounts / global_elapsed if global_elapsed > 0 else 0

                    pbar.set_postfix_str(f"collected: {posts_collected} | {rate:.1f} posts/s | global: {global_rate:.1f} p/s")

                    # Also show the date currently being processed
                    pbar.set_description(f"  @{handle} [{created_at[:10] if created_at else 'N/A'}]")

                # Limit checks
                if posts_collected >= MAX_POSTS_PER_ACCOUNT:
                    pbar.set_description(f"  @{handle} - Hit collection limit")
                    break

                if posts_found >= 10000:
                    pbar.set_description(f"  @{handle} - Hit scan limit")
                    break

            # Save remaining batch and clear it so it can never be re-flushed
            if posts_batch:
                save_posts_batch(conn, posts_batch)
                posts_batch = []

            # Close the bar before printing the summary lines below
            pbar.close()
            pbar = None

            # Account summary
            account_time = time.time() - account_start
            account_rate = posts_found / account_time if account_time > 0 else 0

            print(f"  ✓ Collected: {posts_collected}/{posts_found} posts [{account_rate:.1f} posts/s]")

            # Global progress update
            global_elapsed = time.time() - global_start_time
            global_rate = total_posts_all_accounts / global_elapsed if global_elapsed > 0 else 0
            print(f"  📊 Total progress: {total_posts_all_accounts} posts from {successful_accounts + 1} accounts [{global_rate:.1f} posts/s overall]\n")

            successful_accounts += 1
            update_scrape_status(conn, handle, posts_found, posts_collected, "completed")

            # NO delay between accounts - go straight to next one

        except KeyboardInterrupt:
            print("\n\n⏹ Interrupted - progress saved")
            # Flush whatever this account collected so far
            if posts_batch:
                save_posts_batch(conn, posts_batch)
            break
        except Exception as e:
            print(f" ERROR: {str(e)[:100]}")
            update_scrape_status(conn, handle, 0, 0, f"error: {str(e)[:50]}")
        finally:
            # Release the progress bar on every exit path (error, interrupt).
            if pbar is not None:
                pbar.close()

    # Final stats
    total_time = time.time() - global_start_time
    overall_rate = total_posts_all_accounts / total_time if total_time > 0 else 0

    print(f"\n{'='*60}")
    print(f"COLLECTION COMPLETE")
    print(f"{'='*60}")

    try:
        cursor = conn.cursor()
        cursor.execute('SELECT COUNT(*) FROM posts')
        total_in_db = cursor.fetchone()[0]

        cursor.execute('SELECT COUNT(DISTINCT account) FROM posts')
        unique_accounts = cursor.fetchone()[0]

        print(f"Total posts collected: {total_in_db}")
        print(f"Unique accounts with posts: {unique_accounts}")
        print(f"Accounts successfully scraped: {successful_accounts}")
        print(f"Overall rate: {overall_rate:.1f} posts/second")
        print(f"Total time: {total_time/60:.1f} minutes")

        # Export
        export_posts_csv(conn)
        export_status_csv(conn)
    finally:
        # Close the connection even when the summary or export step fails.
        conn.close()

def export_posts_csv(conn):
    """Dump the whole ``posts`` table, newest ``created_at`` first, to a CSV.

    The file is written to ``<cwd>/outputs/ukraine_historical_posts_<timestamp>.csv``
    (the directory is created if missing) with a fixed header row, and the
    resulting path is printed.
    """
    result = conn.cursor()
    result.execute('SELECT * FROM posts ORDER BY created_at DESC')

    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    target = pathlib.Path.cwd() / "outputs" / f"ukraine_historical_posts_{stamp}.csv"
    target.parent.mkdir(parents=True, exist_ok=True)

    header = [
        'id', 'created_at', 'account', 'text',
        'in_reply_to_id', 'collection_method', 'keywords_matched', 'scraped_at',
    ]
    with target.open("w", newline="", encoding="utf-8") as handle:
        sink = csv.writer(handle)
        sink.writerow(header)
        sink.writerows(result.fetchall())

    print(f"\n📄 Posts CSV: {target}")

def export_status_csv(conn):
    """Dump the ``scrape_status`` table to a timestamped CSV.

    Rows are ordered by ``posts_collected`` descending and written with a
    fixed header to ``<cwd>/outputs/scrape_status_<timestamp>.csv``; the
    resulting path is printed.

    Args:
        conn: Open sqlite3 connection containing a ``scrape_status`` table.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM scrape_status ORDER BY posts_collected DESC')

    ROOT = pathlib.Path.cwd()
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    out_path = ROOT / "outputs" / f"scrape_status_{timestamp}.csv"
    # Create the output directory here as well, so this export works when it is
    # called on its own (export_posts_csv does the same for its own file).
    out_path.parent.mkdir(parents=True, exist_ok=True)

    with out_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(['account', 'posts_found', 'posts_collected', 'status', 'last_updated'])
        writer.writerows(cursor.fetchall())

    print(f"📄 Status CSV: {out_path}")

# Entry point: start the full historical scrape, but only when this code is
# executed as the main module (not when it is imported from elsewhere).
if __name__ == "__main__":
    scrape_accounts_from_csv()

In [None]:
# Add this debug function to check available methods:
def check_api_methods():
    """Print the name of every public, callable attribute of the global ``api``.

    Names starting with an underscore are skipped; output is one
    ``  - name`` line per attribute, in ``dir()`` order, after a heading.
    """
    print("\nAvailable API methods:")
    public_names = (name for name in dir(api) if not name.startswith('_'))
    for name in public_names:
        if callable(getattr(api, name)):
            print(f"  - {name}")

# Debug aid: list the public methods of `api` before the Phase 1 search is run.
check_api_methods()

In [None]:
#!/usr/bin/env python3
"""
Simple Truth Social Ukraine Scraper
Just run this single file to collect Ukraine-related posts fast!
"""

import truthbrush as tb
import pandas as pd
import time
from datetime import datetime
import csv
import os

# Banner for the run
print("🚀 Truth Social Ukraine Speed Scraper")
print("="*50)

# Initialize API
# NOTE(review): this only constructs the client object; whether a network
# connection/login happens at this point is not visible here - confirm.
api = tb.api.Api()
print("✓ API connected")

# Keywords to search: each is sent as a search query in Phase 1 and used as a
# lower-case substring filter on post text in Phases 2 and 3.
KEYWORDS = ["ukraine", "russia", "putin", "zelensky", "war", "nato", "kyiv", "invasion", "sanctions", "missile"]

# Storage
all_posts = []      # one flat dict per collected post, in collection order
seen_ids = set()    # ids of posts already collected, used to skip duplicates

def clean_text(html):
    """Convert an HTML fragment into single-spaced plain text.

    Tags are replaced by spaces, HTML character references (&amp;, &quot;,
    &#39;, ...) are decoded, and every run of whitespace is collapsed to one
    space, so the stored text is clean for keyword matching and CSV export.

    Args:
        html: HTML string; may be None or empty.

    Returns:
        The plain-text content, or "" when ``html`` is falsy.
    """
    import re
    # Imported by name because the parameter ``html`` shadows the stdlib
    # module of the same name inside this function.
    from html import unescape

    if not html:
        return ""
    text = re.sub(r'<[^>]+>', ' ', html)
    # One unescape pass decodes all references without double-decoding.
    text = unescape(text)
    return ' '.join(text.split())

def save_posts(posts, filename):
    """Write ``posts`` (a list of dicts) to ``filename`` as CSV without an index.

    Does nothing - no file, no message - when ``posts`` is empty.
    """
    if posts:
        pd.DataFrame(posts).to_csv(filename, index=False)
        print(f"💾 Saved {len(posts)} posts to {filename}")

# MAIN COLLECTION LOOP
# Three phases fill `all_posts`: keyword search, trending posts, then the
# timelines of the accounts seen most often so far. Ctrl-C at any point stops
# collection and falls through to the final save.
print(f"\n📡 Starting collection at {datetime.now()}")
start_time = time.time()


def _collect(post, keyword, text=None, account=None):
    """Append ``post`` to ``all_posts`` unless its id was already collected.

    Args:
        post: Status dict returned by the API.
        keyword: Label stored in the record's ``keyword`` column.
        text: Already-cleaned text; computed from ``content`` when omitted.
        account: Account name to store; read from the post when omitted.

    Returns:
        True when the post was added, False when it was a duplicate.
    """
    post_id = post.get('id')
    if post_id in seen_ids:
        return False
    seen_ids.add(post_id)

    if text is None:
        text = clean_text(post.get('content', ''))
    if account is None:
        # `or {}` also covers an explicit None under the 'account' key
        account = (post.get('account') or {}).get('acct', '')

    all_posts.append({
        'id': post_id,
        'created_at': post.get('created_at'),
        'account': account,
        'text': text,
        'url': post.get('url', ''),
        'reblogs_count': post.get('reblogs_count', 0),
        'favourites_count': post.get('favourites_count', 0),
        'keyword': keyword
    })
    return True


def _mentions_keyword(text):
    """Return True when any entry of KEYWORDS occurs in ``text`` (case-insensitive)."""
    lowered = text.lower()
    return any(kw in lowered for kw in KEYWORDS)


try:
    # 1. Search for Ukraine posts
    print("\n🔍 Phase 1: Searching for Ukraine content...")

    for i, keyword in enumerate(KEYWORDS):
        print(f"  Searching: {keyword} ({i+1}/{len(KEYWORDS)})")

        try:
            # Use the search API
            results = api.search(searchtype="statuses", query=keyword, limit=40)

            # Handle the results: only dict pages carrying 'statuses' are used
            if hasattr(results, '__iter__'):
                for page in results:
                    if isinstance(page, dict) and 'statuses' in page:
                        posts = page['statuses']
                    else:
                        continue

                    for post in posts:
                        _collect(post, keyword)

                    # Progress (skipped while nothing has been collected, and
                    # guarded against a zero elapsed time)
                    if all_posts and len(all_posts) % 100 == 0:
                        elapsed = time.time() - start_time
                        rate = len(all_posts) / elapsed if elapsed > 0 else 0
                        print(f"    📊 {len(all_posts)} posts | {rate:.1f} posts/sec")

                    # Stop paging once the running total exceeds the cumulative
                    # budget of 500 posts per keyword processed so far
                    if len(all_posts) > 500 * (i + 1):
                        break

        except Exception as e:
            print(f"    ⚠️  Error with {keyword}: {str(e)[:50]}")
            continue

        # Small delay between keywords
        time.sleep(0.5)

    # 2. Get trending posts
    print("\n📈 Phase 2: Checking trending posts...")
    try:
        trends = api.trending(limit=20)
        ukraine_trends = 0

        for post in trends:
            text = clean_text(post.get('content', ''))
            if _mentions_keyword(text) and _collect(post, 'trending', text=text):
                ukraine_trends += 1

        print(f"  ✓ Found {ukraine_trends} trending Ukraine posts")
    except Exception as e:
        # `except Exception` (not a bare except) lets Ctrl-C reach the
        # KeyboardInterrupt handler at the bottom instead of being swallowed.
        print(f"  ⚠️  Could not get trending posts: {str(e)[:50]}")

    # 3. Quick user scraping for top accounts
    print("\n👥 Phase 3: Checking top Ukraine accounts...")

    # Find most active accounts from our posts
    account_counts = {}
    for post in all_posts:
        acc = post.get('account', '')
        if acc:
            account_counts[acc] = account_counts.get(acc, 0) + 1

    # Get top 10 most active accounts
    top_accounts = sorted(account_counts.items(), key=lambda x: x[1], reverse=True)[:10]

    for account, count in top_accounts:
        print(f"  Checking @{account} ({count} posts found)...")

        try:
            posts_checked = 0
            for post in api.pull_statuses(account, replies=False, verbose=False):
                posts_checked += 1

                # Quick Ukraine check
                text = clean_text(post.get('content', ''))
                if _mentions_keyword(text):
                    _collect(post, f'user:{account}', text=text, account=account)

                # Don't check too many per user
                if posts_checked > 100:
                    break

        except Exception as e:
            # Report the failure and move on to the next account; Ctrl-C is
            # deliberately NOT caught here so the user can stop the run.
            print(f"    ⚠️  Error with @{account}: {str(e)[:50]}")
            continue

        time.sleep(0.5)  # Be nice to the API

except KeyboardInterrupt:
    print("\n\n⏹️  Stopped by user")
except Exception as e:
    print(f"\n❌ Error: {e}")

# Final stats and save
elapsed = time.time() - start_time
# Guard against a zero elapsed time (e.g. an immediate abort)
overall_rate = len(all_posts) / elapsed if elapsed > 0 else 0.0
print(f"\n{'='*50}")
print(f"📊 COLLECTION COMPLETE")
print(f"{'='*50}")
print(f"Total posts: {len(all_posts)}")
print(f"Unique posts: {len(seen_ids)}")
print(f"Time: {elapsed/60:.1f} minutes")
print(f"Rate: {overall_rate:.1f} posts/second")

# Save to CSV (save_posts writes nothing when the list is empty)
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f"ukraine_truth_social_{timestamp}.csv"
save_posts(all_posts, filename)

if all_posts:
    print(f"\n✅ Done! Your data is in: {filename}")

    # Quick stats
    df = pd.DataFrame(all_posts)
    print(f"\n📈 Quick Stats:")
    print(f"  - Date range: {df['created_at'].min()} to {df['created_at'].max()}")
    print(f"  - Top accounts: {df['account'].value_counts().head(5).to_dict()}")
    print(f"  - Most reblogged: {df.nlargest(3, 'reblogs_count')[['text', 'reblogs_count']].values[0]}")
else:
    # Only claim a file exists when one was actually written.
    print("\n⚠️  No posts collected - no file was written")