In [None]:
import os, json, time, requests, pandas as pd, numpy as np
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
from openai import OpenAI

In [None]:
# --- .env loader that works in notebooks ------------------------------------
from pathlib import Path
from dotenv import load_dotenv
import os, sys

def find_repo_root(start: Path, marker=".git") -> Path:
    """Return the nearest directory at or above ``start`` that contains ``marker``.

    Args:
        start: Path to begin searching from; it is resolved to an absolute path first.
        marker: Name of the file or folder that identifies the root ('.git' or '.env').

    Returns:
        The first directory, walking upwards from ``start``, in which ``marker`` exists.

    Raises:
        FileNotFoundError: If no directory up to and including the filesystem
            root contains ``marker``.
    """
    cur = start.resolve()
    # `parents` runs all the way up to the filesystem root, so the root itself
    # is tested too (a `while cur != cur.parent` walk stops just short of it).
    for candidate in (cur, *cur.parents):
        if (candidate / marker).exists():
            return candidate
    raise FileNotFoundError(f"Repository root with {marker} not found from {start}")

# 1) locate repo root — the nearest ancestor of the working directory that holds a .env file
repo_root = find_repo_root(Path.cwd(), ".env")

# 2) load environment variables from that .env
load_dotenv(repo_root / ".env")

# 3) add src/ to Python path (optional, if you’ll import from src/);
#    guarded so that re-running this cell does not stack duplicate entries
src_path = repo_root / "src"
if src_path.exists() and str(src_path) not in sys.path:
    sys.path.append(str(src_path))

# 4) fetch secrets (raise fast if any missing)
REQUIRED = ["OPENAI_API_KEY", "NEWSAPI_KEY"]
CREDS = {k: os.getenv(k) for k in REQUIRED}
missing = [k for k, v in CREDS.items() if not v]   # unset *or* empty counts as missing
if missing:
    raise RuntimeError(f"Missing secrets in .env: {', '.join(missing)}")

# handy variables
OPENAI_KEY       = CREDS["OPENAI_API_KEY"]
NEWSAPI_KEY      = CREDS["NEWSAPI_KEY"]
# Reddit settings are not in REQUIRED, so each of these may be None
REDDIT_CLIENT_ID = os.getenv("REDDIT_CLIENT_ID")
REDDIT_SECRET    = os.getenv("REDDIT_CLIENT_SECRET")
REDDIT_AGENT     = os.getenv("REDDIT_USER_AGENT")

print(f"✅  .env loaded from {repo_root}")

In [None]:
# --- repo bootstrap ---------------------------------------------------------
from pathlib import Path
from dotenv import load_dotenv
import os, sys

def repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` holding '.env' or '.git'.

    Args:
        start: Path to begin searching from; it is resolved to an absolute path first.

    Returns:
        The first directory, walking upwards from ``start``, that contains
        either a '.env' or a '.git' entry.

    Raises:
        RuntimeError: If no directory up to and including the filesystem root
            contains either marker.
    """
    cur = start.resolve()
    # `parents` includes the filesystem root, so the root itself is tested as well
    for candidate in (cur, *cur.parents):
        if (candidate / ".env").exists() or (candidate / ".git").exists():
            return candidate
    raise RuntimeError("repo root not found")

ROOT = repo_root(Path.cwd())
load_dotenv(ROOT / ".env")             # loads secrets

# optional helpers: expose src/ for imports, without stacking duplicates on re-run
if str(ROOT / "src") not in sys.path:
    sys.path.append(str(ROOT / "src"))

DATA_DIR = ROOT / "data"
OUT_DIR  = ROOT / "outputs"
FIG_DIR  = OUT_DIR / "figs"
# parents=True: if "outputs" does not exist yet, a plain mkdir() of
# outputs/figs would raise FileNotFoundError
FIG_DIR.mkdir(parents=True, exist_ok=True)

print("Repo root:", ROOT)

### Fetch & cache headlines

In [None]:
# ---------------------------------------------------------------
# 1.  Fetch & cache *all* Ukraine‑related headlines for the start_date → end_date window set below
#      – pulls from EVERY English source in NewsAPI, paged day‑by‑day
# ---------------------------------------------------------------
import sys, time, urllib.parse, requests, pandas as pd
from datetime import datetime, timedelta

NEWSAPI_KEY = os.getenv("NEWSAPI_KEY")        # be sure it is set
assert NEWSAPI_KEY, "➡️  set NEWSAPI_KEY env‑var first!"

# date window for this pull (the day loop below treats both ends as inclusive)
start_date = datetime(2025, 5, 27)            # day after last available
end_date   = datetime(2025, 6, 6)            # “now”

# deliberately broad, high-recall query for RU–UA war pieces.
# "ukrainian" is the correct spelling; the earlier misspelling "ukranian" is
# kept as well so the query matches a superset of what it matched before.
base_query = (
    'ukraine OR kyiv OR kiev OR lviv OR odessa OR donbas OR donbass '
    'OR "volodymyr zelensky" OR zelenskyy OR putin OR russia OR drone OR "trojan horse" '
    'OR russian OR ukrainian OR ukranian OR war'
)
# URL-encode once; the encoded text is substituted into DAY_URL's {q} slot
ENC_QUERY = urllib.parse.quote_plus(base_query)

# template for one page of one day: {f}/{t} = from/to dates, {pg} = page number
DAY_URL = (
    'https://newsapi.org/v2/everything?'
    'q={q}&from={f}&to={t}&language=en&sortBy=publishedAt&pageSize=100&page={pg}'
)

def daterange(start, end):
    """Yield ``start``, ``start`` + 1 day, ... up to and including ``end``.

    Nothing is yielded when ``end`` lies more than a whole day before ``start``.
    """
    whole_days = (end - start).days
    offset = 0
    while offset <= whole_days:
        yield start + timedelta(days=offset)
        offset += 1

# Crawl one day at a time, paging through the results of each day, and collect
# one {date, source, title} record per article returned.
records = []
total   = 0
for day in daterange(start_date, end_date):
    d_str = day.strftime('%Y-%m-%d')
    next_str = (day + timedelta(1)).strftime('%Y-%m-%d')

    pg = 1
    while True:
        url  = DAY_URL.format(q=ENC_QUERY, f=d_str, t=next_str, pg=pg)
        try:
            # timeout so a stalled connection cannot hang the whole crawl
            resp = requests.get(url, headers={'X-Api-Key': NEWSAPI_KEY}, timeout=30)
        except requests.RequestException as exc:
            sys.stderr.write(f"\n⚠️  {d_str} page {pg} → request failed: {exc}\n")
            break

        if resp.status_code != 200:
            # an error body is not guaranteed to be JSON, so fall back to raw text
            try:
                msg = resp.json().get('message', '')
            except (ValueError, AttributeError):
                msg = resp.text[:200]
            sys.stderr.write(f"\n⚠️  {d_str} page {pg} → {resp.status_code}: {msg}\n")
            break

        arts = resp.json().get('articles', [])
        if not arts:
            break

        for a in arts:
            src = a.get("source") or {}          # tolerate a missing/null source object
            records.append({
                "date"  : d_str,
                "source": src.get("id") or "unknown",
                "title" : a.get("title")
            })
        total += len(arts)
        if len(arts) < 100:                      # last page
            break
        pg += 1
        time.sleep(0.3)                          # brief pause between pages

    sys.stdout.write(f"\r{d_str} ✓ {total:,} headlines so far"); sys.stdout.flush()

print(f"\nDone! Collected {total:,} headlines.")

# explicit columns keep the CSV header intact even when nothing was collected
df = pd.DataFrame(records, columns=["date", "source", "title"]).drop_duplicates()

# save under <ROOT>/outputs — the location the relevance-filter cell reads from —
# rather than whatever the current working directory happens to be
raw_csv_path = ROOT / "outputs" / "raw_headlines_5_27_to_6_6.csv"
raw_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(raw_csv_path, index=False)
df.head()

### Fast relevance filter (RU–UA headlines ➜ outputs/headlines_ru_ua.csv)

In [None]:
# ------------------------------------------------------------
#  FAST FILTER — split into chunks of ≤ MAX_REQ requests (40 000), show progress,
#                auto-download & save war-only CSV
# ------------------------------------------------------------
from pathlib import Path
import json, time, datetime as dt
import pandas as pd, openai
from tqdm.auto import tqdm

# upper bound on the number of requests written to a single JSONL chunk
MAX_REQ = 40_000

# input / output locations, all under <ROOT>/outputs
RAW_CSV   = ROOT.joinpath("outputs", "raw_headlines_5_27_to_6_6.csv")
OUT_CSV   = ROOT.joinpath("outputs", "headlines_ru_ua.csv")
CHUNK_DIR = ROOT.joinpath("outputs", "filter_chunks")
CHUNK_DIR.mkdir(exist_ok=True)      # the parent "outputs" folder must already exist

# ---------- load full set ------------------------------------
df = pd.read_csv(RAW_CSV, parse_dates=["date"])
print("📄 loaded", len(df), "headlines")

# ---------- build chunked JSONL files ------------------------
# bookkeeping for the chunk writer: finished paths, current part number,
# and how many lines the current part already holds
chunk_paths = []
part_no = 1
line_in_chunk = 0
pbar = tqdm(total=len(df), desc="Writing JSONL chunks", unit="lines")

def new_writer(n: int):
    """Open the JSONL task file for chunk number ``n`` inside CHUNK_DIR for writing.

    Returns a ``(file_handle, path)`` tuple; closing the handle is up to the caller.
    """
    chunk_file = CHUNK_DIR / f"filter_tasks_part{n:02d}.jsonl"
    handle = chunk_file.open("w")
    return handle, chunk_file

# Write one chat-completion request per headline, rolling over to a new JSONL
# file every MAX_REQ lines. A chunk file is opened lazily, only when there is a
# line to put in it, so an empty DataFrame or a row count that is an exact
# multiple of MAX_REQ never leaves a trailing zero-line chunk to be submitted.
chunk_paths, part_no, line_in_chunk = [], 0, 0
writer, cur_path = None, None

try:
    for idx, title in df["title"].items():
        if writer is None or line_in_chunk >= MAX_REQ:
            if writer is not None:
                writer.close()
            part_no += 1
            writer, cur_path = new_writer(part_no)
            chunk_paths.append(cur_path)
            line_in_chunk = 0

        writer.write(json.dumps({
            "custom_id": str(idx),               # row label, used to map answers back to df
            "method":   "POST",
            "url":      "/v1/chat/completions",
            "body": {
                "model": "gpt-4o-mini",
                "temperature": 0,
                "max_tokens": 1,                 # a single token is enough for YES / NO
                "messages": [
                    {"role": "system",
                     "content": "Return YES if the headline is about/related to the Russia-Ukraine war, otherwise NO."},
                    {"role": "user",
                     # missing titles become "", long ones are cut to 500 characters
                     "content": ("" if pd.isna(title) else str(title))[:500]}
                ]
            }
        }) + "\n")
        line_in_chunk += 1
        pbar.update(1)
finally:
    # release the open chunk file and the progress bar even if a write fails
    if writer is not None:
        writer.close()
    pbar.close()

print(f"✅ built {len(chunk_paths)} chunk(s) – each ≤{MAX_REQ:,} requests")

# ---------- launch the batches --------------------------------
# Each chunk is uploaded, submitted as a batch and polled until it finishes
# before the next chunk is sent.
client = openai.OpenAI()
batch_ids = []

# statuses after which a batch will not reach "completed"; polling must stop on
# these instead of sleeping forever
dead_statuses = {"failed", "expired", "cancelling", "cancelled"}

for p in chunk_paths:
    # context manager closes the upload handle once the file has been sent
    with open(p, "rb") as upload:
        fid = client.files.create(file=upload, purpose="batch").id
    batch = client.batches.create(
              input_file_id=fid,
              endpoint="/v1/chat/completions",
              completion_window="24h")
    batch_ids.append(batch.id)

    # live progress for this batch
    bar = tqdm(total=MAX_REQ if p != chunk_paths[-1] else None,
               desc=f"{p.name}  → {batch.id[:8]}",
               unit="req")

    while True:
        b  = client.batches.retrieve(batch.id)
        rc = b.request_counts
        if rc is not None:                        # counts may not be populated yet
            bar.total = rc.total or bar.total     # fill in when known
            bar.n     = rc.completed + rc.failed
            bar.refresh()

        if b.status == "completed":
            bar.close()
            print("✅", b.id, "completed")
            break
        if b.status in dead_statuses:
            bar.close()
            raise RuntimeError(f"Batch {b.id} {b.status} – check dashboard")
        time.sleep(15)                            # poll interval

print("\n🎉 all batches finished:", ", ".join(batch_ids))

# ---------- download, filter & save ---------------------------
# Read each batch's output file, keep the custom_ids whose answer starts with
# "Y", then select those rows of df and write them to OUT_CSV.
yes_ids = set()
n_unusable = 0                       # output lines carrying no usable answer
for bid in batch_ids:
    out_id = client.batches.retrieve(bid).output_file_id
    if out_id is None:
        # fail with a clear message instead of an opaque error from files.content(None)
        raise RuntimeError(f"Batch {bid} has no output file – check dashboard")

    content = client.files.content(out_id).content
    for line in content.splitlines():
        if not line.strip():
            continue
        rec = json.loads(line)
        # a record may lack a response body/choices; skip it rather than crash
        body    = (rec.get("response") or {}).get("body") or {}
        choices = body.get("choices") or []
        if not choices:
            n_unusable += 1
            continue
        answer = (choices[0].get("message") or {}).get("content") or ""
        if answer.strip().upper().startswith("Y"):
            yes_ids.add(int(rec["custom_id"]))

if n_unusable:
    print(f"⚠️  {n_unusable:,} output line(s) had no usable answer and were treated as NO")

# .loc needs a list-like indexer (pandas rejects a set); sorting the ids also
# keeps the kept rows in their original order
df_yes = df.loc[sorted(yes_ids)].reset_index(drop=True)
df_yes.to_csv(OUT_CSV, index=False)
print(f"🎯 kept {len(df_yes):,}/{len(df)} headlines – saved →", OUT_CSV.relative_to(ROOT))

#### Compile Filtered Headlines

In [None]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  Merge 3 filter-batches  →  outputs/headlines_ru_ua.csv      ║
# ╚══════════════════════════════════════════════════════════════╝
from pathlib import Path
import json, pandas as pd, openai, datetime as dt

# ROOT is taken as the parent of the current working directory
ROOT      = Path.cwd().resolve().parents[0]          # adjust if needed
RAW_CSV   = ROOT / "outputs" / "raw_headlines.csv"
OUT_CSV   = ROOT / "outputs" / "headlines_ru_ua.csv"

# ids of the already-submitted filter batches whose outputs are merged here
BATCH_IDS = [
    "batch_6836686d268081909e722fc9116cda78",  # part-01
    "batch_68368e492f708190b81b77cad2380411",  # part-02
    "batch_6836ae30c38c8190833297ae74f6ac27",  # part-03
]

client   = openai.OpenAI()
yes_ids  = set()
n_unusable = 0                       # output lines carrying no usable answer

print("⬇️  downloading outputs & collecting YES ids …")
for bid in BATCH_IDS:
    b = client.batches.retrieve(bid)
    assert b.status == "completed", f"{bid} not completed (status={b.status})"
    if b.output_file_id is None:
        # fail with a clear message instead of an opaque error from files.content(None)
        raise RuntimeError(f"{bid} has no output file – check dashboard")

    out_txt = client.files.content(b.output_file_id).content
    for ln in out_txt.splitlines():
        if not ln.strip():
            continue
        rec = json.loads(ln)
        # a record may lack a response body/choices; skip it rather than crash
        body    = (rec.get("response") or {}).get("body") or {}
        choices = body.get("choices") or []
        if not choices:
            n_unusable += 1
            continue
        answer = (choices[0].get("message") or {}).get("content") or ""
        if answer.strip().upper().startswith("Y"):
            yes_ids.add(int(rec["custom_id"]))

if n_unusable:
    print(f"⚠️  {n_unusable:,} output line(s) had no usable answer and were treated as NO")

# ---------- filter master CSV -----------------------------------
# custom_id is used as a row label of the raw CSV, hence the .loc lookup
df_all = pd.read_csv(RAW_CSV, parse_dates=["date"])
df_yes = df_all.loc[sorted(yes_ids)].reset_index(drop=True)   # ← list, not set
df_yes.to_csv(OUT_CSV, index=False)

# ---------- summary ---------------------------------------------
n_in  = len(df_all)
n_out = len(df_yes)
kept_pct = n_out / n_in * 100 if n_in else 0.0               # avoid 0/0 on an empty CSV
print(f"\n📊 Filter summary  ({dt.datetime.now():%Y-%m-%d %H:%M})")
print(f" • total headlines analysed : {n_in:,}")
print(f" • war-related kept         : {n_out:,}  ({kept_pct:0.1f} %)")
print(f" • irrelevant discarded     : {n_in - n_out:,}")
print(f"✅  Saved → {OUT_CSV.relative_to(ROOT)}")

# ---------- quick peek ------------------------------------------
# sample() raises when asked for more rows than exist, so cap the size at n_out
display(df_yes.sample(min(5, n_out), random_state=1))

### Pull 750 Headlines to Create Gold Set

In [None]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  Draft gold-set sampler – evenly-spaced 750 headlines        ║
# ╚══════════════════════════════════════════════════════════════╝
from pathlib import Path
import pandas as pd
import numpy as np

# ROOT is taken as the parent of the current working directory
ROOT       = Path.cwd().resolve().parents[0]
SRC_CSV    = ROOT / "outputs" / "headlines_ru_ua.csv"
GOLD_CSV   = ROOT / "outputs" / "gold_candidates_750_v2.csv"
TARGET_N   = 750                     # ~one headline every 1½ days

# ---------- load & prep -----------------------------------------
df = pd.read_csv(SRC_CSV, parse_dates=["date"]).sort_values("date")
span_days = (df["date"].iloc[-1] - df["date"].iloc[0]).days
step      = span_days / TARGET_N     # informational only – the sampling below does not use it

# ---------- even-time sampling ----------------------------------
# TARGET_N target dates spread evenly between the first and the last headline date
want_dates = pd.date_range(df["date"].min(),
                           df["date"].max(),
                           periods=TARGET_N).date

# For each target date take the not-yet-picked headline whose date is nearest.
# No maximum distance is enforced: when nearby days are exhausted, the pick
# simply comes from further away.
headline_day = df["date"].dt.date    # loop-invariant, so computed once up front
picked_idx = []                      # picks in the order they were made
already_picked = set()               # same labels, for O(1) membership tests
for d in want_dates:
    nearest_first = (headline_day - d).abs().sort_values().index
    # take the first *unused* headline in that ordering
    for idx in nearest_first:
        if idx not in already_picked:
            already_picked.add(idx)
            picked_idx.append(idx)
            break

gold_df = (df.loc[picked_idx]
             .sort_values("date")
             .reset_index(drop=True))

gold_df.to_csv(GOLD_CSV, index=False)
print(f"✅ wrote {len(gold_df)} rows → {GOLD_CSV.relative_to(ROOT)}")
display(gold_df.head(10))            # quick peek


### Quick headline-density plot (daily count + global average)

In [None]:
# ╔══════════════════════════════════════════════════════════════╗
# ║  Daily headline count  • dashed global average               ║
# ╚══════════════════════════════════════════════════════════════╝
import matplotlib.pyplot as plt
import pandas as pd
from pathlib import Path

# locate the filtered headlines relative to the parent of the working directory
ROOT     = Path.cwd().resolve().parents[0]
SRC_CSV  = ROOT / "outputs" / "headlines_ru_ua.csv"

df = pd.read_csv(SRC_CSV, parse_dates=["date"])

# number of headlines per calendar day (only days with at least one headline appear)
per_day_groups = df.groupby(df["date"].dt.date)
daily = per_day_groups.size().rename("n")

avg_per_day = daily.mean()

# daily counts as a thin line, with the overall mean as a dashed red reference
fig, ax = plt.subplots(figsize=(12,4))
ax.plot(daily.index, daily, lw=1, alpha=.7)
ax.axhline(avg_per_day, ls="--", c="tab:red",
           label=f"global avg = {avg_per_day:0.1f} / day")
ax.set_title("Number of war-related headlines per day")
ax.set_ylabel("Count")
ax.legend()
fig.tight_layout()
plt.show()

# text summary of the same series
print(f"⏱️  date span : {daily.index.min()} → {daily.index.max()}")
print(f"📊  mean      : {avg_per_day:0.2f}  headlines / day")
print(f"🔢  total     : {daily.sum():,}")
