In [None]:
# --- repo bootstrap ---------------------------------------------------------
from pathlib import Path
from dotenv import load_dotenv
import os, sys

def repo_root(start: Path) -> Path:
    """Return the nearest directory at or above ``start`` that marks the repo root.

    A directory qualifies when it contains a ``.env`` or a ``.git`` entry.

    Args:
        start: Directory the upward search begins from (resolved first).

    Returns:
        The first qualifying directory, as a resolved path.

    Raises:
        RuntimeError: If no directory up to and including the filesystem
            root contains one of the markers.
    """
    resolved = start.resolve()
    # ``parents`` ends with the filesystem root, so the root itself is also
    # tested (the previous while-loop stopped one level short of it).
    for candidate in (resolved, *resolved.parents):
        if (candidate / ".env").exists() or (candidate / ".git").exists():
            return candidate
    raise RuntimeError("repo root not found")

ROOT = repo_root(Path.cwd())
load_dotenv(ROOT / ".env")             # loads secrets

# Optional helpers live under src/. Guard the append so re-running this cell
# in the same kernel does not pile up duplicate sys.path entries.
if str(ROOT / "src") not in sys.path:
    sys.path.append(str(ROOT / "src"))

DATA_DIR = ROOT / "data"
OUT_DIR  = ROOT / "outputs"
FIG_DIR  = OUT_DIR / "figs"
# parents=True: outputs/ may not exist yet, and a plain mkdir() would raise
# FileNotFoundError for the nested figs/ directory in that case.
FIG_DIR.mkdir(parents=True, exist_ok=True)

print("Repo root:", ROOT)

In [None]:
# ── GOLD-SET CANDIDATES  ──────────────────────────────────────────────
# Draw a fixed-seed random sample of raw Telegram rows and write it to CSV.
import pandas as pd, numpy as np
from pathlib import Path

ROOT   = Path.cwd().parents[0]        # adjust if you are already at repo root
RAW_TG = ROOT / "outputs" / "telegram_full_20250605_213258.csv"
OUT    = ROOT / "outputs" / "telegram_gold_candidates_300.csv"

N_GOLD = 300  # target number of candidate rows

df = pd.read_csv(RAW_TG)

# DataFrame.sample raises ValueError when n exceeds the row count (sampling
# without replacement), so clamp to what is actually available.
n_sample = min(N_GOLD, len(df))

# freeze RNG for reproducibility
sample_df = df.sample(n=n_sample, random_state=42).reset_index(drop=True)
sample_df.to_csv(OUT, index=False)

# Report the real sample size rather than a hard-coded 300.
print(f"✅ wrote {len(sample_df)}-row sample → {OUT.relative_to(ROOT)}")
sample_df.head()


### Score Telegram

In [None]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  SCORE Telegram Messages - Anthropic Batch API                        ║
# ╚══════════════════════════════════════════════════════════════════════╝
from pathlib import Path
import json, time, pandas as pd, tqdm, re
import anthropic
from dotenv import load_dotenv
import os
import requests
import logging
from datetime import datetime
import numpy as np

load_dotenv()

# Configuration
# When the working directory is the project folder itself, use it as ROOT;
# otherwise use the parent of the (resolved) working directory.
if Path.cwd().name == 'ukraine-final-project':
    ROOT = Path.cwd()
else:
    ROOT = Path.cwd().resolve().parents[0]
TELEGRAM_CSV = ROOT.joinpath("outputs", "telegram_full_20250605_213258.csv")
OUT_DIR = ROOT.joinpath("outputs", "telegram_scoring")
OUT_DIR.mkdir(exist_ok=True)

# Model selection. Alternatives kept for reference:
#   "claude-3-5-sonnet-20241022"  - fast and good for large batches
#   "claude-3-5-haiku-20241022"   - even faster, still good
MODEL = "claude-sonnet-4-20250514"   # latest

# Batch configuration
BATCH_SIZE = 15_000  # rows submitted per batch
MAX_RETRIES = 3

# ═══════════════════════════════════════════════════════════════════════
# SCORING RUBRIC - Multi-dimensional like Truth Social
# ═══════════════════════════════════════════════════════════════════════

TELEGRAM_PROMPT = """You are analyzing Telegram messages about the Russia-Ukraine war.
Score each message on FOUR dimensions:

1. Escalation (0-10): How militarily escalatory is the message?
2. Blame (-1, 0, 1): Who does the message blame?
   - 0 = Blames Ukraine/NATO/West
   - 1 = Blames Russia/Putin  
   - -1 = No clear blame/neutral reporting
3. Propaganda (0-3): Level of propaganda/manipulation
   - 0 = Factual reporting
   - 1 = Mild spin/framing
   - 2 = Clear propaganda
   - 3 = Extreme propaganda/disinformation
4. Call-to-Action (0, 1): Does it urge concrete action?
   - 1 = Yes (donate, volunteer, protest, etc.)
   - 0 = No explicit call

Escalation scale reference:
0: Humanitarian/diplomatic content
1: Peace negotiation focus  
2: Political warfare/disinformation
3: Routine warfare updates
4: Major aid package discussions
5: Major weapons systems
6: Advanced weapons escalation
7: Nation-wide strikes
8: Strategic infrastructure attacks
9: Nuclear facility/weapons threats
10: Direct nuclear war rhetoric

---

# Real message examples (message → E,B,P,C):

- "Гуманитарная помощь доставлена в освобожденные районы"                                → 0,-1,0,0
- "Zelensky's advisor confirms Ukraine ready for talks: "We're not against negotiations, but they must be based on international law and Ukraine's sovereignty."" → 1,1,0,0
- "🇺🇸🇺🇦The United States will be transferring Patriot air defense systems to Ukraine, — Kirby
The White House confirmed that Patriots are included in the aid package that Biden will announce." → 5,1,0,0
- "🇬🇧🇺🇦 Great Britain secretly supplied Ukraine with Storm Shadow cruise missiles, writes CNN.
According to the publication, the deliveries took place a few weeks ago. The UK has asked Ukraine not to use missiles to attack facilities on the territory of the Russian Federation.
It is noted that Storm Shadow has a range of more than 250 km." → 6,1,1,0
- "Biden pushing us to WW3, must negotiate NOW!"                                         → 8,0,2,1  
- "Российские войска нанесли удар по складу ВСУ в Харькове"                             → 3,1,1,0
- "⚡️BREAKING: Nuclear plant under attack! NATO must act!"                               → 9,0,3,1
- "Сегодня ВСУ обстреляли жилые кварталы Донецка. 5 погибших."                          → 4,0,2,0
- "Отчет с передовой: позиции удерживаются, враг несет потери"                          → 3,-1,1,0
- "DONATE NOW to help Ukrainian defenders! Every dollar saves lives!"                     → 2,1,1,1
- "Путин угрожает ядерным оружием если НАТО вмешается"                                  → 10,1,2,0
- "💥💥💥 Explosions all over Ukraine

Explosions in:
Vinnytsia;
The Vinnitsa region;
Dnipro;
Zaporizhia;
Kirovograd;
Kharkov;
Odessa;
N.....

Air raid throughout Ukraine." → 7,0,1,0

---

CRITICAL: Respond ONLY with four integers in format E,B,P,C
No spaces, no explanations - just four numbers with three commas.
Example: 5,0,2,1"""

# ═══════════════════════════════════════════════════════════════════════
# LOAD AND PREPARE DATA
# ═══════════════════════════════════════════════════════════════════════

print("📊 Loading Telegram data...")
df = pd.read_csv(TELEGRAM_CSV)
print(f"✅ Loaded {len(df):,} messages")

# Keep only rows with usable text: drop missing values first, then
# strings that are empty after stripping whitespace.
df = df.loc[df['message_text'].notna()].copy()
df = df.loc[df['message_text'].str.strip().ne('')].copy()
print(f"✅ {len(df):,} messages with valid text")

# Positional id over the filtered rows; the final combine step sorts on it.
df['global_idx'] = range(len(df))

# Per-category row counts with their share of the filtered data
print("\n📈 Message distribution by category:")
category_counts = df['channel_category'].value_counts()
for cat, count in category_counts.items():
    print(f"   {cat}: {count:,} ({count/len(df)*100:.1f}%)")

# ═══════════════════════════════════════════════════════════════════════
# BATCH PROCESSING FUNCTIONS
# ═══════════════════════════════════════════════════════════════════════

def create_batches(df, batch_size=None):
    """Split ``df`` into consecutive chunks of at most ``batch_size`` rows.

    Args:
        df: DataFrame to split; rows are taken positionally in current order.
        batch_size: Maximum rows per chunk. ``None`` (the default) uses the
            module-level ``BATCH_SIZE`` as it is when the function is called,
            so editing the config cell takes effect without redefining this
            function.

    Returns:
        A list of DataFrame copies. Each copy gains ``batch_num`` (1-based
        chunk number) and ``batch_idx`` (0-based row position inside the
        chunk). An empty ``df`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    batches = []
    for batch_num, start_idx in enumerate(range(0, len(df), batch_size), start=1):
        # iloc clamps the upper bound, so the last chunk may be shorter.
        batch_df = df.iloc[start_idx:start_idx + batch_size].copy()
        batch_df['batch_num'] = batch_num
        batch_df['batch_idx'] = range(len(batch_df))
        batches.append(batch_df)

    return batches

def prepare_batch_requests(batch_df, model=None, system_prompt=None):
    """Build one batch request dict per row of ``batch_df``.

    Args:
        batch_df: DataFrame with ``batch_idx``, ``message_text``,
            ``channel_username`` and ``channel_category`` columns.
        model: Model name placed in every request. ``None`` (default) uses
            the module-level ``MODEL``.
        system_prompt: System prompt placed in every request. ``None``
            (default) uses the module-level ``TELEGRAM_PROMPT``.

    Returns:
        A list of dicts, in row order, each with a ``custom_id`` (the row's
        ``batch_idx`` as a string) and a ``params`` dict holding the model,
        token limit, temperature, system prompt and a single user message.
    """
    if model is None:
        model = MODEL
    if system_prompt is None:
        system_prompt = TELEGRAM_PROMPT

    max_chars = 1500  # truncate very long messages

    # Iterating the columns directly keeps batch_idx an integer; iterrows()
    # builds a Series per row and may upcast it.
    rows = zip(
        batch_df['batch_idx'],
        batch_df['message_text'],
        batch_df['channel_username'],
        batch_df['channel_category'],
    )

    requests_list = []
    for batch_idx, message_text, username, category in rows:
        text = str(message_text)[:max_chars]

        # Channel header and message body separated by a real newline. The
        # previous "\\n" inside the f-string produced a literal backslash
        # followed by "n" in the text sent to the model.
        context = f"[Channel: {username} ({category})]\n{text}"

        requests_list.append({
            "custom_id": str(batch_idx),
            "params": {
                "model": model,
                "max_tokens": 15,  # Just need E,B,P,C
                "temperature": 0,
                "system": system_prompt,
                "messages": [
                    {"role": "user", "content": context}
                ]
            }
        })

    return requests_list

def process_batch_results(batch_df, results_data):
    """Parse batch result lines and attach the four scores to ``batch_df``.

    Args:
        batch_df: DataFrame with a ``batch_idx`` column whose values match the
            integer ``custom_id`` of each result. It is modified in place:
            ``escalation_score``, ``blame_direction``, ``propaganda_level``
            and ``has_cta`` columns (nullable ``Int64``) are added.
        results_data: Iterable of JSON strings, one result object per entry.
            Empty entries are skipped.

    Returns:
        ``(batch_df, n_scored, parse_errors)`` where ``n_scored`` is the number
        of rows that received scores and ``parse_errors`` is a list of
        human-readable messages for failed, unparseable or out-of-range
        results.

    A result is accepted only when all four values are inside their allowed
    ranges; otherwise the whole row is left unscored and an error is recorded,
    so rows are never partially scored.
    """
    score_columns = {
        "escalation": "escalation_score",
        "blame": "blame_direction",
        "propaganda": "propaganda_level",
        "cta": "has_cta",
    }
    scores = {key: {} for key in score_columns}
    parse_errors = []

    # E,B,P,C at the start of the reply; whitespace around commas tolerated.
    pattern = re.compile(r'^(\d+)\s*,\s*(-?\d+)\s*,\s*(\d+)\s*,\s*(\d+)')

    for line in results_data:
        if not line:
            continue

        custom_id = None
        try:
            result = json.loads(line)
            custom_id = result.get("custom_id")

            if custom_id is None:
                continue

            idx = int(custom_id)

            # Check if request succeeded
            if result.get("result", {}).get("type") != "succeeded":
                parse_errors.append(f"Request {custom_id} failed")
                continue

            # Extract response
            message_content = result["result"]["message"]["content"][0]["text"].strip()

            match = pattern.match(message_content)
            if not match:
                parse_errors.append(f"Parse error for {custom_id}: {message_content}")
                continue

            escalation, blame, propaganda, cta = (int(g) for g in match.groups())

            in_range = (
                0 <= escalation <= 10
                and blame in (-1, 0, 1)
                and 0 <= propaganda <= 3
                and cta in (0, 1)
            )
            if not in_range:
                parse_errors.append(
                    f"Out-of-range scores for {custom_id}: {message_content}"
                )
                continue

            scores["escalation"][idx] = escalation
            scores["blame"][idx] = blame
            scores["propaganda"][idx] = propaganda
            scores["cta"][idx] = cta

        except Exception as exc:
            # Malformed JSON, unexpected payload shape, non-integer custom_id...
            parse_errors.append(f"Error for {custom_id}: {exc}")

    # Map scores to dataframe; rows without a score become <NA>.
    for key, column in score_columns.items():
        batch_df[column] = batch_df["batch_idx"].map(scores[key]).astype("Int64")

    return batch_df, len(scores["escalation"]), parse_errors

# ═══════════════════════════════════════════════════════════════════════
# MAIN PROCESSING LOOP
# ═══════════════════════════════════════════════════════════════════════

# Initialize Anthropic client
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Quiet logging
for name in ("httpx", "anthropic"):
    logging.getLogger(name).setLevel(logging.WARNING)

# Create batches
batches = create_batches(df)
print(f"\n🔄 Created {len(batches)} batches")

all_results = []      # scored DataFrames, one per batch whose results were retrieved
total_processed = 0   # rows that received scores, summed over batches
total_errors = 0      # error messages reported by process_batch_results

# Process each batch: submit -> poll until ended -> download results -> parse -> save.
for batch_num, batch_df in enumerate(batches, 1):
    print(f"\n{'='*60}")
    print(f"📦 Processing Batch {batch_num}/{len(batches)}")
    print(f"   Messages: {len(batch_df):,}")
    print(f"   Categories: {batch_df['channel_category'].value_counts().to_dict()}")

    # Prepare requests
    requests_list = prepare_batch_requests(batch_df)
    print(f"   Prepared {len(requests_list)} requests")

    bar = None  # created after submission; closed in `finally` even on errors
    try:
        # Create Anthropic batch
        batch = client.messages.batches.create(requests=requests_list)
        print(f"   🚀 Launched batch {batch.id}")

        # Monitor progress
        bar = tqdm.tqdm(total=len(requests_list), desc=f"Batch {batch_num}", unit="msg")
        start_time = time.time()

        while True:
            batch_status = client.messages.batches.retrieve(batch.id)
            completed = (batch_status.request_counts.succeeded +
                        batch_status.request_counts.errored +
                        batch_status.request_counts.canceled +
                        batch_status.request_counts.expired)
            bar.n = completed
            bar.refresh()

            if batch_status.processing_status == "ended":
                break

            time.sleep(5)

        bar.close()
        elapsed_time = time.time() - start_time
        print(f"   ✅ Batch complete in {elapsed_time/60:.1f} minutes")

        # Retrieve results
        batch_final = client.messages.batches.retrieve(batch.id)

        if not batch_final.results_url:
            # Previously this case was skipped without any message.
            print(f"   ❌ Batch {batch.id} ended without a results URL; nothing saved")
        else:
            headers = {
                "x-api-key": os.getenv("ANTHROPIC_API_KEY"),
                "anthropic-version": "2023-06-01"
            }

            # timeout=(connect, read) keeps a stalled download from hanging the
            # notebook forever; the context manager releases the connection.
            with requests.get(batch_final.results_url, headers=headers,
                              stream=True, timeout=(10, 300)) as response:
                status_code = response.status_code
                if status_code == 200:
                    results_data = [line.decode('utf-8') for line in response.iter_lines()]
                else:
                    results_data = None

            if results_data is None:
                print(f"   ❌ Error fetching results: HTTP {status_code}")
            else:
                # Process results
                batch_df_scored, n_success, errors = process_batch_results(batch_df, results_data)

                print(f"   ✅ Scored {n_success:,}/{len(batch_df):,} messages")
                if errors:
                    print(f"   ⚠️  {len(errors)} parse errors")
                    total_errors += len(errors)

                total_processed += n_success
                all_results.append(batch_df_scored)

                # Save intermediate results
                intermediate_file = OUT_DIR / f"batch_{batch_num}_scored.csv"
                batch_df_scored.to_csv(intermediate_file, index=False)
                print(f"   💾 Saved to {intermediate_file.name}")

    except Exception as e:
        # Best effort: report the failure and move on to the next batch.
        print(f"   ❌ Batch processing error: {str(e)}")
        continue
    finally:
        if bar is not None:
            bar.close()

    # Break between batches
    if batch_num < len(batches):
        print(f"\n⏸️  Waiting 30 seconds before next batch...")
        time.sleep(30)

# ═══════════════════════════════════════════════════════════════════════
# COMBINE RESULTS AND SAVE
# ═══════════════════════════════════════════════════════════════════════

def _as_float(value):
    """Return ``value`` as a float, mapping missing values (NaN / pd.NA) to NaN.

    Statistics on nullable-integer columns can come back as ``pd.NA`` (for
    example a standard deviation over a single row), and ``pd.NA`` does not
    accept float format specs such as ``:.2f``.
    """
    return float("nan") if pd.isna(value) else float(value)


print(f"\n{'='*60}")
print("📊 Combining all results...")

# Combine all batches
if all_results:
    final_df = pd.concat(all_results, ignore_index=True)

    # Sort by original order
    final_df = final_df.sort_values('global_idx')

    # Drop processing columns
    columns_to_drop = ['batch_num', 'batch_idx', 'global_idx']
    final_df = final_df.drop(columns=[col for col in columns_to_drop if col in final_df.columns])

    # Save final results (written before any statistics are printed)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_output = OUT_DIR / f"telegram_scored_{MODEL}_{timestamp}.csv"
    final_df.to_csv(final_output, index=False)

    print(f"\n✅ SCORING COMPLETE!")
    print(f"   Total messages: {len(df):,}")
    print(f"   Successfully scored: {total_processed:,}")
    print(f"   Failed: {len(df) - total_processed:,}")
    print(f"   Success rate: {total_processed/len(df)*100:.1f}%")
    print(f"   Total errors: {total_errors}")

    # Calculate statistics by category
    print("\n📊 SCORE DISTRIBUTIONS BY CATEGORY:")
    print("-" * 50)

    for category in final_df['channel_category'].unique():
        cat_df = final_df[final_df['channel_category'] == category]
        n_scored = cat_df['escalation_score'].notna().sum()

        if n_scored > 0:
            print(f"\n{category} ({n_scored:,} messages):")

            # Escalation
            esc_mean = _as_float(cat_df['escalation_score'].mean())
            esc_std = _as_float(cat_df['escalation_score'].std())
            print(f"   Escalation: {esc_mean:.2f} ± {esc_std:.2f}")

            # Blame
            blame_counts = cat_df['blame_direction'].value_counts().sort_index()
            blame_dict = {-1: "Neutral", 0: "Blames West", 1: "Blames Russia"}
            print("   Blame direction:")
            for val, count in blame_counts.items():
                if pd.notna(val):
                    label = blame_dict.get(int(val), val)
                    pct = count / n_scored * 100
                    print(f"      {label}: {count:,} ({pct:.1f}%)")

            # Propaganda
            prop_mean = _as_float(cat_df['propaganda_level'].mean())
            print(f"   Propaganda level: {prop_mean:.2f}/3.0")

            # CTA
            cta_pct = cat_df['has_cta'].sum() / n_scored * 100
            print(f"   Has call-to-action: {cta_pct:.1f}%")

    # Overall statistics
    print(f"\n📊 OVERALL STATISTICS:")
    print("-" * 50)
    overall_esc = _as_float(final_df['escalation_score'].mean())
    print(f"Average escalation: {overall_esc:.2f}")

    # Compare pro-Russian vs pro-Ukrainian
    pro_ru = final_df[final_df['channel_category'] == 'pro_russian_grassroots']['escalation_score'].mean()
    pro_ua = final_df[final_df['channel_category'] == 'pro_ukrainian_grassroots']['escalation_score'].mean()

    if pd.notna(pro_ru) and pd.notna(pro_ua):
        print(f"\nPro-Russian avg escalation: {pro_ru:.2f}")
        print(f"Pro-Ukrainian avg escalation: {pro_ua:.2f}")
        print(f"Difference: {abs(pro_ru - pro_ua):.2f}")

    print(f"\n📁 Final results saved to: {final_output}")

else:
    print("❌ No results to combine!")

print("\n✨ Ready for analysis!")

### Detect Completed Batches + Resume Telegram Batch Scoring

In [None]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  SCORE Telegram Messages - Anthropic Batch API (RESUME VERSION)       ║
# ╚══════════════════════════════════════════════════════════════════════╝
from pathlib import Path
import json, time, pandas as pd, tqdm, re
import anthropic
from dotenv import load_dotenv
import os
import requests
import logging
from datetime import datetime
import numpy as np

load_dotenv()

# Configuration
# When the working directory is the project folder itself, use it as ROOT;
# otherwise use the parent of the (resolved) working directory.
if Path.cwd().name == 'ukraine-final-project':
    ROOT = Path.cwd()
else:
    ROOT = Path.cwd().resolve().parents[0]
TELEGRAM_CSV = ROOT.joinpath("outputs", "telegram_full_20250605_213258.csv")
OUT_DIR = ROOT.joinpath("outputs", "telegram_scoring")
OUT_DIR.mkdir(exist_ok=True)

# Model selection
MODEL = "claude-sonnet-4-20250514"

# Batch configuration
BATCH_SIZE = 15_000
MAX_RETRIES = 3

# ═══════════════════════════════════════════════════════════════════════
# CHECK FOR EXISTING PROGRESS
# ═══════════════════════════════════════════════════════════════════════

# Collect the unbroken run batch_1, batch_2, ... of saved result files.
# The scan stops at the first missing number and checks at most 99 files.
print("🔍 Checking for existing progress...")
existing_batches = []
for i in range(1, 100):
    batch_file = OUT_DIR / f"batch_{i}_scored.csv"
    if not batch_file.exists():
        break
    existing_batches.append(i)
    print(f"   ✅ Found completed batch {i}")

if not existing_batches:
    start_from_batch = 1
    print("   Starting fresh from batch 1")
else:
    print(f"\n📂 Found {len(existing_batches)} completed batches")
    # The list is built in ascending order, so its last entry is the maximum.
    start_from_batch = existing_batches[-1] + 1
    print(f"   Will resume from batch {start_from_batch}")

# ═══════════════════════════════════════════════════════════════════════
# SCORING RUBRIC - Same as before
# ═══════════════════════════════════════════════════════════════════════

TELEGRAM_PROMPT = """You are analyzing Telegram messages about the Russia-Ukraine war.
Score each message on FOUR dimensions:

1. Escalation (0-10): How militarily escalatory is the message?
2. Blame (-1, 0, 1): Who does the message blame?
   - 0 = Blames Ukraine/NATO/West
   - 1 = Blames Russia/Putin  
   - -1 = No clear blame/neutral reporting
3. Propaganda (0-3): Level of propaganda/manipulation
   - 0 = Factual reporting
   - 1 = Mild spin/framing
   - 2 = Clear propaganda
   - 3 = Extreme propaganda/disinformation
4. Call-to-Action (0, 1): Does it urge concrete action?
   - 1 = Yes (donate, volunteer, protest, etc.)
   - 0 = No explicit call

Escalation scale reference:
0: Humanitarian/diplomatic content
1: Peace negotiation focus  
2: Political warfare/disinformation
3: Routine warfare updates
4: Major aid package discussions
5: Major weapons systems
6: Advanced weapons escalation
7: Nation-wide strikes
8: Strategic infrastructure attacks
9: Nuclear facility/weapons threats
10: Direct nuclear war rhetoric

---

# Real message examples (message → E,B,P,C):

- "Гуманитарная помощь доставлена в освобожденные районы"                                → 0,-1,0,0
- "Zelensky's advisor confirms Ukraine ready for talks: "We're not against negotiations, but they must be based on international law and Ukraine's sovereignty."" → 1,1,0,0
- "🇺🇸🇺🇦The United States will be transferring Patriot air defense systems to Ukraine, — Kirby
The White House confirmed that Patriots are included in the aid package that Biden will announce." → 5,1,0,0
- "🇬🇧🇺🇦 Great Britain secretly supplied Ukraine with Storm Shadow cruise missiles, writes CNN.
According to the publication, the deliveries took place a few weeks ago. The UK has asked Ukraine not to use missiles to attack facilities on the territory of the Russian Federation.
It is noted that Storm Shadow has a range of more than 250 km." → 6,1,1,0
- "Biden pushing us to WW3, must negotiate NOW!"                                         → 8,0,2,1  
- "Российские войска нанесли удар по складу ВСУ в Харькове"                             → 3,1,1,0
- "⚡️BREAKING: Nuclear plant under attack! NATO must act!"                               → 9,0,3,1
- "Сегодня ВСУ обстреляли жилые кварталы Донецка. 5 погибших."                          → 4,0,2,0
- "Отчет с передовой: позиции удерживаются, враг несет потери"                          → 3,-1,1,0
- "DONATE NOW to help Ukrainian defenders! Every dollar saves lives!"                     → 2,1,1,1
- "Путин угрожает ядерным оружием если НАТО вмешается"                                  → 10,1,2,0
- "💥💥💥 Explosions all over Ukraine

Explosions in:
Vinnytsia;
The Vinnitsa region;
Dnipro;
Zaporizhia;
Kirovograd;
Kharkov;
Odessa;
N.....

Air raid throughout Ukraine." → 7,0,1,0

---

CRITICAL: Respond ONLY with four integers in format E,B,P,C
No spaces, no explanations - just four numbers with three commas.
Example: 5,0,2,1"""

# ═══════════════════════════════════════════════════════════════════════
# LOAD AND PREPARE DATA
# ═══════════════════════════════════════════════════════════════════════

print("\n📊 Loading Telegram data...")
df = pd.read_csv(TELEGRAM_CSV)
print(f"✅ Loaded {len(df):,} messages")

# Keep only rows with usable text: drop missing values first, then
# strings that are empty after stripping whitespace.
df = df.loc[df['message_text'].notna()].copy()
df = df.loc[df['message_text'].str.strip().ne('')].copy()
print(f"✅ {len(df):,} messages with valid text")

# Positional id over the filtered rows; the final combine step sorts on it.
df['global_idx'] = range(len(df))

# Per-category row counts with their share of the filtered data
print("\n📈 Message distribution by category:")
category_counts = df['channel_category'].value_counts()
for cat, count in category_counts.items():
    print(f"   {cat}: {count:,} ({count/len(df)*100:.1f}%)")

# ═══════════════════════════════════════════════════════════════════════
# BATCH PROCESSING FUNCTIONS (same as before)
# ═══════════════════════════════════════════════════════════════════════

def create_batches(df, batch_size=None):
    """Split ``df`` into consecutive chunks of at most ``batch_size`` rows.

    Args:
        df: DataFrame to split; rows are taken positionally in current order.
        batch_size: Maximum rows per chunk. ``None`` (the default) uses the
            module-level ``BATCH_SIZE`` as it is when the function is called,
            so editing the config cell takes effect without redefining this
            function.

    Returns:
        A list of DataFrame copies. Each copy gains ``batch_num`` (1-based
        chunk number) and ``batch_idx`` (0-based row position inside the
        chunk). An empty ``df`` yields an empty list.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size!r}")

    batches = []
    for batch_num, start_idx in enumerate(range(0, len(df), batch_size), start=1):
        # iloc clamps the upper bound, so the last chunk may be shorter.
        batch_df = df.iloc[start_idx:start_idx + batch_size].copy()
        batch_df['batch_num'] = batch_num
        batch_df['batch_idx'] = range(len(batch_df))
        batches.append(batch_df)

    return batches

def prepare_batch_requests(batch_df, model=None, system_prompt=None):
    """Build one batch request dict per row of ``batch_df``.

    Args:
        batch_df: DataFrame with ``batch_idx``, ``message_text``,
            ``channel_username`` and ``channel_category`` columns.
        model: Model name placed in every request. ``None`` (default) uses
            the module-level ``MODEL``.
        system_prompt: System prompt placed in every request. ``None``
            (default) uses the module-level ``TELEGRAM_PROMPT``.

    Returns:
        A list of dicts, in row order, each with a ``custom_id`` (the row's
        ``batch_idx`` as a string) and a ``params`` dict holding the model,
        token limit, temperature, system prompt and a single user message.
    """
    if model is None:
        model = MODEL
    if system_prompt is None:
        system_prompt = TELEGRAM_PROMPT

    max_chars = 1500  # truncate very long messages

    # Iterating the columns directly keeps batch_idx an integer; iterrows()
    # builds a Series per row and may upcast it.
    rows = zip(
        batch_df['batch_idx'],
        batch_df['message_text'],
        batch_df['channel_username'],
        batch_df['channel_category'],
    )

    requests_list = []
    for batch_idx, message_text, username, category in rows:
        text = str(message_text)[:max_chars]

        # Channel header and message body separated by a real newline. The
        # previous "\\n" inside the f-string produced a literal backslash
        # followed by "n" in the text sent to the model.
        context = f"[Channel: {username} ({category})]\n{text}"

        requests_list.append({
            "custom_id": str(batch_idx),
            "params": {
                "model": model,
                "max_tokens": 15,
                "temperature": 0,
                "system": system_prompt,
                "messages": [
                    {"role": "user", "content": context}
                ]
            }
        })

    return requests_list

def process_batch_results(batch_df, results_data):
    """Parse batch result lines and attach the four scores to ``batch_df``.

    Args:
        batch_df: DataFrame with a ``batch_idx`` column whose values match the
            integer ``custom_id`` of each result. It is modified in place:
            ``escalation_score``, ``blame_direction``, ``propaganda_level``
            and ``has_cta`` columns (nullable ``Int64``) are added.
        results_data: Iterable of JSON strings, one result object per entry.
            Empty entries are skipped.

    Returns:
        ``(batch_df, n_scored, parse_errors)`` where ``n_scored`` is the number
        of rows that received scores and ``parse_errors`` is a list of
        human-readable messages for failed, unparseable or out-of-range
        results.

    A result is accepted only when all four values are inside their allowed
    ranges; otherwise the whole row is left unscored and an error is recorded,
    so rows are never partially scored.
    """
    score_columns = {
        "escalation": "escalation_score",
        "blame": "blame_direction",
        "propaganda": "propaganda_level",
        "cta": "has_cta",
    }
    scores = {key: {} for key in score_columns}
    parse_errors = []

    # E,B,P,C at the start of the reply; whitespace around commas tolerated.
    pattern = re.compile(r'^(\d+)\s*,\s*(-?\d+)\s*,\s*(\d+)\s*,\s*(\d+)')

    for line in results_data:
        if not line:
            continue

        custom_id = None
        try:
            result = json.loads(line)
            custom_id = result.get("custom_id")

            if custom_id is None:
                continue

            idx = int(custom_id)

            # Check if request succeeded
            if result.get("result", {}).get("type") != "succeeded":
                parse_errors.append(f"Request {custom_id} failed")
                continue

            # Extract response
            message_content = result["result"]["message"]["content"][0]["text"].strip()

            match = pattern.match(message_content)
            if not match:
                parse_errors.append(f"Parse error for {custom_id}: {message_content}")
                continue

            escalation, blame, propaganda, cta = (int(g) for g in match.groups())

            in_range = (
                0 <= escalation <= 10
                and blame in (-1, 0, 1)
                and 0 <= propaganda <= 3
                and cta in (0, 1)
            )
            if not in_range:
                parse_errors.append(
                    f"Out-of-range scores for {custom_id}: {message_content}"
                )
                continue

            scores["escalation"][idx] = escalation
            scores["blame"][idx] = blame
            scores["propaganda"][idx] = propaganda
            scores["cta"][idx] = cta

        except Exception as exc:
            # Malformed JSON, unexpected payload shape, non-integer custom_id...
            parse_errors.append(f"Error for {custom_id}: {exc}")

    # Map scores to dataframe; rows without a score become <NA>.
    for key, column in score_columns.items():
        batch_df[column] = batch_df["batch_idx"].map(scores[key]).astype("Int64")

    return batch_df, len(scores["escalation"]), parse_errors

# ═══════════════════════════════════════════════════════════════════════
# MAIN PROCESSING LOOP (MODIFIED FOR RESUME)
# ═══════════════════════════════════════════════════════════════════════

# Initialize Anthropic client
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Quiet logging
for name in ("httpx", "anthropic"):
    logging.getLogger(name).setLevel(logging.WARNING)

# Create batches
batches = create_batches(df)
print(f"\n🔄 Created {len(batches)} batches total")

# Load already processed results
all_results = []      # scored DataFrames: reloaded from disk plus newly processed
total_processed = 0   # rows with an escalation score, summed over batches
total_errors = 0      # error messages reported for batches processed in this run

# Load existing batch results
if existing_batches:
    print("\n📥 Loading existing batch results...")
    for batch_num in existing_batches:
        batch_file = OUT_DIR / f"batch_{batch_num}_scored.csv"
        batch_df = pd.read_csv(batch_file)
        all_results.append(batch_df)
        n_scored = batch_df['escalation_score'].notna().sum()
        total_processed += n_scored
        print(f"   Loaded batch {batch_num}: {n_scored:,} scored messages")

print(f"\n🚀 Resuming from batch {start_from_batch}/{len(batches)}")

# Process remaining batches: submit -> poll until ended -> download -> parse -> save.
for batch_num, batch_df in enumerate(batches, 1):
    # Skip already processed batches (loaded from disk above)
    if batch_num < start_from_batch:
        continue

    # A result file can also exist past the first gap, when an earlier run
    # failed on one batch but completed later ones. Reuse it instead of
    # submitting (and paying for) the same batch again.
    intermediate_file = OUT_DIR / f"batch_{batch_num}_scored.csv"
    if intermediate_file.exists():
        batch_df_scored = pd.read_csv(intermediate_file)
        n_scored = batch_df_scored['escalation_score'].notna().sum()
        total_processed += n_scored
        all_results.append(batch_df_scored)
        print(f"   Loaded batch {batch_num}: {n_scored:,} scored messages")
        continue

    print(f"\n{'='*60}")
    print(f"📦 Processing Batch {batch_num}/{len(batches)}")
    print(f"   Messages: {len(batch_df):,}")
    print(f"   Categories: {batch_df['channel_category'].value_counts().to_dict()}")

    # Prepare requests
    requests_list = prepare_batch_requests(batch_df)
    print(f"   Prepared {len(requests_list)} requests")

    bar = None  # created after submission; closed in `finally` even on errors
    try:
        # Create Anthropic batch
        batch = client.messages.batches.create(requests=requests_list)
        print(f"   🚀 Launched batch {batch.id}")

        # Monitor progress
        bar = tqdm.tqdm(total=len(requests_list), desc=f"Batch {batch_num}", unit="msg")
        start_time = time.time()

        while True:
            batch_status = client.messages.batches.retrieve(batch.id)
            completed = (batch_status.request_counts.succeeded +
                        batch_status.request_counts.errored +
                        batch_status.request_counts.canceled +
                        batch_status.request_counts.expired)
            bar.n = completed
            bar.refresh()

            if batch_status.processing_status == "ended":
                break

            time.sleep(5)

        bar.close()
        elapsed_time = time.time() - start_time
        print(f"   ✅ Batch complete in {elapsed_time/60:.1f} minutes")

        # Retrieve results
        batch_final = client.messages.batches.retrieve(batch.id)

        if not batch_final.results_url:
            # Previously this case was skipped without any message.
            print(f"   ❌ Batch {batch.id} ended without a results URL; nothing saved")
        else:
            headers = {
                "x-api-key": os.getenv("ANTHROPIC_API_KEY"),
                "anthropic-version": "2023-06-01"
            }

            # timeout=(connect, read) keeps a stalled download from hanging the
            # notebook forever; the context manager releases the connection.
            with requests.get(batch_final.results_url, headers=headers,
                              stream=True, timeout=(10, 300)) as response:
                status_code = response.status_code
                if status_code == 200:
                    results_data = [line.decode('utf-8') for line in response.iter_lines()]
                else:
                    results_data = None

            if results_data is None:
                print(f"   ❌ Error fetching results: HTTP {status_code}")
            else:
                # Process results
                batch_df_scored, n_success, errors = process_batch_results(batch_df, results_data)

                print(f"   ✅ Scored {n_success:,}/{len(batch_df):,} messages")
                if errors:
                    print(f"   ⚠️  {len(errors)} parse errors")
                    total_errors += len(errors)

                total_processed += n_success
                all_results.append(batch_df_scored)

                # Save intermediate results
                batch_df_scored.to_csv(intermediate_file, index=False)
                print(f"   💾 Saved to {intermediate_file.name}")

    except Exception as e:
        # Best effort: report the failure and move on to the next batch.
        print(f"   ❌ Batch processing error: {str(e)}")
        continue
    finally:
        if bar is not None:
            bar.close()

    # Break between batches
    if batch_num < len(batches):
        print(f"\n⏸️  Waiting 30 seconds before next batch...")
        time.sleep(30)

# ═══════════════════════════════════════════════════════════════════════
# COMBINE RESULTS AND SAVE (same as before)
# ═══════════════════════════════════════════════════════════════════════

def _as_float(value):
    """Return ``value`` as a float, mapping missing values (NaN / pd.NA) to NaN.

    Statistics on nullable-integer columns can come back as ``pd.NA`` (for
    example a standard deviation over a single row), and ``pd.NA`` does not
    accept float format specs such as ``:.2f``.
    """
    return float("nan") if pd.isna(value) else float(value)


print(f"\n{'='*60}")
print("📊 Combining all results...")

# Combine all batches
if all_results:
    final_df = pd.concat(all_results, ignore_index=True)

    # Sort by original order if global_idx exists
    if 'global_idx' in final_df.columns:
        final_df = final_df.sort_values('global_idx')

    # Drop processing columns
    columns_to_drop = ['batch_num', 'batch_idx', 'global_idx']
    final_df = final_df.drop(columns=[col for col in columns_to_drop if col in final_df.columns])

    # Save final results (written before any statistics are printed)
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_output = OUT_DIR / f"telegram_scored_{MODEL}_{timestamp}.csv"
    final_df.to_csv(final_output, index=False)

    print(f"\n✅ SCORING COMPLETE!")
    print(f"   Total messages: {len(df):,}")
    print(f"   Successfully scored: {total_processed:,}")
    print(f"   Failed: {len(df) - total_processed:,}")
    print(f"   Success rate: {total_processed/len(df)*100:.1f}%")
    print(f"   Total errors: {total_errors}")

    # Calculate statistics by category
    print("\n📊 SCORE DISTRIBUTIONS BY CATEGORY:")
    print("-" * 50)

    for category in final_df['channel_category'].unique():
        cat_df = final_df[final_df['channel_category'] == category]
        n_scored = cat_df['escalation_score'].notna().sum()

        if n_scored > 0:
            print(f"\n{category} ({n_scored:,} messages):")

            # Escalation
            esc_mean = _as_float(cat_df['escalation_score'].mean())
            esc_std = _as_float(cat_df['escalation_score'].std())
            print(f"   Escalation: {esc_mean:.2f} ± {esc_std:.2f}")

            # Blame
            blame_counts = cat_df['blame_direction'].value_counts().sort_index()
            blame_dict = {-1: "Neutral", 0: "Blames West", 1: "Blames Russia"}
            print("   Blame direction:")
            for val, count in blame_counts.items():
                if pd.notna(val):
                    label = blame_dict.get(int(val), val)
                    pct = count / n_scored * 100
                    print(f"      {label}: {count:,} ({pct:.1f}%)")

            # Propaganda
            prop_mean = _as_float(cat_df['propaganda_level'].mean())
            print(f"   Propaganda level: {prop_mean:.2f}/3.0")

            # CTA
            cta_pct = cat_df['has_cta'].sum() / n_scored * 100
            print(f"   Has call-to-action: {cta_pct:.1f}%")

    # Overall statistics
    print(f"\n📊 OVERALL STATISTICS:")
    print("-" * 50)
    overall_esc = _as_float(final_df['escalation_score'].mean())
    print(f"Average escalation: {overall_esc:.2f}")

    # Compare pro-Russian vs pro-Ukrainian
    pro_ru = final_df[final_df['channel_category'] == 'pro_russian_grassroots']['escalation_score'].mean()
    pro_ua = final_df[final_df['channel_category'] == 'pro_ukrainian_grassroots']['escalation_score'].mean()

    if pd.notna(pro_ru) and pd.notna(pro_ua):
        print(f"\nPro-Russian avg escalation: {pro_ru:.2f}")
        print(f"Pro-Ukrainian avg escalation: {pro_ua:.2f}")
        print(f"Difference: {abs(pro_ru - pro_ua):.2f}")

    print(f"\n📁 Final results saved to: {final_output}")

else:
    print("❌ No results to combine!")

print("\n✨ Ready for analysis!")

### Retrieve Partial Batch 11

In [None]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  RECOVER AND RESUME - Anthropic Batch Processing (FIXED)              ║
# ╚══════════════════════════════════════════════════════════════════════╝
from pathlib import Path
import json, time, pandas as pd, re, tqdm
import anthropic
from dotenv import load_dotenv
import os
import requests
from datetime import datetime
import numpy as np

load_dotenv()

# Configuration
ROOT = Path.cwd().resolve().parents[0] if Path.cwd().name != 'ukraine-final-project' else Path.cwd()
TELEGRAM_CSV = ROOT / "outputs" / "telegram_full_20250605_213258.csv"
OUT_DIR = ROOT / "outputs" / "telegram_scoring"
OUT_DIR.mkdir(exist_ok=True)
MODEL = "claude-sonnet-4-20250514"
BATCH_SIZE = 15000

# Initialize client
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# ═══════════════════════════════════════════════════════════════════════
# STEP 1: Find all existing batches
# ═══════════════════════════════════════════════════════════════════════

print("🔍 Scanning for all existing batches...")

# List recent batches
recent_batches = client.messages.batches.list(limit=50)
completed_batches = []

for batch in recent_batches.data:
    if batch.processing_status == "ended" and batch.request_counts.succeeded > 0:
        # created_at is already a datetime object
        completed_batches.append({
            'id': batch.id,
            'succeeded': batch.request_counts.succeeded,
            'failed': batch.request_counts.errored,
            'created': batch.created_at,  # Already a datetime
            'results_url': getattr(batch, 'results_url', None)
        })
        print(f"   ✅ {batch.id}: {batch.request_counts.succeeded} succeeded")

print(f"\nFound {len(completed_batches)} completed batches")

# ═══════════════════════════════════════════════════════════════════════
# STEP 2: Download any missing batch results
# ═══════════════════════════════════════════════════════════════════════

# Check which batch files we already have
existing_files = list(OUT_DIR.glob("batch_*_scored.csv"))
existing_batch_nums = set()
for f in existing_files:
    match = re.search(r'batch_(\d+)_scored\.csv', f.name)
    if match:
        existing_batch_nums.add(int(match.group(1)))

print(f"\n📂 Already have batch files: {sorted(existing_batch_nums)}")

# Known batch IDs from your output
known_batch_ids = {
    10: "msgbatch_01AZ767eqYBQuRrzN2PidyaC",  # The one in your screenshot
    11: "msgbatch_01GFn7vV3JzBQaXeTRW3m1gv"   # The one that was processing
}

# Download raw results for every known batch that has no scored CSV yet.
# Only the raw JSONL is written here; it is parsed into a scored CSV in STEP 4.
for batch_num, batch_id in known_batch_ids.items():
    if batch_num in existing_batch_nums:
        continue

    print(f"\n📥 Checking batch {batch_num} ({batch_id})...")
    try:
        batch_status = client.messages.batches.retrieve(batch_id)
        print(f"   Status: {batch_status.processing_status}")
        print(f"   Succeeded: {batch_status.request_counts.succeeded}")

        if not (batch_status.results_url and batch_status.request_counts.succeeded > 0):
            # Say so explicitly instead of moving on without any message.
            print("   ⚠️  No results to download for this batch")
            continue

        print("   📥 Downloading results...")
        headers = {
            "x-api-key": os.getenv("ANTHROPIC_API_KEY"),
            "anthropic-version": "2023-06-01"
        }
        # (connect, read) timeout in seconds: without one, requests.get() can
        # block indefinitely on a stalled connection. A timeout raises and is
        # reported by the handler below.
        response = requests.get(
            batch_status.results_url,
            headers=headers,
            timeout=(10, 300),
        )

        if response.status_code == 200:
            # Save raw results
            raw_file = OUT_DIR / f"batch_{batch_num}_raw_results.jsonl"
            with open(raw_file, 'wb') as f:
                f.write(response.content)
            print(f"   💾 Saved raw results to {raw_file}")
        else:
            print(f"   ❌ Error downloading: HTTP {response.status_code}")
    except Exception as e:
        # Best effort: a failure for one batch must not stop the others.
        print(f"   ❌ Error retrieving batch: {str(e)}")

# ═══════════════════════════════════════════════════════════════════════
# STEP 3: Load original data
# ═══════════════════════════════════════════════════════════════════════

print("\n📊 Loading original Telegram data...")
df = pd.read_csv(TELEGRAM_CSV)
df = df[df['message_text'].notna()].copy()
df = df[df['message_text'].str.strip() != ''].copy()
df['global_idx'] = range(len(df))
print(f"✅ Loaded {len(df):,} messages")

# ═══════════════════════════════════════════════════════════════════════
# STEP 4: Process any raw batch results
# ═══════════════════════════════════════════════════════════════════════

def _parse_batch_result_line(line):
    """Classify one JSONL result line as ``(status, idx, values)``.

    ``status`` is one of:
      * ``"skip"``     - the record has no ``custom_id``
      * ``"failed"``   - the request's result type is not ``"succeeded"``
      * ``"unparsed"`` - malformed record, or reply not in ``E,B,P,C`` form
      * ``"ok"``       - ``values`` holds the four integers (E, B, P, C)
    """
    try:
        result = json.loads(line)
        custom_id = result.get("custom_id")
        if custom_id is None:
            return "skip", None, None

        idx = int(custom_id)

        if result.get("result", {}).get("type") != "succeeded":
            return "failed", idx, None

        reply = result["result"]["message"]["content"][0]["text"].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError):
        # Malformed JSON, a non-integer custom_id, or an unexpected record
        # shape. Anything else is a programming error and should surface.
        return "unparsed", None, None

    match = re.match(r'^(\d+),(-?\d+),(\d+),(\d+)', reply)
    if match is None:
        return "unparsed", idx, None
    return "ok", idx, tuple(int(group) for group in match.groups())


def process_raw_batch_file(batch_num, df_full):
    """Turn a downloaded batch-results JSONL file into a scored CSV.

    Reads ``OUT_DIR / batch_{batch_num}_raw_results.jsonl``, extracts the four
    comma-separated integers from each succeeded request, attaches them to the
    matching slice of ``df_full`` and writes
    ``OUT_DIR / batch_{batch_num}_scored.csv``.

    Args:
        batch_num: 1-based batch number. Selects rows
            ``[(batch_num - 1) * BATCH_SIZE, batch_num * BATCH_SIZE)`` of
            ``df_full`` (clipped to its length).
        df_full: The full message DataFrame, in the order used for slicing.

    Returns:
        The batch slice with ``batch_idx``, ``batch_num``, ``global_idx`` and
        the four score columns ``escalation_score``, ``blame_direction``,
        ``propaganda_level``, ``has_cta`` (nullable ``Int64``; ``<NA>`` where
        no score was parsed), or ``None`` when the raw file does not exist.
    """
    raw_file = OUT_DIR / f"batch_{batch_num}_raw_results.jsonl"
    if not raw_file.exists():
        return None

    print(f"\n📊 Processing raw results for batch {batch_num}...")

    # Get the slice of data for this batch
    start_idx = (batch_num - 1) * BATCH_SIZE
    end_idx = min(batch_num * BATCH_SIZE, len(df_full))
    batch_df = df_full.iloc[start_idx:end_idx].copy()
    batch_df['batch_idx'] = range(len(batch_df))
    batch_df['batch_num'] = batch_num
    # Sized from the slice itself so an out-of-range batch_num cannot produce
    # a length mismatch.
    batch_df['global_idx'] = range(start_idx, start_idx + len(batch_df))

    score_columns = ("escalation_score", "blame_direction", "propaganda_level", "has_cta")
    scores = {col: {} for col in score_columns}
    failed_requests = 0
    parse_errors = 0

    # JSON text is UTF-8; be explicit so the platform default encoding
    # (e.g. cp1252 on Windows) is never used to decode the file.
    with open(raw_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            status, idx, values = _parse_batch_result_line(line)
            if status == "ok":
                for col, value in zip(score_columns, values):
                    scores[col][idx] = value
            elif status == "failed":
                failed_requests += 1
            elif status == "unparsed":
                parse_errors += 1

    # Map scores to dataframe.
    # NOTE(review): this assumes custom_id is the 0-based position of the
    # message inside its batch - confirm against the cell that built the
    # original requests.
    for col in score_columns:
        batch_df[col] = batch_df["batch_idx"].map(scores[col]).astype("Int64")

    # Save processed results
    output_file = OUT_DIR / f"batch_{batch_num}_scored.csv"
    batch_df.to_csv(output_file, index=False)

    # Report rows that actually received a score, not the number of parsed
    # result lines (the two differ if a custom_id falls outside the slice).
    n_scored = int(batch_df["escalation_score"].notna().sum())
    print(f"   ✅ Saved {n_scored} scored messages to {output_file}")
    if failed_requests > 0:
        print(f"   ⚠️  {failed_requests} requests did not succeed")
    if parse_errors > 0:
        print(f"   ⚠️  {parse_errors} messages failed to parse")

    return batch_df

# Process any raw batch files
for batch_num in range(1, 13):
    raw_file = OUT_DIR / f"batch_{batch_num}_raw_results.jsonl"
    scored_file = OUT_DIR / f"batch_{batch_num}_scored.csv"
    if raw_file.exists() and not scored_file.exists():
        process_raw_batch_file(batch_num, df)

# ═══════════════════════════════════════════════════════════════════════
# STEP 5: Combine all existing results
# ═══════════════════════════════════════════════════════════════════════

print("\n📊 Combining all existing results...")
all_results = []
total_scored = 0
batch_coverage = {}

for batch_num in range(1, 13):  # Check batches 1-12
    batch_file = OUT_DIR / f"batch_{batch_num}_scored.csv"
    if batch_file.exists():
        batch_df = pd.read_csv(batch_file)
        all_results.append(batch_df)
        n_scored = batch_df['escalation_score'].notna().sum()
        total_scored += n_scored
        batch_coverage[batch_num] = n_scored
        print(f"   Batch {batch_num}: {n_scored:,} messages")

print(f"\n✅ Total scored so far: {total_scored:,} / {len(df):,}")
remaining = len(df) - total_scored
print(f"   Remaining to score: {remaining:,}")

# ═══════════════════════════════════════════════════════════════════════
# STEP 6: Create final output and identify gaps
# ═══════════════════════════════════════════════════════════════════════

if all_results:
    # Combine all results
    combined_df = pd.concat(all_results, ignore_index=True)
    
    # Sort and clean
    if 'global_idx' in combined_df.columns:
        combined_df = combined_df.sort_values('global_idx')
    
    # Drop processing columns
    columns_to_drop = ['batch_num', 'batch_idx', 'global_idx']
    combined_df = combined_df.drop(columns=[col for col in columns_to_drop if col in combined_df.columns], errors='ignore')
    
    # Save combined results
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    final_output = OUT_DIR / f"telegram_scored_COMBINED_{timestamp}.csv"
    combined_df.to_csv(final_output, index=False)
    print(f"\n💾 Saved combined results to: {final_output}")
    
    # Calculate statistics
    print("\n📊 SCORE DISTRIBUTIONS BY CATEGORY:")
    print("-" * 50)
    
    for category in combined_df['channel_category'].unique():
        cat_df = combined_df[combined_df['channel_category'] == category]
        n_scored = cat_df['escalation_score'].notna().sum()
        
        if n_scored > 0:
            print(f"\n{category} ({n_scored:,} messages):")
            esc_mean = cat_df['escalation_score'].mean()
            esc_std = cat_df['escalation_score'].std()
            print(f"   Escalation: {esc_mean:.2f} ± {esc_std:.2f}")
            
            # Propaganda
            prop_mean = cat_df['propaganda_level'].mean()
            print(f"   Propaganda level: {prop_mean:.2f}/3.0")
            
            # CTA
            cta_pct = cat_df['has_cta'].sum() / n_scored * 100
            print(f"   Has call-to-action: {cta_pct:.1f}%")

# Identify missing batches
print(f"\n📌 BATCH SUMMARY:")
# Derive the batch layout from the loaded data rather than hard-coding the
# number of batches and the size of the last one, so the summary stays correct
# if the input CSV or BATCH_SIZE changes.
n_batches = -(-len(df) // BATCH_SIZE)  # ceiling division
for i in range(1, n_batches + 1):
    # Every batch is full except possibly the last one.
    expected = min(BATCH_SIZE, len(df) - (i - 1) * BATCH_SIZE)
    if i not in batch_coverage:
        print(f"   Batch {i}: 0/{expected:,} ❌ MISSING")
        continue

    actual = batch_coverage[i]
    if actual < expected:
        print(f"   Batch {i}: {actual:,}/{expected:,} ⚠️  INCOMPLETE")
    else:
        print(f"   Batch {i}: {actual:,}/{expected:,} ✅")

if remaining > 0:
    print(f"\n💡 NEXT STEPS:")
    print(f"   - You need to score {remaining:,} more messages")
    print(f"   - Consider using the OpenAI script for faster processing")
    print(f"   - Or create smaller Anthropic batches (5k messages each)")

In [None]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  SCORE REMAINING MESSAGES - Single Anthropic Batch                    ║
# ╚══════════════════════════════════════════════════════════════════════╝
from pathlib import Path
import json, time, pandas as pd, tqdm, re
import anthropic
from dotenv import load_dotenv
import os
import requests
from datetime import datetime
import numpy as np

load_dotenv()

# Configuration
ROOT = Path.cwd().resolve().parents[0] if Path.cwd().name != 'ukraine-final-project' else Path.cwd()
TELEGRAM_CSV = ROOT / "outputs" / "telegram_full_20250605_213258.csv"
OUT_DIR = ROOT / "outputs" / "telegram_scoring"
MODEL = "claude-sonnet-4-20250514"

# ═══════════════════════════════════════════════════════════════════════
# SCORING PROMPT - sent as the system prompt of every request
# ═══════════════════════════════════════════════════════════════════════

TELEGRAM_PROMPT = """You are analyzing Telegram messages about the Russia-Ukraine war.
Score each message on FOUR dimensions:

1. Escalation (0-10): How militarily escalatory is the message?
2. Blame (-1, 0, 1): Who does the message blame?
   - 0 = Blames Ukraine/NATO/West
   - 1 = Blames Russia/Putin  
   - -1 = No clear blame/neutral reporting
3. Propaganda (0-3): Level of propaganda/manipulation
   - 0 = Factual reporting
   - 1 = Mild spin/framing
   - 2 = Clear propaganda
   - 3 = Extreme propaganda/disinformation
4. Call-to-Action (0, 1): Does it urge concrete action?
   - 1 = Yes (donate, volunteer, protest, etc.)
   - 0 = No explicit call

Escalation scale reference:
0: Humanitarian/diplomatic content
1: Peace negotiation focus  
2: Political warfare/disinformation
3: Routine warfare updates
4: Major aid package discussions
5: Major weapons systems
6: Advanced weapons escalation
7: Nation-wide strikes
8: Strategic infrastructure attacks
9: Nuclear facility/weapons threats
10: Direct nuclear war rhetoric

---

# Real message examples (message → E,B,P,C):

- "Гуманитарная помощь доставлена в освобожденные районы"                                → 0,-1,0,0
- "Zelensky's advisor confirms Ukraine ready for talks: "We're not against negotiations, but they must be based on international law and Ukraine's sovereignty."" → 1,1,0,0
- "🇺🇸🇺🇦The United States will be transferring Patriot air defense systems to Ukraine, — Kirby
The White House confirmed that Patriots are included in the aid package that Biden will announce." → 5,1,0,0
- "🇬🇧🇺🇦 Great Britain secretly supplied Ukraine with Storm Shadow cruise missiles, writes CNN.
According to the publication, the deliveries took place a few weeks ago. The UK has asked Ukraine not to use missiles to attack facilities on the territory of the Russian Federation.
It is noted that Storm Shadow has a range of more than 250 km." → 6,1,1,0
- "Biden pushing us to WW3, must negotiate NOW!"                                         → 8,0,2,1  
- "Российские войска нанесли удар по складу ВСУ в Харькове"                             → 3,1,1,0
- "⚡️BREAKING: Nuclear plant under attack! NATO must act!"                               → 9,0,3,1
- "Сегодня ВСУ обстреляли жилые кварталы Донецка. 5 погибших."                          → 4,0,2,0
- "Отчет с передовой: позиции удерживаются, враг несет потери"                          → 3,-1,1,0
- "DONATE NOW to help Ukrainian defenders! Every dollar saves lives!"                     → 2,1,1,1
- "Путин угрожает ядерным оружием если НАТО вмешается"                                  → 10,1,2,0
- "💥💥💥 Explosions all over Ukraine

Explosions in:
Vinnytsia;
The Vinnitsa region;
Dnipro;
Zaporizhia;
Kirovograd;
Kharkov;
Odessa;
N.....

Air raid throughout Ukraine." → 7,0,1,0

---

CRITICAL: Respond ONLY with four integers in format E,B,P,C
No spaces, no explanations - just four numbers with three commas.
Example: 5,0,2,1"""

# ═══════════════════════════════════════════════════════════════════════
# LOAD DATA AND FIND UNSCORED MESSAGES
# ═══════════════════════════════════════════════════════════════════════

print("📊 Loading data...")

# Load original data
df_original = pd.read_csv(TELEGRAM_CSV)
df_original = df_original[df_original['message_text'].notna()].copy()
df_original = df_original[df_original['message_text'].str.strip() != ''].copy()
df_original['original_idx'] = range(len(df_original))
print(f"✅ Total messages: {len(df_original):,}")

# Load already scored data
combined_files = sorted(OUT_DIR.glob("telegram_scored_COMBINED_*.csv"))
if not combined_files:
    raise FileNotFoundError(
        f"No telegram_scored_COMBINED_*.csv found in {OUT_DIR}; "
        "create the combined file before scoring the remainder"
    )
# File names end in a %Y%m%d_%H%M%S timestamp, so the last one in sorted
# order is the most recent.
scored_file = combined_files[-1]
df_scored = pd.read_csv(scored_file)
# The combined file can contain rows whose request failed or whose reply
# could not be parsed (score columns empty). Keep only rows that really have
# a score; otherwise those messages would count as "already scored" and
# never be sent for scoring.
df_scored = df_scored[df_scored['escalation_score'].notna()].copy()
print(f"✅ Already scored: {len(df_scored):,}")

# Find unscored messages
# We need to identify which original messages haven't been scored
# This is tricky because we need to match on content, not just index

# Create a unique identifier for matching
def create_message_id(row):
    """Build the string key used to match a row against already-scored rows.

    The key is ``"<first 100 chars of message_text>|<channel_username>|<date>"``.
    A missing ``channel_username`` or ``date`` contributes an empty string.
    Rows that agree on all three parts get the same key.
    """
    key_parts = (
        str(row['message_text'])[:100],
        str(row.get('channel_username', '')),
        str(row.get('date', '')),
    )
    return "|".join(key_parts)

df_original['msg_id'] = df_original.apply(create_message_id, axis=1)
df_scored['msg_id'] = df_scored.apply(create_message_id, axis=1)

# Find unscored messages
scored_ids = set(df_scored['msg_id'])
df_unscored = df_original[~df_original['msg_id'].isin(scored_ids)].copy()

print(f"\n📊 Messages to score: {len(df_unscored):,}")
print(f"   Expected: 25,472")
print(f"   Actual: {len(df_unscored):,}")

if len(df_unscored) == 0:
    print("\n✅ All messages already scored!")
    # Stop this cell here. In a Jupyter kernel the interactive exit() helper
    # asks the kernel itself to shut down, discarding all notebook state;
    # raising SystemExit only ends execution of the current cell.
    raise SystemExit("Nothing left to score")

# Show category breakdown
print("\n📈 Unscored messages by category:")
for cat, count in df_unscored['channel_category'].value_counts().items():
    print(f"   {cat}: {count:,}")

# ═══════════════════════════════════════════════════════════════════════
# PREPARE BATCH REQUESTS
# ═══════════════════════════════════════════════════════════════════════

print("\n🚀 Preparing batch requests...")

# Reset index for batch processing
df_unscored = df_unscored.reset_index(drop=True)
df_unscored['batch_idx'] = range(len(df_unscored))

def _build_batch_request(row):
    """Build one Message Batches request entry for a row of ``df_unscored``.

    ``custom_id`` is the row's ``batch_idx`` as a string; the user message is
    a channel header followed by the message text cut to 1500 characters.
    """
    # Truncate very long messages
    text = str(row['message_text'])[:1500]

    # Add channel category context.
    # NOTE(review): "\\n" in this f-string is a literal backslash followed by
    # "n", not a newline. Left unchanged so the text sent to the model stays
    # the same - confirm whether a real line break was intended.
    context = f"[Channel: {row['channel_username']} ({row['channel_category']})]\\n{text}"

    return {
        "custom_id": str(row['batch_idx']),
        "params": {
            "model": MODEL,
            "max_tokens": 15,
            "temperature": 0,
            "system": TELEGRAM_PROMPT,
            "messages": [
                {"role": "user", "content": context}
            ]
        }
    }


# One request per unscored row, in DataFrame order.
requests_list = [_build_batch_request(row) for _, row in df_unscored.iterrows()]

print(f"✅ Prepared {len(requests_list):,} requests")

# ═══════════════════════════════════════════════════════════════════════
# SEND TO ANTHROPIC
# ═══════════════════════════════════════════════════════════════════════

# Initialize client
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

print("\n🚀 Creating Anthropic batch...")
print(f"   Model: {MODEL}")
print(f"   Messages: {len(requests_list):,}")

# Create batch
try:
    batch = client.messages.batches.create(requests=requests_list)
    print(f"\n✅ Batch created successfully!")
    print(f"   Batch ID: {batch.id}")
    print(f"   Status: {batch.processing_status}")
    
    # Save batch info
    batch_info = {
        'batch_id': batch.id,
        'created_at': str(datetime.now()),
        'n_messages': len(requests_list),
        'model': MODEL
    }
    
    with open(OUT_DIR / 'final_batch_info.json', 'w') as f:
        json.dump(batch_info, f, indent=2)
    
    print("\n📄 Batch info saved to final_batch_info.json")
    
    # Monitor progress
    print("\n⏳ Monitoring batch progress...")
    print("   This may take 10-30 minutes for 25k messages")
    
    bar = tqdm.tqdm(total=len(requests_list), desc="Processing", unit="msg")
    start_time = time.time()
    
    # Poll the batch every 5 seconds until it has ended.
    # The detailed postfix is refreshed on a real 30-second schedule: testing
    # `int(elapsed) % 30 == 0` only fires when a poll happens to land on an
    # exact multiple of 30 seconds, which drifts with request latency.
    next_detail_at = time.time()  # show details on the very first poll
    while True:
        batch_status = client.messages.batches.retrieve(batch.id)
        counts = batch_status.request_counts
        # Requests in any terminal state count as done.
        bar.n = counts.succeeded + counts.errored + counts.canceled + counts.expired
        bar.refresh()

        # Show detailed status every 30 seconds
        now = time.time()
        if now >= next_detail_at:
            bar.set_postfix({
                'succeeded': counts.succeeded,
                'errors': counts.errored,
                'status': batch_status.processing_status
            })
            next_detail_at = now + 30

        if batch_status.processing_status == "ended":
            bar.close()
            break

        time.sleep(5)
    
    elapsed_time = time.time() - start_time
    print(f"\n✅ Batch complete in {elapsed_time/60:.1f} minutes")
    print(f"   Succeeded: {batch_status.request_counts.succeeded:,}")
    print(f"   Failed: {batch_status.request_counts.errored:,}")
    
    # Retrieve and save results
    if batch_status.results_url:
        print("\n📥 Downloading results...")
        headers = {
            "x-api-key": os.getenv("ANTHROPIC_API_KEY"),
            "anthropic-version": "2023-06-01"
        }
        
        response = requests.get(batch_status.results_url, headers=headers, stream=True)
        
        if response.status_code == 200:
            # Save raw results
            raw_file = OUT_DIR / "final_batch_raw_results.jsonl"
            with open(raw_file, 'wb') as f:
                f.write(response.content)
            print(f"   💾 Saved raw results to {raw_file}")
            
            # Process results
            print("\n📊 Processing results...")
            from collections import defaultdict
            scores = defaultdict(dict)
            parse_errors = []
            
            with open(raw_file, 'r') as f:
                for line in f:
                    if not line.strip():
                        continue
                    try:
                        result = json.loads(line)
                        custom_id = result.get("custom_id")
                        if custom_id is None:
                            continue
                            
                        idx = int(custom_id)
                        
                        if result.get("result", {}).get("type") != "succeeded":
                            parse_errors.append(f"Request {custom_id} failed")
                            continue
                        
                        message_content = result["result"]["message"]["content"][0]["text"].strip()
                        match = re.match(r'^(\d+),(-?\d+),(\d+),(\d+)', message_content)
                        
                        if match:
                            scores["escalation_score"][idx] = int(match.group(1))
                            scores["blame_direction"][idx] = int(match.group(2))
                            scores["propaganda_level"][idx] = int(match.group(3))
                            scores["has_cta"][idx] = int(match.group(4))
                        else:
                            parse_errors.append(f"Parse error: {message_content}")
                            
                    except Exception as e:
                        parse_errors.append(f"Error: {str(e)}")
            
            # Map scores to dataframe
            for col in ['escalation_score', 'blame_direction', 'propaganda_level', 'has_cta']:
                df_unscored[col] = df_unscored['batch_idx'].map(scores.get(col, {})).astype("Int64")
            
            # Save final scored data
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            final_file = OUT_DIR / f"telegram_remaining_scored_{timestamp}.csv"
            df_unscored.drop(columns=['batch_idx', 'original_idx', 'msg_id'], inplace=True)
            df_unscored.to_csv(final_file, index=False)
            
            print(f"\n✅ SCORING COMPLETE!")
            print(f"   Successfully scored: {len(scores['escalation_score']):,}")
            print(f"   Parse errors: {len(parse_errors)}")
            print(f"   Saved to: {final_file}")
            
            # Combine with existing results
            print("\n📊 Creating final combined file...")
            df_combined = pd.concat([df_scored, df_unscored], ignore_index=True)
            final_combined = OUT_DIR / f"telegram_FINAL_COMPLETE_{timestamp}.csv"
            df_combined.to_csv(final_combined, index=False)
            print(f"   💾 Complete dataset saved to: {final_combined}")
            print(f"   Total messages: {len(df_combined):,}")
            
        else:
            print(f"   ❌ Error downloading results: HTTP {response.status_code}")
    else:
        print("   ❌ No results URL available")
        
except Exception as e:
    print(f"\n❌ Error creating batch: {str(e)}")
    print("\nPossible reasons:")
    print("- Daily batch limit reached")
    print("- API key issues")
    print("- Network problems")
    print("\nTry again later or use smaller batches")

In [None]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  FAST CONCURRENT VERSION - Like your working script                    ║
# ╚══════════════════════════════════════════════════════════════════════╝
from pathlib import Path
import pandas as pd
import anthropic
from dotenv import load_dotenv
import os
from datetime import datetime
import time
import re
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

load_dotenv()

# Configuration
ROOT = Path.cwd().resolve().parents[0] if Path.cwd().name != 'ukraine-final-project' else Path.cwd()
TELEGRAM_CSV = ROOT / "outputs" / "telegram_full_20250605_213258.csv"
OUT_DIR = ROOT / "outputs" / "telegram_scoring"
MODEL = "claude-sonnet-4-20250514"

# Match your working script
MAX_WORKERS = 10  # Same as your fast script

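# Scoring prompt (system prompt for every request).
# NOTE(review): this copy has fewer few-shot examples than the prompt in the
# batch-scoring cell above (the Patriot, Storm Shadow and nationwide-explosions
# examples are missing) - confirm which version the already-scored messages used.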
TELEGRAM_PROMPT = """You are analyzing Telegram messages about the Russia-Ukraine war.
Score each message on FOUR dimensions:

1. Escalation (0-10): How militarily escalatory is the message?
2. Blame (-1, 0, 1): Who does the message blame?
   - 0 = Blames Ukraine/NATO/West
   - 1 = Blames Russia/Putin  
   - -1 = No clear blame/neutral reporting
3. Propaganda (0-3): Level of propaganda/manipulation
   - 0 = Factual reporting
   - 1 = Mild spin/framing
   - 2 = Clear propaganda
   - 3 = Extreme propaganda/disinformation
4. Call-to-Action (0, 1): Does it urge concrete action?
   - 1 = Yes (donate, volunteer, protest, etc.)
   - 0 = No explicit call

Escalation scale reference:
0: Humanitarian/diplomatic content
1: Peace negotiation focus  
2: Political warfare/disinformation
3: Routine warfare updates
4: Major aid package discussions
5: Major weapons systems
6: Advanced weapons escalation
7: Nation-wide strikes
8: Strategic infrastructure attacks
9: Nuclear facility/weapons threats
10: Direct nuclear war rhetoric

---

# Real message examples (message → E,B,P,C):

- "Гуманитарная помощь доставлена в освобожденные районы"                                → 0,-1,0,0
- "Zelensky's advisor confirms Ukraine ready for talks: "We're not against negotiations, but they must be based on international law and Ukraine's sovereignty."" → 1,1,0,0
- "Biden pushing us to WW3, must negotiate NOW!"                                         → 8,0,2,1  
- "Российские войска нанесли удар по складу ВСУ в Харькове"                             → 3,1,1,0
- "⚡️BREAKING: Nuclear plant under attack! NATO must act!"                               → 9,0,3,1
- "Сегодня ВСУ обстреляли жилые кварталы Донецка. 5 погибших."                          → 4,0,2,0
- "Отчет с передовой: позиции удерживаются, враг несет потери"                          → 3,-1,1,0
- "DONATE NOW to help Ukrainian defenders! Every dollar saves lives!"                     → 2,1,1,1
- "Путин угрожает ядерным оружием если НАТО вмешается"                                  → 10,1,2,0

---

CRITICAL: Respond ONLY with four integers in format E,B,P,C
No spaces, no explanations - just four numbers with three commas.
Example: 5,0,2,1"""

print("🚀 Starting FAST Telegram scoring...")
print(f"   Time: {datetime.now().strftime('%H:%M:%S')}")

# Initialize client
client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

# Load data (same as before)
print("\n📊 Loading data...")
df_original = pd.read_csv(TELEGRAM_CSV)
df_original = df_original[df_original['message_text'].notna()].copy()
df_original = df_original[df_original['message_text'].str.strip() != ''].copy()

combined_files = sorted(OUT_DIR.glob("telegram_scored_COMBINED_*.csv"))
if not combined_files:
    raise FileNotFoundError(
        f"No telegram_scored_COMBINED_*.csv found in {OUT_DIR}; "
        "create the combined file before scoring the remainder"
    )
# File names end in a %Y%m%d_%H%M%S timestamp, so the last one in sorted
# order is the most recent.
scored_file = combined_files[-1]
df_scored = pd.read_csv(scored_file)
# The combined file can contain rows with empty score columns (failed or
# unparseable requests). Keep only rows that really have a score so the rest
# are picked up as unscored below.
df_scored = df_scored[df_scored['escalation_score'].notna()].copy()

# Find unscored messages
def create_message_id(row):
    """Return the matching key ``"<message prefix>|<channel_username>|<date>"``.

    The message prefix is the first 100 characters of ``message_text``; a
    missing ``channel_username`` or ``date`` becomes an empty string.
    """
    message_prefix = str(row['message_text'])[:100]
    optional_parts = [str(row.get(field, '')) for field in ('channel_username', 'date')]
    return "|".join([message_prefix, *optional_parts])

df_original['msg_id'] = df_original.apply(create_message_id, axis=1)
df_scored['msg_id'] = df_scored.apply(create_message_id, axis=1)

scored_ids = set(df_scored['msg_id'])
df_unscored = df_original[~df_original['msg_id'].isin(scored_ids)].copy()
df_unscored = df_unscored.reset_index(drop=True)

print(f"✅ Found {len(df_unscored):,} messages to score")

# Function to score a single message (like your working script)
def _parse_scores(content):
    """Parse a leading 'E,B,P,C' reply into a score dict; return None if it does not match."""
    match = re.match(r'^(\d+),(-?\d+),(\d+),(\d+)', content)
    if match is None:
        return None
    escalation, blame, propaganda, cta = (int(group) for group in match.groups())
    return {
        'escalation_score': escalation,
        'blame_direction': blame,
        'propaganda_level': propaganda,
        'has_cta': cta,
    }


def _request_scores(context):
    """Send one scoring request for ``context`` and return the stripped reply text."""
    response = client.messages.create(
        model=MODEL,
        max_tokens=15,
        temperature=0,
        system=TELEGRAM_PROMPT,
        messages=[{"role": "user", "content": context}]
    )
    return response.content[0].text.strip()


def score_message(idx, row):
    """Score a single message and report the outcome without raising.

    Args:
        idx: Row identifier, echoed back in the result so the caller can map
            the outcome to its row.
        row: Mapping-like row providing ``message_text``, ``channel_username``
            and ``channel_category``.

    Returns:
        dict: ``{"idx": idx, "scores": <dict or None>, "error": <str or None>}``.
        On success ``scores`` holds ``escalation_score``, ``blame_direction``,
        ``propaganda_level`` and ``has_cta`` as ints. On failure ``scores`` is
        None and ``error`` describes the problem (``"Parse error: ..."`` for an
        unparsable reply, ``"Rate limit: ..."`` when the single retry after a
        rate-limit error also fails, otherwise the exception text).
    """
    results = {"idx": idx, "scores": None, "error": None}

    try:
        text = str(row['message_text'])[:1500]
        # Real newline between the channel header and the message body; the
        # previous "\\n" put a literal backslash-n into the prompt.
        context = f"[Channel: {row['channel_username']} ({row['channel_category']})]\n{text}"

        try:
            content = _request_scores(context)
        except anthropic.RateLimitError:
            # Wait and retry once
            time.sleep(5)
            try:
                content = _request_scores(context)
            except Exception as retry_error:
                results["error"] = f"Rate limit: {str(retry_error)}"
                return results

        scores = _parse_scores(content)
        if scores is not None:
            results["scores"] = scores
        else:
            # Applies to the retry reply as well, so an unparsable retry is no
            # longer returned with both "scores" and "error" left as None.
            results["error"] = f"Parse error: {content}"

    except Exception as e:
        results["error"] = str(e)

    return results

# Fan the scoring calls out over a thread pool and collect outcomes as they finish
print(f"\n🚀 Processing with {MAX_WORKERS} concurrent workers...")
print("   This should be MUCH faster!\n")

results_dict = {}
start_time = time.time()

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Queue one scoring task per unscored row, remembering which row it belongs to
    futures = {}
    for idx, row in df_unscored.iterrows():
        futures[executor.submit(score_message, idx, row)] = idx

    success_count = 0
    error_count = 0

    # Consume tasks in completion order, with a progress bar
    for future in tqdm(as_completed(futures), total=len(futures), desc="Scoring"):
        result = future.result()
        results_dict[result["idx"]] = result

        if not result["scores"]:
            error_count += 1
        else:
            success_count += 1

        # Report throughput every 500 completed messages.
        # Note: this only prints progress; nothing is written to disk here.
        completed = success_count + error_count
        if completed % 500 != 0:
            continue
        elapsed = time.time() - start_time
        rate = completed / elapsed
        tqdm.write(f"💾 Progress: {success_count:,} success, {error_count:,} errors, {rate:.1f} msg/sec")

# Apply results to dataframe: copy each successful score dict into its row
print("\n📊 Applying results...")
for idx in sorted(results_dict):
    scores = results_dict[idx]["scores"]
    if scores:
        for col, val in scores.items():
            df_unscored.at[idx, col] = val

# Save results (msg_id is only a matching helper, so it is not written out)
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
final_file = OUT_DIR / f"telegram_remaining_scored_{timestamp}.csv"
df_unscored = df_unscored.drop(columns=['msg_id'], errors='ignore')
df_unscored.to_csv(final_file, index=False)

# Combine with existing. The helper msg_id column was also added to df_scored,
# so drop it here too; otherwise it leaks into the combined CSV half-filled.
df_combined = pd.concat(
    [df_scored.drop(columns=['msg_id'], errors='ignore'), df_unscored],
    ignore_index=True,
)
final_complete = OUT_DIR / f"telegram_FINAL_COMPLETE_{timestamp}.csv"
df_combined.to_csv(final_complete, index=False)

# Summary (guarded so an empty run reports 0 instead of raising ZeroDivisionError)
elapsed_total = time.time() - start_time
total_attempted = len(df_unscored)
success_pct = success_count / total_attempted * 100 if total_attempted else 0.0
avg_rate = total_attempted / elapsed_total if elapsed_total > 0 else 0.0
print(f"\n✅ COMPLETE in {elapsed_total/60:.1f} minutes!")
print(f"   Successfully scored: {success_count:,} / {total_attempted:,} ({success_pct:.1f}%)")
print(f"   Errors: {error_count}")
print(f"   Average rate: {avg_rate:.1f} messages/second")
print(f"\n📁 Complete dataset saved to: {final_complete}")