In [None]:
# --- repo bootstrap ---------------------------------------------------------
from pathlib import Path
from dotenv import load_dotenv
import os, sys
import subprocess, sys, importlib, os, re
from datetime import datetime
import truthbrush as tb

def repo_root(start: Path) -> Path:
    """Return the closest directory at or above ``start`` that marks the repo root.

    A directory qualifies when it contains a ``.env`` or a ``.git`` entry.
    ``start`` itself is checked first, then each ancestor in turn, up to and
    including the filesystem root.

    Args:
        start: Directory to begin the upward search from (resolved first).

    Returns:
        The first qualifying directory found while walking upward.

    Raises:
        RuntimeError: If no directory on the way up contains a marker.
    """
    resolved = start.resolve()
    # ``parents`` ends with the filesystem root, so a checkout mounted
    # directly at the root (e.g. inside a container) is inspected as well.
    for candidate in (resolved, *resolved.parents):
        if (candidate / ".env").exists() or (candidate / ".git").exists():
            return candidate
    raise RuntimeError("repo root not found")

ROOT = repo_root(Path.cwd())
load_dotenv(ROOT / ".env")             # loads secrets

# Optional helpers live in src/. Guard the append so re-running this cell
# does not keep stacking the same entry onto sys.path.
_SRC_DIR = str(ROOT / "src")
if _SRC_DIR not in sys.path:
    sys.path.append(_SRC_DIR)

DATA_DIR = ROOT / "data"
OUT_DIR  = ROOT / "outputs"
FIG_DIR  = OUT_DIR / "figs"
# parents=True: on a fresh checkout "outputs/" may not exist yet, and a plain
# mkdir(exist_ok=True) would then fail with FileNotFoundError.
FIG_DIR.mkdir(parents=True, exist_ok=True)

print("Repo root:", ROOT)

## First Pass

In [None]:
# FIXED VERSION - Use the EXACT training prompt
import os
from openai import OpenAI
import pandas as pd
from pathlib import Path

# Client used for the first-pass checks. The key is taken from the
# OPENAI_API_KEY environment variable (os.getenv gives None when it is unset).
client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)

# System prompt, kept verbatim from training: do not reword, reflow, or strip
# whitespace inside the literal.
SYSTEM_PROMPT = """You are analyzing Telegram posts about the Russia-Ukraine war. Return FOUR integers (E,B,P,C) with NO other text:

E = Escalation (0-10): 0=humanitarian, 5=major weapons, 10=nuclear rhetoric
B = Blame (-1/0/1): -1=neutral, 0=blames Ukraine/NATO, 1=blames Russia  
P = Propaganda (0-3): 0=factual, 3=extreme/false
C = Call-to-action (0/1): 0=none, 1=urges action

Format: "E,B,P,C" (e.g. "7,1,2,0")"""

# Short alias -> fine-tuned model id for the models exercised in this pass.
FT_MODELS = dict(
    mini="ft:gpt-4o-mini-2024-07-18:politics-ai-research:ukraine-telegram-mini:BfSq29k1",
    nano="ft:gpt-4.1-nano-2025-04-14:politics-ai-research:ukraine-classifier-nano:BfSlvv7Q",
    # disabled in this pass:
    # full="ft:gpt-4.1-2025-04-14:politics-ai-research:ukraine-classifier:BfStxtYw",
)

# def classify_message(model_id, message_text):
#     """Classify using the EXACT training format"""
#     response = client.chat.completions.create(
#         model=model_id,
#         messages=[
#             {"role": "system", "content": SYSTEM_PROMPT},
#             {"role": "user", "content": message_text}
#         ],
#         max_tokens=10,  # Only need 4 numbers + commas
#         temperature=0    # CRITICAL for consistency
#     )
#     return response.choices[0].message.content.strip()

# # Test messages
# test_messages = [
#     "Putin announces new nuclear doctrine changes in response to Western arms shipments",
#     "Red Cross delivers humanitarian aid to civilians in Mariupol",
#     "NATO countries pledge additional military support package worth $5 billion",
#     "BREAKING: Explosions reported across multiple Ukrainian cities, air raid sirens active",
#     "Peace talks scheduled for next week in Geneva"
# ]

# print("🎯 TESTING WITH CORRECT SYSTEM PROMPT\n")
# print("Expected format: E,B,P,C (e.g., '7,1,2,0')")
# print("="*60)

# for model_name, model_id in FT_MODELS.items():
#     print(f"\n📊 {model_name.upper()} Model:")
#     print("-"*50)
    
#     for msg in test_messages:
#         result = classify_message(model_id, msg)
#         print(f"Message: {msg[:60]}...")
#         print(f"Result:  {result}")
        
#         # Validate format
#         if ',' in result and len(result.split(',')) == 4:
#             try:
#                 e, b, p, c = result.split(',')
#                 e, b, p, c = int(e), int(b), int(p), int(c)
#                 if 0 <= e <= 10 and -1 <= b <= 1 and 0 <= p <= 3 and 0 <= c <= 1:
#                     print("✅ Valid format!")
#                 else:
#                     print("⚠️  Values out of range")
#             except:
#                 print("❌ Parse error")
#         else:
#             print("❌ Wrong format - not 4 comma-separated values")
#         print()

## Comprehensive Comparison

In [None]:
import pandas as pd
import numpy as np
from openai import OpenAI
import time
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import cohen_kappa_score
import json

# Setup: client pinned to an explicit organization and project.
# NOTE(review): both ids are hard-coded here while the API key is read from
# the environment; consider sourcing them the same way.
client = OpenAI(
    organization="org-d28nmVmBQpF2eNppsJOqaB9l",
    project="proj_SXBV23aZ3XH51x5y1qwF48jV",
    api_key=os.getenv("OPENAI_API_KEY"),
)

# Project root chosen by directory name: the working directory itself when it
# is the project folder, otherwise its immediate parent.
# NOTE(review): this rebinds the ROOT found by repo_root() in the bootstrap
# cell; confirm the two always agree.
if Path.cwd().name == 'ukraine-final-project':
    ROOT = Path.cwd()
else:
    ROOT = Path.cwd().resolve().parents[0]

# Same prompt text as the first-pass cell; the literal must stay untouched.
SYSTEM_PROMPT = """You are analyzing Telegram posts about the Russia-Ukraine war. Return FOUR integers (E,B,P,C) with NO other text:

E = Escalation (0-10): 0=humanitarian, 5=major weapons, 10=nuclear rhetoric
B = Blame (-1/0/1): -1=neutral, 0=blames Ukraine/NATO, 1=blames Russia  
P = Propaganda (0-3): 0=factual, 3=extreme/false
C = Call-to-action (0/1): 0=none, 1=urges action

Format: "E,B,P,C" (e.g. "7,1,2,0")"""

# Short alias -> fine-tuned model id; this cell compares all three sizes.
FT_MODELS = dict(
    mini="ft:gpt-4o-mini-2024-07-18:politics-ai-research:ukraine-telegram-mini:BfSq29k1",
    nano="ft:gpt-4.1-nano-2025-04-14:politics-ai-research:ukraine-classifier-nano:BfSlvv7Q",
    full="ft:gpt-4.1-2025-04-14:politics-ai-research:ukraine-classifier:BfStxtYw",
)

def classify_message(model_id, message_text, max_retries=3):
    """Classify one message with a fine-tuned model, retrying failed calls.

    Sends ``SYSTEM_PROMPT`` plus the first 1000 characters of ``message_text``
    to ``client.chat.completions.create`` with ``max_tokens=10`` and
    ``temperature=0``.

    Args:
        model_id: Model identifier passed as ``model=``.
        message_text: Text to classify. Anything that is not a ``str``
            (``None``, a NaN from a DataFrame column, ...) cannot be sent and
            yields ``None`` immediately.
        max_retries: Maximum number of attempts.

    Returns:
        The reply text with surrounding whitespace stripped, or ``None`` when
        the input is not a string, the reply carries no content, or every
        attempt raised.
    """
    import logging  # local import: this cell does not import logging itself

    if not isinstance(message_text, str):
        # A non-string would fail identically on every attempt; skip the
        # retries (and their sleeps) instead of burning them.
        return None

    last_error = None
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=model_id,
                messages=[
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": message_text[:1000]},  # truncate long messages
                ],
                max_tokens=10,
                temperature=0,
            )
            content = response.choices[0].message.content
        except Exception as exc:  # best effort: one bad call must not abort a batch
            last_error = exc
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # back off 1s, 2s, 4s, ... between attempts
            continue
        # A reply without content has nothing to parse; report "no result".
        return content.strip() if content is not None else None

    if last_error is not None:
        logging.getLogger(__name__).warning(
            "classify_message: giving up on %s after %d attempt(s): %s",
            model_id, max_retries, last_error,
        )
    return None

def parse_scores(score_str):
    """Parse an ``E,B,P,C`` reply into a dict of four integers.

    Args:
        score_str: Raw model reply such as ``"7,1,2,0"``. Surrounding
            whitespace and a pair of wrapping quote characters (the prompt's
            format example shows the answer in quotes) are tolerated.

    Returns:
        ``{'E': int, 'B': int, 'P': int, 'C': int}``, or ``None`` when the
        input is not a string, does not have exactly four comma-separated
        fields, or a field is not an integer. Values are not range-checked.
    """
    if not isinstance(score_str, str):
        return None

    fields = score_str.strip().strip('"\'').split(',')
    if len(fields) != 4:
        return None

    try:
        values = [int(field) for field in fields]
    except ValueError:
        # Only a malformed number means "unparseable"; anything else
        # (KeyboardInterrupt included) must propagate.
        return None
    return dict(zip(('E', 'B', 'P', 'C'), values))

In [None]:
import logging
import pandas as pd
import numpy as np
from openai import OpenAI
import time
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading
from collections import defaultdict
from datetime import datetime, timedelta

# Raise the httpx logger to WARNING so its INFO records stay out of the output.
logging.getLogger("httpx").setLevel(logging.WARNING)

# Setup: client configured with the API key only.
# NOTE(review): unlike the client in the comparison cell, no organization or
# project is passed here; confirm that is intended.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Project root chosen by directory name: the working directory itself when it
# is the project folder, otherwise its immediate parent.
if Path.cwd().name == 'ukraine-final-project':
    ROOT = Path.cwd()
else:
    ROOT = Path.cwd().resolve().parents[0]

# Prompt sent as the system message by the throughput tests; the literal is
# the same text as in the earlier cells and must stay untouched.
SYSTEM_PROMPT = """You are analyzing Telegram posts about the Russia-Ukraine war. Return FOUR integers (E,B,P,C) with NO other text:

E = Escalation (0-10): 0=humanitarian, 5=major weapons, 10=nuclear rhetoric
B = Blame (-1/0/1): -1=neutral, 0=blames Ukraine/NATO, 1=blames Russia  
P = Propaganda (0-3): 0=factual, 3=extreme/false
C = Call-to-action (0/1): 0=none, 1=urges action

Format: "E,B,P,C" (e.g. "7,1,2,0")"""

# Short alias -> fine-tuned model id for every model that gets rate-tested.
FT_MODELS = dict(
    mini="ft:gpt-4o-mini-2024-07-18:politics-ai-research:ukraine-telegram-mini:BfSq29k1",
    nano="ft:gpt-4.1-nano-2025-04-14:politics-ai-research:ukraine-classifier-nano:BfSlvv7Q",
    full="ft:gpt-4.1-2025-04-14:politics-ai-research:ukraine-classifier:BfStxtYw",
)

In [None]:
print("🔍 DISCOVERING RATE LIMITS FOR EACH MODEL")
print("="*60)

def _retry_delay_seconds(error_msg, default=1.0):
    """Return the wait (seconds) suggested in a rate-limit message, else ``default``."""
    import re  # local import keeps the helper usable on its own

    # Accept both "retry after 20s" and "try again in 1.5s" / "... in 366ms".
    # NOTE(review): the exact wording of the provider's message is an
    # assumption; when nothing matches we fall back to ``default``.
    match = re.search(
        r'(?:retry after|try again in)\s+(\d+(?:\.\d+)?)\s*(ms|s)',
        error_msg,
        flags=re.IGNORECASE,
    )
    if match is None:
        return default
    value = float(match.group(1))
    return value / 1000.0 if match.group(2).lower() == 'ms' else value


def test_rate_limits(model_id, model_name, test_duration=30):
    """Probe a model's sequential request rate for a fixed duration.

    Issues chat-completion calls back to back from the calling thread until
    ``test_duration`` seconds have passed, so the measured rate is bounded by
    the latency of a single call. When an error message mentions "rate limit"
    the loop pauses for the hinted delay (1 s if none is found), never past
    the end of the test window.

    Args:
        model_id: Model identifier passed as ``model=``.
        model_name: Label used in progress output and in the result dict.
        test_duration: How long to keep issuing calls, in seconds.

    Returns:
        dict with keys ``model``, ``successful_calls``, ``total_errors``,
        ``rate_limit_errors``, ``calls_per_minute``, ``max_calls_per_second``
        and ``avg_time_per_call``.
    """
    print(f"\nTesting {model_name}...")

    test_msg = "Test message for rate limit discovery"
    successful_calls = 0
    errors = []       # (kind, timestamp, message)
    call_times = []   # completion time of every successful call
    start_time = time.time()

    with tqdm(total=test_duration, desc=f"{model_name} rate test", unit="sec") as pbar:
        while time.time() - start_time < test_duration:
            try:
                client.chat.completions.create(
                    model=model_id,
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": test_msg}
                    ],
                    max_tokens=10,
                    temperature=0
                )
                successful_calls += 1
                call_times.append(time.time())

            except Exception as e:
                error_msg = str(e)
                if "rate limit" in error_msg.lower():
                    errors.append(("rate_limit", time.time(), error_msg))
                    wait_time = _retry_delay_seconds(error_msg)
                    print(f"\n  ⚠️ Rate limit hit! Wait time: {wait_time:g}s")
                    # Never sleep past the end of the test window.
                    remaining = test_duration - (time.time() - start_time)
                    time.sleep(max(0.0, min(wait_time, remaining)))
                else:
                    errors.append(("other", time.time(), error_msg))

            # Advance the bar to the elapsed time, clamped to its total.
            elapsed = min(time.time() - start_time, test_duration)
            pbar.update(elapsed - pbar.n)

    # Calculate statistics
    end_time = time.time()
    total_time = end_time - start_time

    if call_times and total_time > 0:
        # Scale the call count to a full minute: a 20 s run that completed
        # 10 calls is 30 calls/minute, not 10.
        window = min(60.0, total_time)
        recent_calls = [t for t in call_times if t > end_time - window]
        rpm = len(recent_calls) / window * 60

        if len(call_times) > 1:
            # Mean gap between consecutive successful calls.
            avg_interval = float(np.mean(np.diff(call_times)))
            max_rps = 1 / avg_interval if avg_interval > 0 else float('inf')
        else:
            max_rps = successful_calls / total_time
    else:
        rpm = 0
        max_rps = 0

    rate_limit_errors = sum(1 for kind, _, _ in errors if kind == 'rate_limit')
    results = {
        'model': model_name,
        'successful_calls': successful_calls,
        'total_errors': len(errors),
        'rate_limit_errors': rate_limit_errors,
        'calls_per_minute': rpm,
        'max_calls_per_second': max_rps,
        'avg_time_per_call': total_time / successful_calls if successful_calls > 0 else 0
    }

    print(f"\n  ✅ Results for {model_name}:")
    print(f"     Successful calls: {successful_calls}")
    print(f"     Rate limit errors: {results['rate_limit_errors']}")
    print(f"     Estimated RPM: {results['calls_per_minute']:.0f}")
    print(f"     Max RPS: {results['max_calls_per_second']:.2f}")

    return results

# Test each model
rate_limits = {}
for model_name, model_id in FT_MODELS.items():
    rate_limits[model_name] = test_rate_limits(model_id, model_name, test_duration=20)
    time.sleep(2)  # Brief pause between models

# Summary
print("\n" + "="*60)
print("📊 RATE LIMIT SUMMARY")
print("="*60)
for model_name, limits in rate_limits.items():
    max_rps = limits['max_calls_per_second']
    if max_rps == float('inf'):
        # Degenerate measurement (zero gap between calls): int(inf) would
        # raise OverflowError, so report that no estimate is available.
        safe_workers = "n/a"
    else:
        # 80% of the measured rate, but never fewer than one worker
        # (anything below 1.25 rps would otherwise truncate to 0).
        safe_workers = max(1, int(max_rps * 0.8))

    print(f"\n{model_name.upper()}:")
    print(f"  Max RPM: ~{limits['calls_per_minute']:.0f}")
    print(f"  Max RPS: ~{max_rps:.2f}")
    print(f"  Safe parallel workers: {safe_workers}")

In [None]:
print("\n" + "="*60)
print("🚀 TESTING PARALLEL THROUGHPUT")
print("="*60)

def test_parallel_rate(model_id, model_name, duration=20, parallel_workers=20):
    """Simple parallel test with verbose output"""
    print(f"\nTesting {model_name} with {parallel_workers} parallel workers for {duration} seconds...")
    
    # Tracking variables
    successful_calls = 0
    failed_calls = 0
    call_times = []
    errors = []
    lock = threading.Lock()
    stop_time = time.time() + duration
    
    def worker():
        """Worker function that continuously makes requests"""
        worker_calls = 0
        while time.time() < stop_time:
            try:
                start = time.time()
                response = client.chat.completions.create(
                    model=model_id,
                    messages=[
                        {"role": "system", "content": SYSTEM_PROMPT},
                        {"role": "user", "content": "Test escalation message"}
                    ],
                    max_tokens=10,
                    temperature=0
                )
                
                with lock:
                    nonlocal successful_calls
                    successful_calls += 1
                    call_times.append(time.time())
                    worker_calls += 1
                    
            except Exception as e:
                with lock:
                    nonlocal failed_calls
                    failed_calls += 1
                    if "rate limit" in str(e).lower():
                        errors.append(("rate_limit", str(e)))
                    else:
                        errors.append(("other", str(e)))
        
        return worker_calls
    
    # Progress tracking
    start_time = time.time()
    last_print_time = start_time
    last_print_calls = 0
    
    # Start workers
    with ThreadPoolExecutor(max_workers=parallel_workers) as executor:
        # Submit all workers
        futures = [executor.submit(worker) for _ in range(parallel_workers)]
        
        # Monitor progress every second
        while time.time() < stop_time:
            time.sleep(1)
            
            # Print progress
            elapsed = time.time() - start_time
            current_calls = successful_calls
            calls_this_second = current_calls - last_print_calls
            overall_rps = current_calls / elapsed if elapsed > 0 else 0
            
            print(f"  [{elapsed:>4.0f}s] Calls: {current_calls:>4} | "
                  f"RPS (last sec): {calls_this_second:>3} | "
                  f"RPS (avg): {overall_rps:>5.1f} | "
                  f"Errors: {failed_calls}")
            
            last_print_time = time.time()
            last_print_calls = current_calls
            
            # Check for rate limit errors
            rate_limit_errors = [e for e in errors if e[0] == "rate_limit"]
            if len(rate_limit_errors) > 3:
                print(f"  ⚠️ Multiple rate limit errors detected!")
        
        # Wait for all workers to finish
        worker_results = [f.result() for f in futures]
    
    # Final stats
    total_duration = time.time() - start_time
    final_rps = successful_calls / total_duration if total_duration > 0 else 0
    
    # Calculate sustained RPS (last 10 seconds)
    recent_calls = [t for t in call_times if t > time.time() - 10]
    sustained_rps = len(recent_calls) / min(10, total_duration) if recent_calls else 0
    
    print(f"\n  📊 Final Results for {model_name}:")
    print(f"     Total calls: {successful_calls}")
    print(f"     Failed calls: {failed_calls}")
    print(f"     Average RPS: {final_rps:.2f}")
    print(f"     Sustained RPS (last 10s): {sustained_rps:.2f}")
    print(f"     Rate limit errors: {len([e for e in errors if e[0] == 'rate_limit'])}")
    print(f"     Calls per worker: {[r for r in worker_results]}")
    
    return {
        'model': model_name,
        'workers': parallel_workers,
        'successful_calls': successful_calls,
        'failed_calls': failed_calls,
        'avg_rps': final_rps,
        'sustained_rps': sustained_rps,
        'rate_limit_errors': len([e for e in errors if e[0] == "rate_limit"])
    }

# Run parameters, named so the printed messages cannot drift from the values
# that are actually used.
INITIAL_WORKERS = 20
SCALED_WORKERS = 30
N_MESSAGES = 174_000
# NOTE(review): per-message cost figure carried over unchanged; its basis is
# not shown in this notebook.
COST_PER_MESSAGE_USD = 0.00015

# Test each model: first with INITIAL_WORKERS, then (if no rate limiting was
# seen) again with SCALED_WORKERS, keeping whichever run was faster.
results = {}
for model_name, model_id in FT_MODELS.items():
    results[model_name] = test_parallel_rate(
        model_id, model_name, duration=20, parallel_workers=INITIAL_WORKERS
    )

    if results[model_name]['rate_limit_errors'] == 0:
        print(f"\n  🔄 No rate limits hit, testing with {SCALED_WORKERS} workers...")
        scaled_result = test_parallel_rate(
            model_id, model_name, duration=20, parallel_workers=SCALED_WORKERS
        )

        # Use the better result
        if scaled_result['avg_rps'] > results[model_name]['avg_rps']:
            results[model_name] = scaled_result

    time.sleep(3)  # Pause between models

# Summary
print("\n" + "="*60)
print("📊 PARALLEL PROCESSING SUMMARY")
print("="*60)

for model_name, result in results.items():
    print(f"\n{model_name.upper()}:")
    print(f"  Best config: {result['workers']} workers")
    print(f"  Throughput: {result['avg_rps']:.1f} calls/second")
    if result['avg_rps'] > 0:
        print(f"  Can process 1K messages in: {1000/result['avg_rps']:.0f} seconds")
    else:
        # Every call failed: there is no rate to extrapolate from.
        print("  Can process 1K messages in: n/a (no successful calls)")

# Recommendation: the slowest model and the smallest worker count bound what
# is safe for all models.
min_rps = min((r['avg_rps'] for r in results.values()), default=0)
recommended_workers = min((r['workers'] for r in results.values()), default=0)

print(f"\n🎯 RECOMMENDATION FOR {N_MESSAGES // 1000}K MESSAGES:")
print(f"  Use {recommended_workers} parallel workers (safe for all models)")
print(f"  Expected throughput: {min_rps:.1f} messages/second")
if min_rps > 0:
    print(f"  Time for {N_MESSAGES // 1000}K messages: {N_MESSAGES/min_rps/60:.0f} minutes ({N_MESSAGES/min_rps/3600:.1f} hours)")
else:
    print(f"  Time for {N_MESSAGES // 1000}K messages: n/a (at least one model completed no calls)")
print(f"  Cost estimate: ~${N_MESSAGES * COST_PER_MESSAGE_USD:.2f} for GPT-4o-mini")