In [None]:
# --- repo bootstrap ---------------------------------------------------------
from pathlib import Path
from dotenv import load_dotenv
import os, sys

def repo_root(start: Path) -> Path:
    """Locate the repository root by walking upward from ``start``.

    A directory counts as the root when it contains a ``.env`` or a ``.git``
    entry.

    Args:
        start: Directory to begin the search from; it is resolved first.

    Returns:
        ``start`` itself or its nearest ancestor that holds one of the markers.

    Raises:
        RuntimeError: If no directory up to and including the filesystem root
            contains a marker.
    """
    cur = start.resolve()
    # Walk ``cur`` and every ancestor. Unlike a ``while cur != cur.parent``
    # loop, this also inspects the filesystem root itself.
    for candidate in (cur, *cur.parents):
        if (candidate / ".env").exists() or (candidate / ".git").exists():
            return candidate
    raise RuntimeError("repo root not found")

ROOT = repo_root(Path.cwd())
load_dotenv(ROOT / ".env")             # loads secrets

# Make optional helpers under src/ importable. Skip the append when the entry
# is already present so re-running this cell does not keep growing sys.path.
_src_dir = str(ROOT / "src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

DATA_DIR = ROOT / "data"
OUT_DIR  = ROOT / "outputs"
FIG_DIR  = OUT_DIR / "figs"
# parents=True: a plain mkdir(exist_ok=True) raises FileNotFoundError when
# outputs/ itself has not been created yet.
FIG_DIR.mkdir(parents=True, exist_ok=True)

print("Repo root:", ROOT)

In [None]:
# ╔══════════════════════════════════════════════════════════════════════╗
# ║  STEP 1 – Load Claude-labelled data                                   ║
# ╚══════════════════════════════════════════════════════════════════════╝
import pandas as pd, numpy as np, os, json, datetime as dt
from pathlib import Path

# Project root: the working directory when it is the repo folder itself,
# otherwise the parent of the resolved working directory.
ROOT = Path.cwd() if Path.cwd().name == 'ukraine-final-project' else Path.cwd().resolve().parents[0]
CLAUDE_CSV = ROOT.joinpath("outputs", "telegram_scoring", "telegram_scored_COMBINED_20250606_124931.csv")

df = pd.read_csv(CLAUDE_CSV)
assert {"escalation_score", "blame_direction", "propaganda_level", "has_cta"} <= set(df.columns), \
       "CSV missing one of the required score columns."

# Use the first date-like column that exists (unparseable values become NaT);
# when none exists, synthesise one from the row index, one day per row
# counted from 2022-01-01.
DATE_COL = next((name for name in ("created_at", "date", "timestamp") if name in df.columns), None)
if DATE_COL is not None:
    df[DATE_COL] = pd.to_datetime(df[DATE_COL], errors="coerce")
else:
    DATE_COL = "row_idx_as_date"
    df[DATE_COL] = pd.to_datetime(df.index, unit="D", origin="2022-01-01")

# ─── parameters ────────────────────────────────────────────────────────
SAMPLE_PER_SCORE = 20        # 11×20 = 220 total (adjust as desired)
RANDOM_SEED      = 42

np.random.seed(RANDOM_SEED)

# ╔══════════════════════════════════════════════════════════════════════╗
# ║  STEP 2 – Stratified sampling across escalation + date               ║
# ╚══════════════════════════════════════════════════════════════════════╝
sampled_frames = []

for esc in range(11):                     # 0 … 10 inclusive
    sub = df[df.escalation_score == esc].copy()
    if sub.empty:
        continue

    # Split the subset into up to 4 date quartiles and take roughly equal
    # chunks from each. With fewer than two distinct dates no quartile edges
    # can be formed, so skip the binning and let the random fill below do all
    # the picking.
    if sub[DATE_COL].nunique(dropna=True) > 1:
        sub["q"] = pd.qcut(sub[DATE_COL], 4, labels=False, duplicates="drop")
    else:
        sub["q"] = np.nan
    n_per_quartile = max(1, SAMPLE_PER_SCORE // 4)

    picks = [
        g.sample(min(n_per_quartile, len(g)), random_state=RANDOM_SEED)
        for _, g in sub.groupby("q")
    ]
    # groupby drops NaN keys, so `picks` can be empty; pd.concat([]) raises.
    picked = pd.concat(picks) if picks else sub.iloc[0:0]

    # If we still need more to hit SAMPLE_PER_SCORE, fill randomly from the
    # rows not picked yet (this includes rows whose date could not be binned).
    if len(picked) < SAMPLE_PER_SCORE:
        remaining = sub.drop(picked.index)
        if not remaining.empty:
            picked = pd.concat([
                picked,
                remaining.sample(min(SAMPLE_PER_SCORE - len(picked), len(remaining)),
                                 random_state=RANDOM_SEED)
            ])
    sampled_frames.append(picked)

if not sampled_frames:
    raise ValueError("No rows with an escalation_score between 0 and 10 were found; nothing to sample.")

sampled = pd.concat(sampled_frames).sample(frac=1, random_state=RANDOM_SEED)  # shuffle

print(f"Sampled {len(sampled):,} rows across all escalation levels.")

# ╔══════════════════════════════════════════════════════════════════════╗
# ║  STEP 3 – Add empty “human-verify” columns                           ║
# ╚══════════════════════════════════════════════════════════════════════╝
# Blank columns for the human reviewer, one per scored dimension.
sampled = sampled.assign(
    **{name: "" for name in ("E_verified", "B_verified", "P_verified", "C_verified")}
)

# Trim to the text, the four model scores, the channel, the date column and
# every *_verified column.
cols_to_keep = [
    "message_text",
    "escalation_score", "blame_direction", "propaganda_level", "has_cta",
    "channel_username", DATE_COL,
    *[c for c in sampled.columns if c.endswith("_verified")],
]
sampled = sampled.loc[:, cols_to_keep]

# ╔══════════════════════════════════════════════════════════════════════╗
# ║  STEP 4 – Save for Google Sheets / Streamlit review                  ║
# ╚══════════════════════════════════════════════════════════════════════╝
# Write the review sample under outputs/labeling with a timestamped name so
# earlier samples are never overwritten.
OUT_DIR = ROOT.joinpath("outputs", "labeling")
OUT_DIR.mkdir(exist_ok=True)
_stamp = dt.datetime.now().strftime("%Y%m%d_%H%M%S")
out_path = OUT_DIR / ("telegram_sample_for_review_" + _stamp + ".csv")
sampled.to_csv(out_path, index=False)
print("✅ Saved sample to", out_path)


In [None]:
import os, pandas as pd, json, time
from openai import OpenAI
from anthropic import Anthropic
from pathlib import Path
from tqdm.auto import tqdm
import asyncio
import aiohttp
from concurrent.futures import ThreadPoolExecutor, as_completed

# API clients; keys are read from the environment.
openai_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
anthropic_client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))

ROOT = Path.cwd().resolve().parents[0] if Path.cwd().name != 'ukraine-final-project' else Path.cwd()
LABELS = ROOT / "outputs" / "labeling" / "telegram_sample_for_review_20250606_154749.csv"
# The sampling step stamps its output with the current time, so the pinned
# file above only exists for the run that produced it. When it is missing,
# fall back to the newest sample; the %Y%m%d_%H%M%S stamp sorts
# chronologically as text.
if not LABELS.exists():
    _candidates = sorted(LABELS.parent.glob("telegram_sample_for_review_*.csv"))
    if not _candidates:
        raise FileNotFoundError(f"No review sample found in {LABELS.parent}")
    LABELS = _candidates[-1]
    print("Pinned sample not found; using most recent:", LABELS.name)
df = pd.read_csv(LABELS)

# System prompt sent verbatim to both models (see ask_openai / ask_opus). It
# defines the four scores and demands a bare "E,B,P,C" reply.
# NOTE(review): the Blame line writes the neutral value with an en dash (–1),
# not an ASCII hyphen, and the format example is wrapped in backticks. A model
# that echoes either will produce text that int() cannot parse — confirm the
# downstream parser tolerates both.
RUBRIC = """
You are analysing Telegram posts about the Russia-Ukraine war.

Return **FOUR integers** (E,B,P,C) with NO other text:

E Escalation 0-10
      0 humanitarian / diplomacy
      1 peace-talks mention
      2 political / information warfare
      3 routine battlefield update
      4 major aid package
      5 major weapon systems
      6 advanced / long-range systems
      7 nationwide strikes
      8 strategic-infrastructure attacks
      9 nuclear threats
     10 explicit nuclear-war rhetoric

B Blame
 –1 neutral | 0 blames UA/NATO | 1 blames RU

P Propaganda 0 factual · 1 mild spin · 2 clear propaganda · 3 extreme/false

C Call-to-action 0 none · 1 urges action (donate, enlist, protest, etc.)

**Format**: "E,B,P,C" (e.g. `7,0,2,1`). Nothing else.
"""

def ask_openai(text: str) -> str:
    """Score one post with gpt-4o-mini and return the raw reply text.

    The rubric is sent as the system message and ``text`` as the user message,
    at temperature 0 with the reply capped at 10 tokens.

    Args:
        text: The post to score.

    Returns:
        The content of the first choice, stripped of surrounding whitespace.
    """
    conversation = [
        {"role": "system", "content": RUBRIC},
        {"role": "user", "content": text},
    ]
    completion = openai_client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        max_tokens=10,
        temperature=0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()

def ask_opus(text: str) -> str:
    """Score one post with claude-opus-4-20250514 and return the raw reply text.

    The rubric is passed as the system prompt and ``text`` as the only user
    message, at temperature 0 with the reply capped at 10 tokens.

    Args:
        text: The post to score.

    Returns:
        The text of the first content block, stripped of surrounding whitespace.
    """
    user_turn = {"role": "user", "content": text}
    reply = anthropic_client.messages.create(
        model="claude-opus-4-20250514",
        max_tokens=10,
        temperature=0,
        system=RUBRIC,
        messages=[user_turn],
    )
    first_block = reply.content[0]
    return first_block.text.strip()

def process_row(idx, text):
    """Score one row with both models, isolating failures per model.

    Args:
        idx: Row label, echoed back so results can be matched to rows.
        text: The post to score.

    Returns:
        A dict with keys ``idx``, ``openai`` and ``opus``. Each model entry is
        the raw reply, or the string ``"ERROR"`` when that call raised (the
        exception is printed, not re-raised).
    """
    results = {"idx": idx, "openai": None, "opus": None}

    # (result key, label used in the error message, scoring function)
    backends = (
        ("openai", "OpenAI", ask_openai),
        ("opus", "Opus", ask_opus),
    )
    for key, label, ask in backends:
        try:
            results[key] = ask(text)
        except Exception as e:
            print(f"{label} error on row {idx}: {e}")
            results[key] = "ERROR"

    return results

# Process in parallel with thread pool
results_dict = {}

# Rows without text cannot be sliced or scored (NaN is a float), so they are
# skipped here and end up with missing values in both result columns.
_texts = df["message_text"].dropna()

with ThreadPoolExecutor(max_workers=10) as executor:
    # Submit all tasks; each post is truncated to its first 2000 characters.
    futures = {
        executor.submit(process_row, idx, str(text)[:2000]): idx
        for idx, text in _texts.items()
    }

    # Collect results as they finish, with a progress bar.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing"):
        result = future.result()
        results_dict[result["idx"]] = result

# Build each result column in one index-aligned assignment instead of growing
# the frame cell by cell; rows that were never submitted become NaN.
for _model_col in ("openai", "opus"):
    df[_model_col] = pd.Series(
        {idx: res[_model_col] for idx, res in results_dict.items()},
        dtype=object,
    ).reindex(df.index)

# Save results
output_path = ROOT / "outputs" / "labeling" / "validation_results.csv"
df.to_csv(output_path, index=False)
print(f"Results saved to {output_path}")

In [None]:
import pandas as pd
import numpy as np
import time
import json
from pathlib import Path

# Locate the labeling outputs and load the raw replies from both models.
ROOT = Path.cwd() if Path.cwd().name == 'ukraine-final-project' else Path.cwd().resolve().parents[0]
LAB_DIR = ROOT.joinpath("outputs", "labeling")
INFILE = LAB_DIR.joinpath("validation_results.csv")

df = pd.read_csv(INFILE)

# Parse the model outputs (they come as "E,B,P,C" strings)
def parse_scores(score_str):
    """Parse a model reply of the form ``"E,B,P,C"`` into four integers.

    Surrounding whitespace, backticks and quotes are ignored, and typographic
    dashes are treated as a minus sign (the rubric prompt itself writes the
    neutral blame value with an en dash and wraps its example in backticks).

    Args:
        score_str: Raw reply text. Non-string values (e.g. NaN from a CSV
            round-trip) and the sentinel ``"ERROR"`` count as unparseable.

    Returns:
        ``(E, B, P, C)`` as ints, or ``(None, None, None, None)`` when the
        reply is missing or not exactly four integers.
    """
    unparsed = (None, None, None, None)
    if not isinstance(score_str, str) or score_str == "ERROR":
        return unparsed

    cleaned = score_str.strip().strip("`'\"").strip()
    # Normalise minus sign / en dash / em dash to the ASCII hyphen for int().
    for dash in ("\u2212", "\u2013", "\u2014"):
        cleaned = cleaned.replace(dash, "-")

    parts = cleaned.split(',')
    if len(parts) != 4:
        return unparsed
    try:
        return tuple(int(part.strip()) for part in parts)
    except ValueError:
        return unparsed

# Expand each model's raw "E,B,P,C" reply into four per-dimension columns
# (E_openai … C_openai, then E_opus … C_opus).
for _model in ('openai', 'opus'):
    _targets = [f'{_dim}_{_model}' for _dim in ('E', 'B', 'P', 'C')]
    df[_targets] = df[_model].apply(lambda raw: pd.Series(parse_scores(raw)))

# The scores that came with the sample are relabelled as the Sonnet model's
# so all three models share the <dimension>_<model> naming.
if 'escalation_score' in df.columns:
    df = df.rename(columns=dict(zip(
        ('escalation_score', 'blame_direction', 'propaganda_level', 'has_cta'),
        ('E_sonnet', 'B_sonnet', 'P_sonnet', 'C_sonnet'),
    )))

print(f"Total rows: {len(df)}")
for _label, _column in (("OpenAI", 'E_openai'), ("Opus", 'E_opus'), ("Sonnet", 'E_sonnet')):
    # The OpenAI and Opus columns always exist at this point; Sonnet is optional.
    if _column in df.columns:
        print(f"Rows with valid {_label} scores: {df[_column].notna().sum()}")

In [None]:
# Define consensus functions
def majority_vote(values):
    """Return the most frequent non-missing value.

    Missing means ``None`` or NaN. Scores loaded from a DataFrame arrive as
    NaN, so an ``is not None`` filter alone would let them through and count
    them as votes.

    Args:
        values: Iterable of scores, possibly containing missing entries.

    Returns:
        The most common value, or ``None`` if every entry is missing. Ties go
        to the value that appears first in ``values``.
    """
    valid_vals = [v for v in values if not pd.isna(v)]
    if not valid_vals:
        return None
    counts = {}
    for v in valid_vals:
        counts[v] = counts.get(v, 0) + 1
    # dicts keep insertion order and max() returns the first maximal key, so
    # ties resolve deterministically to the earliest-seen value.
    return max(counts, key=counts.get)

def mean_round(values):
    """Return the rounded mean of the non-missing values.

    Missing means ``None`` or NaN. Without filtering NaN the mean would be NaN
    and the ``int()`` conversion would raise.

    Args:
        values: Iterable of numeric scores, possibly with missing entries.

    Returns:
        The mean as an int, or ``None`` if every entry is missing. Rounding is
        done by ``np.round``, which rounds exact halves to the nearest even
        integer (2.5 -> 2).
    """
    valid_vals = [v for v in values if not pd.isna(v)]
    if not valid_vals:
        return None
    return int(np.round(np.mean(valid_vals)))

# Models that contribute to the consensus; Sonnet joins only when its columns exist.
models = ['openai', 'opus'] + (['sonnet'] if 'E_sonnet' in df.columns else [])

# Escalation is an ordered 0-10 scale, so it is averaged; the categorical
# dimensions (blame, propaganda, call-to-action) use a majority vote.
_consensus_rules = {
    'E': mean_round,
    'B': majority_vote,
    'P': majority_vote,
    'C': majority_vote,
}
for _dim, _combine in _consensus_rules.items():
    _score_cols = [f'{_dim}_{m}' for m in models]
    df[f'{_dim}_consensus'] = df[_score_cols].apply(
        lambda row, combine=_combine: combine(row.values), axis=1
    )

# Calculate disagreement metrics
def check_disagreement(row, model_names=None):
    """Report whether the models disagree enough on a row to need review.

    A row is flagged when the escalation scores span more than 2 points, or
    when the models give different values for blame, propaganda or
    call-to-action. Missing scores (``None`` or NaN) are left out of every
    comparison; previously NaN slipped past the ``is not None`` filter and was
    counted as a separate value.

    Args:
        row: Mapping (e.g. a DataFrame row) with ``<dim>_<model>`` entries for
            the dimensions E, B, P and C.
        model_names: Model suffixes to compare. Defaults to the module-level
            ``models`` list.

    Returns:
        ``True`` if the row needs review, else ``False``.
    """
    if model_names is None:
        model_names = models

    def present(dim):
        # Scores for one dimension, with missing values removed.
        scores = (row[f'{dim}_{m}'] for m in model_names)
        return [s for s in scores if not pd.isna(s)]

    # Escalation: difference > 2 points
    e_vals = present('E')
    if len(e_vals) >= 2 and max(e_vals) - min(e_vals) > 2:
        return True

    # Other dimensions: any disagreement (more than one unique value)
    return any(len(set(present(dim))) > 1 for dim in ('B', 'P', 'C'))

df['needs_review'] = df.apply(check_disagreement, axis=1)

# Partition the frame on the flag; a missing flag is treated as "needs review".
_review_mask = df['needs_review'].fillna(True)
auto_accepted = df.loc[~_review_mask].copy()
needs_review = df.loc[_review_mask].copy()

_n_invalid = df['E_consensus'].isna().sum()
print("\n📊 Results:")
print(f"✅ Auto-accepted (high agreement): {len(auto_accepted)} rows")
print(f"⚠️  Needs review (disagreement): {len(needs_review)} rows")
print(f"🚫 Invalid (missing scores): {_n_invalid} rows")

In [None]:
# Analyze where models disagree most
if len(needs_review) > 0:
    print("\n🔍 Disagreement Analysis:")

    # Per-row spread of the escalation scores. Coerce to numeric first so
    # object-dtype columns still work; max/min skip NaN, so a missing score
    # from one model does not poison the range.
    e_disagree = needs_review[[f'E_{m}' for m in models]].apply(pd.to_numeric, errors='coerce')
    e_disagree['range'] = e_disagree.max(axis=1) - e_disagree.min(axis=1)
    print("\nEscalation score ranges:")
    print(e_disagree['range'].value_counts().sort_index())

    # Rows whose escalation scores differ by more than 3 points. This reuses
    # the NaN-safe range instead of recomputing it with an `is not None`
    # filter, which lets NaN through. An all-missing row has a NaN range and
    # is not selected.
    high_disagree = needs_review[e_disagree['range'] > 3]

    if len(high_disagree) > 0:
        print("\n📝 Example of high disagreement (showing first):")
        row = high_disagree.iloc[0]
        # str() so a missing (NaN) message does not break the slice.
        print(f"Text: {str(row['message_text'])[:200]}...")
        score_line = f"Scores: OpenAI={row['E_openai']}, Opus={row['E_opus']}"
        if 'E_sonnet' in df.columns:
            score_line += f", Sonnet={row['E_sonnet']}"
        print(score_line)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import numpy as np

# Set up the plotting style
plt.style.use('seaborn-v0_8-darkgrid')
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
fig.suptitle('Model Agreement Analysis Across All Dimensions', fontsize=16)

# 1. Escalation Score Comparisons (continuous)
models = ['openai', 'opus', 'sonnet'] if 'E_sonnet' in df.columns else ['openai', 'opus']
model_pairs = [(models[i], models[j]) for i in range(len(models)) for j in range(i+1, len(models))]
shown_pairs = model_pairs[:3]  # the top row has 3 panels

for idx, (m1, m2) in enumerate(shown_pairs):
    ax = axes[0, idx]

    # Cast to float so columns holding None/object values become plain numeric
    # arrays with NaN, then keep rows scored by both models.
    x_all = df[f'E_{m1}'].astype(float)
    y_all = df[f'E_{m2}'].astype(float)
    valid_mask = x_all.notna() & y_all.notna()
    x = x_all[valid_mask]
    y = y_all[valid_mask]

    # Scatter plot with jitter so identical integer pairs do not overlap.
    ax.scatter(x + np.random.normal(0, 0.1, len(x)),
               y + np.random.normal(0, 0.1, len(y)),
               alpha=0.5)

    # Diagonal = perfect agreement.
    ax.plot([0, 10], [0, 10], 'r--', alpha=0.5)

    # Pearson r needs at least two points and is undefined for constant input.
    if len(x) >= 2 and x.nunique() > 1 and y.nunique() > 1:
        corr, _ = pearsonr(x, y)
        corr_label = f"{corr:.3f}"
    else:
        corr_label = "n/a"

    ax.set_xlabel(f'{m1.capitalize()} Escalation')
    ax.set_ylabel(f'{m2.capitalize()} Escalation')
    ax.set_title(f'{m1.capitalize()} vs {m2.capitalize()}\n(r = {corr_label})')
    ax.set_xlim(-0.5, 10.5)
    ax.set_ylim(-0.5, 10.5)

# With only two models there is a single pair; hide the unused top-row panels
# instead of leaving empty axes in the figure.
for unused_ax in axes[0, len(shown_pairs):]:
    unused_ax.set_visible(False)

# 2. Agreement Heatmaps for Binary/Categorical
dims = ['B', 'P', 'C']
dim_names = {'B': 'Blame', 'P': 'Propaganda', 'C': 'Call-to-Action'}

for idx, dim in enumerate(dims):
    ax = axes[1, idx]

    # Pairwise % agreement. Start from NaN so a pair with no jointly scored
    # rows is shown as "no data" rather than as 0% agreement; each model
    # agrees with itself 100%.
    agreement_matrix = np.full((len(models), len(models)), np.nan)
    np.fill_diagonal(agreement_matrix, 100)

    for i, m1 in enumerate(models):
        for j in range(i + 1, len(models)):
            m2 = models[j]
            valid_mask = df[f'{dim}_{m1}'].notna() & df[f'{dim}_{m2}'].notna()
            if valid_mask.any():
                agreement = (df.loc[valid_mask, f'{dim}_{m1}'] ==
                             df.loc[valid_mask, f'{dim}_{m2}']).mean() * 100
                # Equality is symmetric, so fill both triangles at once.
                agreement_matrix[i, j] = agreement
                agreement_matrix[j, i] = agreement

    # Plot heatmap
    sns.heatmap(agreement_matrix,
                annot=True,
                fmt='.1f',
                cmap='RdYlGn',
                vmin=0, vmax=100,
                xticklabels=[m.capitalize() for m in models],
                yticklabels=[m.capitalize() for m in models],
                ax=ax,
                cbar_kws={'label': '% Agreement'})

    ax.set_title(f'{dim_names[dim]} Agreement %')

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
import numpy as np
import json
import time
from pathlib import Path
from sklearn.model_selection import train_test_split

# Locate the labeling outputs and reload the raw model replies.
ROOT = Path.cwd() if Path.cwd().name == 'ukraine-final-project' else Path.cwd().resolve().parents[0]
LAB_DIR = ROOT.joinpath("outputs", "labeling")
INFILE = LAB_DIR.joinpath("validation_results.csv")

df = pd.read_csv(INFILE)

# Parse scores if not already done
def parse_scores(score_str):
    """Parse a model reply of the form ``"E,B,P,C"`` into four integers.

    Surrounding whitespace, backticks and quotes are ignored, and typographic
    dashes are treated as a minus sign, since ``int()`` only accepts the ASCII
    hyphen.

    Args:
        score_str: Raw reply text. Non-string values (e.g. NaN from a CSV
            round-trip) and the sentinel ``"ERROR"`` count as unparseable.

    Returns:
        ``(E, B, P, C)`` as ints, or ``(None, None, None, None)`` when the
        reply is missing or not exactly four integers.
    """
    unparsed = (None, None, None, None)
    if not isinstance(score_str, str) or score_str == "ERROR":
        return unparsed

    cleaned = score_str.strip().strip("`'\"").strip()
    # Normalise minus sign / en dash / em dash to the ASCII hyphen for int().
    for dash in ("\u2212", "\u2013", "\u2014"):
        cleaned = cleaned.replace(dash, "-")

    parts = cleaned.split(',')
    if len(parts) != 4:
        return unparsed
    try:
        return tuple(int(part.strip()) for part in parts)
    except ValueError:
        return unparsed

# Parse the Opus replies unless parsed columns already exist and hold data.
if 'E_opus' not in df.columns or df['E_opus'].isna().all():
    df[['E_opus', 'B_opus', 'P_opus', 'C_opus']] = df['opus'].apply(
        lambda x: pd.Series(parse_scores(x))
    )

# Rename sonnet columns if they exist
if 'escalation_score' in df.columns:
    df = df.rename(columns={
        'escalation_score': 'E_sonnet',
        'blame_direction': 'B_sonnet',
        'propaganda_level': 'P_sonnet',
        'has_cta': 'C_sonnet'
    })

print(f"Total rows: {len(df)}")
print(f"Rows with valid Opus scores: {df['E_opus'].notna().sum()}")
# The Sonnet columns only exist when the input carried the original score
# columns; guard the lookup so a file without them does not raise KeyError.
if 'E_sonnet' in df.columns:
    print(f"Rows with valid Sonnet scores: {df['E_sonnet'].notna().sum()}")

In [None]:
# Consensus from Opus and Sonnet only.
# Escalation: rounded mean of whichever of the two scores are present.
df['E_consensus'] = df[['E_opus', 'E_sonnet']].mean(axis=1).round().astype('Int64')

# Categorical dimensions: take the Opus value, and fall back to Sonnet only
# where Opus has no score (Opus wins whenever the two differ).
for dim in ('B', 'P', 'C'):
    primary, fallback = f'{dim}_opus', f'{dim}_sonnet'
    chosen = df.apply(
        lambda r: r[fallback] if pd.isna(r[primary]) else r[primary],
        axis=1
    )
    df[f'{dim}_consensus'] = chosen.astype('Int64')

# Agreement metrics. Note that "perfect" agreement tolerates a 1-point
# difference on escalation.
df['E_diff'] = (df['E_opus'] - df['E_sonnet']).abs()
_same = {d: df[f'{d}_opus'] == df[f'{d}_sonnet'] for d in ('B', 'P', 'C')}
df['perfect_agreement'] = (df['E_diff'] <= 1) & _same['B'] & _same['P'] & _same['C']

# Flag rows for review using relaxed criteria: a large escalation gap, or a
# blame/propaganda mismatch where both models actually gave a score.
# Call-to-action mismatches are deliberately ignored (the models agreed 91%
# of the time).
_both_b = df['B_opus'].notna() & df['B_sonnet'].notna()
_both_p = df['P_opus'].notna() & df['P_sonnet'].notna()
df['needs_review'] = (
    (df['E_diff'] > 3) |
    (~_same['B'] & _both_b) |
    (~_same['P'] & _both_p)
)

# Remove rows with missing consensus values
valid_mask = df[['E_consensus', 'B_consensus', 'P_consensus', 'C_consensus']].notna().all(axis=1)
df_valid = df[valid_mask].copy()

print(f"\n📊 Opus-Sonnet Agreement Analysis:")
n_valid = len(df_valid)
if n_valid == 0:
    # Every percentage below divides by the row count, so bail out early
    # instead of raising ZeroDivisionError.
    print("No rows have a complete set of consensus scores; nothing to summarise.")
else:
    # (label, boolean mask) pairs; each is reported as a count and a share.
    summary = [
        ("Perfect agreement", df_valid['perfect_agreement']),
        ("Escalation within 1 point", df_valid['E_diff'] <= 1),
        ("Needs review", df_valid['needs_review']),
        ("Auto-accepted", ~df_valid['needs_review']),
    ]
    for label, flags in summary:
        hits = flags.sum()
        print(f"{label}: {hits} ({hits/n_valid*100:.1f}%)")

In [None]:
import pandas as pd
import numpy as np
import json
import time
from pathlib import Path
from sklearn.model_selection import train_test_split

# Paths: read the validation results from outputs/labeling and write the
# training files to outputs/fine_tuning (created if absent).
ROOT = Path.cwd() if Path.cwd().name == 'ukraine-final-project' else Path.cwd().resolve().parents[0]
LAB_DIR = ROOT.joinpath("outputs", "labeling")
output_dir = ROOT.joinpath("outputs", "fine_tuning")
output_dir.mkdir(exist_ok=True)

INFILE = LAB_DIR.joinpath("validation_results.csv")
df = pd.read_csv(INFILE)

# Parse scores (if needed)
def parse_scores(score_str):
    """Parse a model reply of the form ``"E,B,P,C"`` into four integers.

    Surrounding whitespace, backticks and quotes are ignored, and typographic
    dashes are treated as a minus sign, since ``int()`` only accepts the ASCII
    hyphen.

    Args:
        score_str: Raw reply text. Non-string values (e.g. NaN from a CSV
            round-trip) and the sentinel ``"ERROR"`` count as unparseable.

    Returns:
        ``(E, B, P, C)`` as ints, or ``(None, None, None, None)`` when the
        reply is missing or not exactly four integers.
    """
    unparsed = (None, None, None, None)
    if not isinstance(score_str, str) or score_str == "ERROR":
        return unparsed

    cleaned = score_str.strip().strip("`'\"").strip()
    # Normalise minus sign / en dash / em dash to the ASCII hyphen for int().
    for dash in ("\u2212", "\u2013", "\u2014"):
        cleaned = cleaned.replace(dash, "-")

    parts = cleaned.split(',')
    if len(parts) != 4:
        return unparsed
    try:
        return tuple(int(part.strip()) for part in parts)
    except ValueError:
        return unparsed

# Parse the Opus replies unless parsed columns already exist and hold data.
_opus_unparsed = 'E_opus' not in df.columns or df['E_opus'].isna().all()
if _opus_unparsed:
    df[['E_opus', 'B_opus', 'P_opus', 'C_opus']] = df['opus'].apply(
        lambda raw: pd.Series(parse_scores(raw))
    )

# Relabel the scores that came with the sample as the Sonnet model's.
if 'escalation_score' in df.columns:
    df = df.rename(columns=dict(zip(
        ('escalation_score', 'blame_direction', 'propaganda_level', 'has_cta'),
        ('E_sonnet', 'B_sonnet', 'P_sonnet', 'C_sonnet'),
    )))

# SIMPLE FILTER: keep only rows where the Opus and Sonnet escalation scores are
# within 1 point. Rows missing either score have a NaN difference and drop out.
df['E_diff'] = (df['E_opus'] - df['E_sonnet']).abs()
high_agreement = df.loc[df['E_diff'] <= 1].copy()

# Consensus labels: rounded Opus/Sonnet mean for escalation, the Opus value
# for the three categorical dimensions.
high_agreement['E_consensus'] = (
    high_agreement[['E_opus', 'E_sonnet']].mean(axis=1).round().astype(int)
)
for _dim in ('B', 'P', 'C'):
    high_agreement[f'{_dim}_consensus'] = high_agreement[f'{_dim}_opus'].astype(int)

# Drop rows lacking a message or any consensus label.
_required = ['message_text', 'E_consensus', 'B_consensus', 'P_consensus', 'C_consensus']
final_df = high_agreement.dropna(subset=_required)

print(f"✅ Using {len(final_df)} high-agreement samples (from {len(df)} total)")
print("\nEscalation distribution:")
print(final_df['E_consensus'].value_counts().sort_index())

# System prompt written into every fine-tuning example (a condensed form of
# the scoring rubric).
SYSTEM_PROMPT = """You are analyzing Telegram posts about the Russia-Ukraine war. Return FOUR integers (E,B,P,C) with NO other text:

E = Escalation (0-10): 0=humanitarian, 5=major weapons, 10=nuclear rhetoric
B = Blame (-1/0/1): -1=neutral, 0=blames Ukraine/NATO, 1=blames Russia  
P = Propaganda (0-3): 0=factual, 3=extreme/false
C = Call-to-action (0/1): 0=none, 1=urges action

Format: "E,B,P,C" (e.g. "7,1,2,0")"""

def create_training_example(row):
    """Build one chat-format fine-tuning record from a labelled row.

    Args:
        row: Mapping with ``message_text`` and the four ``*_consensus`` scores.

    Returns:
        A dict with a single ``messages`` list holding the system prompt, the
        message text as the user turn, and the scores joined as ``"E,B,P,C"``
        as the assistant turn.
    """
    target = ",".join(
        str(int(row[f"{dim}_consensus"])) for dim in ("E", "B", "P", "C")
    )
    system_turn = {"role": "system", "content": SYSTEM_PROMPT}
    user_turn = {"role": "user", "content": str(row['message_text'])}
    assistant_turn = {"role": "assistant", "content": target}
    return {"messages": [system_turn, user_turn, assistant_turn]}

# Split 80/20, stratified on the escalation label. Scores above 7 are pooled
# into one stratum via clip(upper=7).
strata = final_df['E_consensus'].clip(upper=7)
# train_test_split cannot stratify when any stratum has fewer than 2 rows;
# fall back to a plain random split rather than failing.
if strata.value_counts().min() < 2:
    print("⚠️  Some escalation levels have fewer than 2 samples; splitting without stratification.")
    strata = None

train_df, val_df = train_test_split(
    final_df,
    test_size=0.2,
    random_state=42,
    stratify=strata
)

# Write files
timestamp = time.strftime("%Y%m%d_%H%M%S")  # currently unused: the file names below are fixed, so each run overwrites the last
train_file = output_dir / "telegram_train.jsonl"
val_file = output_dir / "telegram_val.jsonl"

# One JSON record per line; ensure_ascii=False keeps non-ASCII text readable.
for path, frame in ((train_file, train_df), (val_file, val_df)):
    with open(path, 'w', encoding='utf-8') as f:
        for _, row in frame.iterrows():
            f.write(json.dumps(create_training_example(row), ensure_ascii=False) + '\n')

print(f"\n🎯 Files ready:")
print(f"Training: {train_file} ({len(train_df)} examples)")
print(f"Validation: {val_file} ({len(val_df)} examples)")
print("\n✅ Upload these to OpenAI and start fine-tuning!")

In [None]:
# FINE-TUNED MODEL EXPLORATION
"""
This notebook explores your 3 fine-tuned models to understand:
1. What input format they expect
2. How they respond
3. Their consistency and accuracy
4. Speed and cost comparisons
"""

import pandas as pd
import numpy as np
from openai import OpenAI
import time
import json
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# API client and project root.
client = OpenAI()
ROOT = Path.cwd() if Path.cwd().name == 'ukraine-final-project' else Path.cwd().resolve().parents[0]

# Registry of the fine-tuned models: API model id, base model, and the
# train / validation losses recorded for each.
FT_MODELS = {
    "mini": {
        "id": "ft:gpt-4o-mini-2024-07-18:politics-ai-research:ukraine-telegram-mini:BfSq29k1",
        "base": "gpt-4o-mini",
        "train_loss": 0.033,
        "valid_loss": 0.149
    },
    "nano": {
        "id": "ft:gpt-4.1-nano-2025-04-14:politics-ai-research:ukraine-classifier-nano:BfSlvv7Q",
        "base": "gpt-4.1-nano",
        "train_loss": 0.139,
        "valid_loss": 0.190
    },
    "full": {
        "id": "ft:gpt-4.1-2025-04-14:politics-ai-research:ukraine-classifier:BfStxtYw",
        "base": "gpt-4.1",
        "train_loss": 1.377,  # High train loss - might be undertrained?
        "valid_loss": 0.029   # But low valid loss - interesting!
    }
}

print("🎯 Fine-Tuned Models Loaded:")
for name in FT_MODELS:
    info = FT_MODELS[name]
    print(f"\n{name.upper()}:")
    print("  Model:", info['id'])
    print("  Base:", info['base'])
    print(f"  Train/Valid Loss: {info['train_loss']:.3f} / {info['valid_loss']:.3f}")

# %%
# Test 1: Basic Functionality - which input format do the models expect?
print("\n" + "=" * 60, "TEST 1: INPUT FORMAT DISCOVERY", "=" * 60, sep="\n")

# Candidate user-message formats.
test_messages = [
    "Russian forces strike Kyiv infrastructure",  # Simple text
    "[Channel: test] Russian forces strike Kyiv",  # With channel
    "M1: Russian forces strike Kyiv",  # With index
    "Score this: Russian forces strike Kyiv",  # With instruction
]

# Candidate system prompts, from none at all to a full instruction.
test_prompts = [
    None,  # No system prompt
    "Score the message",  # Simple instruction
    "Return E,B,P,C scores",  # Specific format
    "You are a war message classifier. Score: E(0-10),B(-1/0/1),P(0-3),C(0/1)"  # Full prompt
]

def test_model_response(model_id, user_msg, system_msg=None):
    """Send one message to a model and return its reply as a string.

    Args:
        model_id: Model identifier passed to the chat completions API.
        user_msg: Content of the user message.
        system_msg: Optional system prompt; omitted from the request when falsy.

    Returns:
        The reply text of the first choice (an empty string when the API
        returns no text content), or ``"Error: <message>"`` if the call
        raised. The function never raises, so callers can compare and print
        results directly.
    """
    messages = []
    if system_msg:
        messages.append({"role": "system", "content": system_msg})
    messages.append({"role": "user", "content": user_msg})

    try:
        response = client.chat.completions.create(
            model=model_id,
            messages=messages,
            max_tokens=50,
            temperature=0
        )
        content = response.choices[0].message.content
    except Exception as e:
        return f"Error: {str(e)}"
    # NOTE(review): the SDK types message.content as optional. Callers run
    # string operations on the result, so normalise a missing value to "".
    return content if content is not None else ""

# Test each model with different formats
for model_name, model_info in FT_MODELS.items():
    print(f"\n\n🔍 Testing {model_name.upper()} Model:")
    print("-" * 50)

    for msg in test_messages[:2]:  # Test first 2 message formats
        for prompt in test_prompts[:2]:  # Test first 2 prompt types
            result = test_model_response(model_info['id'], msg, prompt)
            print(f"\nInput: '{msg}'")
            if prompt:
                print(f"System: '{prompt}'")
            print(f"Output: {result}")

            # Echo the four fields when the reply looks like "E,B,P,C".
            # str() guards against a non-string result; splitting and
            # stripping cannot fail, so no try/except is needed.
            scores = [x.strip() for x in str(result).split(',')]
            if len(scores) == 4:
                print(f"Parsed: E={scores[0]}, B={scores[1]}, P={scores[2]}, C={scores[3]}")

# %%
# Test 2: Consistency Check - Same message, multiple runs
print("\n" + "="*60)
print("TEST 2: CONSISTENCY CHECK")
print("="*60)

test_message = "NATO announces new military aid package for Ukraine worth $2 billion"
runs_per_model = 5

consistency_results = {}

for model_name, model_info in FT_MODELS.items():
    print(f"\n🔄 Testing {model_name.upper()} consistency ({runs_per_model} runs):")

    results = []
    for i in range(runs_per_model):
        response = test_model_response(model_info['id'], test_message)
        results.append(response)
        print(f"  Run {i+1}: {response}")

    # test_model_response reports failures as "Error: ..." strings. Five
    # identical error messages must not be mistaken for a consistent model.
    failed_runs = sum(str(r).startswith("Error:") for r in results)
    unique_results = set(results)
    is_consistent = failed_runs == 0 and len(unique_results) == 1
    consistency_results[model_name] = {
        'results': results,
        'unique': len(unique_results),
        'consistent': is_consistent
    }

    if failed_runs:
        print(f"  Consistency: ❌ {failed_runs}/{runs_per_model} runs failed")
    else:
        print(f"  Consistency: {'✅ PERFECT' if is_consistent else f'⚠️  {len(unique_results)} different outputs'}")

# %%
# Test 3: Speed Comparison
print("\n" + "="*60)
print("TEST 3: SPEED & PERFORMANCE")
print("="*60)

# Load sample messages: the first 10 non-missing texts among the first 100
# rows, coerced to str so the slicing below cannot fail on non-string values.
df = pd.read_csv(ROOT / "outputs" / "telegram_full_20250605_213258.csv", nrows=100)
test_samples = df['message_text'].dropna().astype(str).tolist()[:10]
if not test_samples:
    # Every average below divides by the number of samples.
    raise ValueError("No non-empty message_text values in the first 100 rows; cannot run the speed test.")

speed_results = {}

for model_name, model_info in FT_MODELS.items():
    print(f"\n⚡ Testing {model_name.upper()} speed:")

    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() can jump if the system clock is adjusted.
    start_time = time.perf_counter()
    responses = []

    for msg in test_samples:
        response = test_model_response(model_info['id'], msg[:200])
        responses.append(response)

    elapsed = time.perf_counter() - start_time

    speed_results[model_name] = {
        'total_time': elapsed,
        'avg_time': elapsed / len(test_samples),
        'responses': responses
    }

    print(f"  Total time: {elapsed:.2f}s")
    print(f"  Avg per message: {elapsed/len(test_samples)*1000:.0f}ms")
    print(f"  Throughput: {len(test_samples)/elapsed:.1f} messages/second")

# %%
# Test 4: Batch Processing Capability
print("\n" + "=" * 60, "TEST 4: BATCH PROCESSING TEST", "=" * 60, sep="\n")

# Three ways of packing several messages into one user turn.
batch_formats = [
    # Format 1: Newline separated
    "Message 1: Russia attacks Kyiv\nMessage 2: Peace talks resume\nMessage 3: NATO sends aid",

    # Format 2: Numbered
    "1. Russia attacks Kyiv\n2. Peace talks resume\n3. NATO sends aid",

    # Format 3: Indexed
    "M0: Russia attacks Kyiv\nM1: Peace talks resume\nM2: NATO sends aid",
]

for model_name, model_info in FT_MODELS.items():
    print(f"\n📦 Testing {model_name.upper()} batch capability:")

    for format_no, batch in enumerate(batch_formats, start=1):
        response = test_model_response(model_info['id'], batch)
        print(f"\nFormat {format_no} response: {response}")

        # Heuristic: a multi-score reply contains a separator (pipe or
        # newline) or more than 4 commas.
        has_separator = any(sep in response for sep in ('|', '\n'))
        looks_batched = has_separator or response.count(',') > 4
        verdict = "  ✅ Appears to handle batches!" if looks_batched else "  ❌ Single response only"
        print(verdict)

# %%
# Test 5: Edge Cases
print("\n" + "=" * 60, "TEST 5: EDGE CASES", "=" * 60, sep="\n")

edge_cases = [
    "",  # Empty
    "привет",  # Non-English
    "🚀💥",  # Emojis only
    "a" * 500,  # Very long
    "NATO NATO NATO NATO",  # Repetitive
    "2+2=4",  # Non-war content
]

# Only the first 3 cases are sent to each model.
cases_to_run = edge_cases[:3]

for model_name, model_info in FT_MODELS.items():
    print(f"\n🔧 Testing {model_name.upper()} edge cases:")

    for case in cases_to_run:
        response = test_model_response(model_info['id'], case)
        preview = case[:20]
        print(f"  '{preview}...' → {response}")

# %%
# Visualization of Results: a 2x2 dashboard of losses, speed, consistency and
# a summary table.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
loss_ax, speed_ax, consistency_ax, table_ax = axes.ravel()

models = list(FT_MODELS.keys())

# 1. Model Loss Comparison: grouped bars, train on the left and valid on the right.
x = np.arange(len(models))
width = 0.35
for offset, loss_key, legend_label in (
    (-width/2, 'train_loss', 'Train Loss'),
    (width/2, 'valid_loss', 'Valid Loss'),
):
    loss_ax.bar(x + offset, [FT_MODELS[m][loss_key] for m in models],
                width, label=legend_label, alpha=0.8)
loss_ax.set_xlabel('Model')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training vs Validation Loss')
loss_ax.set_xticks(x)
loss_ax.set_xticklabels(models)
loss_ax.legend()

# 2. Speed Comparison (only when the speed test produced results)
if speed_results:
    avg_ms = [speed_results[m]['avg_time'] * 1000 for m in models]
    speed_ax.bar(models, avg_ms)
    speed_ax.set_xlabel('Model')
    speed_ax.set_ylabel('Avg Response Time (ms)')
    speed_ax.set_title('Response Speed Comparison')

# 3. Consistency Results: green when all runs gave one output, orange otherwise.
if consistency_results:
    unique_counts = [consistency_results[m]['unique'] for m in models]
    bar_colors = ['green' if n == 1 else 'orange' for n in unique_counts]
    consistency_ax.bar(models, unique_counts, color=bar_colors)
    consistency_ax.set_xlabel('Model')
    consistency_ax.set_ylabel('Number of Unique Outputs')
    consistency_ax.set_title('Consistency Test (5 runs, same input)')
    consistency_ax.axhline(y=1, color='green', linestyle='--', alpha=0.5)

# 4. Summary Table
table_ax.axis('tight')
table_ax.axis('off')

summary_data = [
    [
        model.upper(),
        f"{FT_MODELS[model]['base']}",
        f"{FT_MODELS[model]['train_loss']:.3f}",
        f"{FT_MODELS[model]['valid_loss']:.3f}",
        f"{speed_results.get(model, {}).get('avg_time', 0)*1000:.0f}ms" if speed_results else "N/A",
    ]
    for model in models
]

table = table_ax.table(
    cellText=summary_data,
    colLabels=['Model', 'Base', 'Train Loss', 'Valid Loss', 'Avg Speed'],
    cellLoc='center',
    loc='center'
)
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1, 2)

plt.suptitle('Fine-Tuned Model Comparison', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

# %%
# Final Recommendations
print("\n" + "="*60)
print("📊 ANALYSIS & RECOMMENDATIONS")
print("="*60)

# NOTE(review): the fine-tuning examples written by this notebook include a
# system prompt in every record. Confirm with Test 1 whether the models score
# as well without one before relying on the advice printed below.
print("\n1. INPUT FORMAT:")
print("   Your models likely expect just the raw message text")
print("   No system prompt needed (it's baked into the fine-tuning)")

print("\n2. OUTPUT FORMAT:")
print("   Models should return: E,B,P,C (4 comma-separated integers)")

print("\n3. BEST MODEL:")
# Analyze which performed best
if consistency_results:
    consistent_models = [m for m in models if consistency_results[m]['consistent']]
    print(f"   Most consistent: {', '.join(consistent_models) if consistent_models else 'None perfectly consistent'}")

if speed_results:
    fastest = min(models, key=lambda m: speed_results[m]['avg_time'])
    print(f"   Fastest: {fastest.upper()} ({speed_results[fastest]['avg_time']*1000:.0f}ms/msg)")

print(f"\n4. CONCERNING OBSERVATIONS:")
# Report the stored loss rather than a hard-coded copy, so the message stays
# correct if FT_MODELS is updated.
full_train_loss = FT_MODELS['full']['train_loss']
if full_train_loss > 1.0:
    print(f"   - 'full' model has high train loss ({full_train_loss:.3f}) - might need more training")
print("   - Test batch processing capability before full-scale run")
print("   - Consider the base model costs (gpt-4.1 is most expensive)")

print("\n5. NEXT STEPS:")
print("   - Run full scoring with best performing model")
print("   - Use batch processing if supported")
print("   - Monitor for consistency issues")