In [9]:
# Step 1: Download and process Delphi-2M data
#
# This cell (a) makes sure the Delphi-2M supplementary CSV is present,
# (b) parses it into per-disease results and pickles them, and
# (c) triggers the Aladynoulli-vs-Delphi comparison by importing delphi_comparison.
import importlib
import pickle
import sys
from pathlib import Path

from download_delphi_data import download_delphi_csv, load_delphi_data, process_delphi_data

# Both artifacts live in the same output directory; derive the paths from it
# so the location only has to be changed in one place.
output_dir = Path("/Users/sarahurbut/aladynoulli2/claudefile/output")
delphi_csv = output_dir / "delphi_supplementary.csv"
delphi_pkl = output_dir / "delphi_results.pkl"

# Download if not already downloaded
# NOTE(review): assumes download_delphi_csv() writes to `delphi_csv` — confirm
# against download_delphi_data; the existence re-check below depends on it.
if not delphi_csv.exists():
    print("Downloading Delphi-2M data...")
    download_delphi_csv()
else:
    print(f"Delphi CSV already exists at: {delphi_csv}")

# Process the data (re-check existence: the download above may have failed)
if delphi_csv.exists():
    df = load_delphi_data(delphi_csv)
    delphi_results = process_delphi_data(df)

    # Save processed results
    with open(delphi_pkl, 'wb') as f:
        pickle.dump(delphi_results, f)
    print(f"\n✓ Processed {len(delphi_results)} diseases from Delphi-2M")
    print(f"✓ Saved to: {delphi_pkl}")
else:
    # NOTE(review): the comparison below still runs in this case, as it did
    # before; it may then use a stale or missing pickle — confirm intent.
    print("✗ Could not download Delphi data")

# Step 2: Run comparison
print("\n" + "="*80)
print("Running comparison...")
print("="*80 + "\n")

# The comparison is triggered purely by importing delphi_comparison (there is no
# explicit call here), so it presumably runs as an import-time side effect.
# Python caches imported modules: on a second execution of this cell in the same
# kernel a plain import is a no-op and nothing would be recomputed. Reload the
# module when it is already loaded so that every run of the cell re-runs it.
if "delphi_comparison" in sys.modules:
    importlib.reload(sys.modules["delphi_comparison"])

# Star import kept so the names later cells may rely on are (re)bound exactly as before.
from delphi_comparison import *  # noqa: F401,F403

Delphi CSV already exists at: /Users/sarahurbut/aladynoulli2/claudefile/output/delphi_supplementary.csv
Loaded 1270 rows from Delphi-2M data

✓ Processed 26 diseases from Delphi-2M
✓ Saved to: /Users/sarahurbut/aladynoulli2/claudefile/output/delphi_results.pkl

Running comparison...



In [None]:
# Display comparison table
#
# Loads the comparison CSV, prints a formatted Aladynoulli-vs-Delphi-2M table,
# and prints summary statistics of the 1-year AUC differences.
import pandas as pd
import numpy as np
from pathlib import Path

# Column names used in the comparison CSV.
ALADYNOULLI_1YR_COL = "AUC_0yr_immediate"
DELPHI_1YR_COL = "Delphi_1yr"
AUC_COLUMNS = ["AUC_0yr_immediate", "AUC_5yr", "AUC_10yr", "Delphi_1yr", "Delphi_5yr", "Delphi_10yr"]
# Order in which columns are printed; any that are absent are skipped.
DISPLAY_ORDER = ["Disease", "AUC_0yr_immediate", "Delphi_1yr", "Diff_1yr", "AUC_5yr", "AUC_10yr"]
MISSING_MARK = "—"


def format_auc(value, signed=False):
    """Format one AUC (or AUC difference) to 3 decimals.

    Args:
        value: numeric value, or a missing value (NaN/None/pd.NA).
        signed: when True, always include the sign (used for differences).

    Returns:
        The formatted string, or an em dash for missing values.
    """
    if pd.isna(value):
        return MISSING_MARK
    return f"{value:+.3f}" if signed else f"{value:.3f}"


def one_year_diff(table):
    """Return the per-row Aladynoulli minus Delphi-2M 1-year AUC difference.

    Returns:
        A Series aligned with `table`, or None when either 1-year column is absent.
    """
    if ALADYNOULLI_1YR_COL in table.columns and DELPHI_1YR_COL in table.columns:
        return table[ALADYNOULLI_1YR_COL] - table[DELPHI_1YR_COL]
    return None


def build_display_table(table):
    """Build the string-formatted copy of the comparison table.

    Args:
        table: the raw comparison DataFrame; it is not modified.

    Returns:
        (formatted, columns): `formatted` is a copy of `table` whose AUC columns
        are rendered as strings, plus a "Diff_1yr" column when both 1-year
        columns exist; `columns` lists the columns to print, in display order,
        restricted to those actually present.
    """
    formatted = table.copy()

    # Format AUC columns to 3 decimal places
    for col in AUC_COLUMNS:
        if col in formatted.columns:
            formatted[col] = formatted[col].apply(format_auc)

    # Differences are computed from the raw numeric table, not the formatted strings
    diff = one_year_diff(table)
    if diff is not None:
        formatted["Diff_1yr"] = diff.apply(lambda d: format_auc(d, signed=True))

    # Every column (including "Disease") is included only if present, so a
    # table lacking one of them degrades gracefully instead of raising KeyError.
    columns = [col for col in DISPLAY_ORDER if col in formatted.columns]
    return formatted, columns


# Load the comparison CSV that was generated
comparison_path = Path("/Users/sarahurbut/aladynoulli2/claudefile/output/delphi_comparison_table.csv")
if comparison_path.exists():
    comparison_df = pd.read_csv(comparison_path)

    # Format for display
    display_df, display_cols = build_display_table(comparison_df)

    print("=" * 100)
    print("ALADYNOULLI vs DELPHI-2M COMPARISON")
    print("=" * 100)
    print("\nNote: Comparing Aladynoulli '0-year washout' (immediate prediction) vs Delphi-2M '1-year gap' (prospective prediction)")
    print("      Positive differences indicate Aladynoulli performs better\n")

    # Surface schema drift instead of silently printing a table without the
    # columns the comparison is about.
    missing_cols = [
        col for col in ("Disease", ALADYNOULLI_1YR_COL, DELPHI_1YR_COL)
        if col not in comparison_df.columns
    ]
    if missing_cols:
        print(f"Warning: expected column(s) missing from comparison table: {', '.join(missing_cols)}\n")

    # Global display options, kept as before since later cells may rely on them.
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', 30)

    print(display_df[display_cols].to_string(index=False))

    # Summary statistics
    print("\n" + "=" * 100)
    print("SUMMARY STATISTICS")
    print("=" * 100)

    raw_diffs = one_year_diff(comparison_df)
    # Only diseases with both AUCs contribute; empty when a column is absent.
    valid_diffs = raw_diffs.dropna() if raw_diffs is not None else pd.Series(dtype=float)
    if len(valid_diffs) > 0:
        print("\n1-Year Comparison (Aladynoulli 0yr immediate - Delphi-2M 1yr gap):")
        print(f"  Diseases with data: {len(valid_diffs)}")
        print(f"  Mean difference: {valid_diffs.mean():.4f}")
        print(f"  Median difference: {valid_diffs.median():.4f}")
        print(f"  Range: [{valid_diffs.min():.4f}, {valid_diffs.max():.4f}]")
        print(f"  Aladynoulli better: {(valid_diffs > 0).sum()} / {len(valid_diffs)}")
        print(f"  Delphi-2M better: {(valid_diffs < 0).sum()} / {len(valid_diffs)}")
        # Report exact ties so the counts above always add up to the total.
        n_tied = int((valid_diffs == 0).sum())
        if n_tied:
            print(f"  Tied: {n_tied} / {len(valid_diffs)}")
    else:
        print("\nNo diseases have both an Aladynoulli 0yr-immediate AUC and a Delphi-2M 1yr AUC; nothing to summarize.")
else:
    print("Comparison table not found. Run the first cell to generate it.")


ALADYNOULLI vs DELPHI-2M COMPARISON

Note: Delphi-2M '1-year gap' AUCs are comparable to Aladynoulli '1-year washout' (prospective predictions)
      Positive differences indicate Aladynoulli performs better

             Disease AUC_1yr_washout Delphi_1yr Diff_1yr AUC_5yr AUC_10yr
          Parkinsons           0.729          —        —   0.733    0.724
               ASCVD           0.701          —        —   0.724    0.691
     Prostate_Cancer           0.707          —        —   0.722    0.682
      Bladder_Cancer           0.689          —        —   0.721    0.690
          Atrial_Fib           0.687          —        —   0.706    0.700
       Heart_Failure           0.710          —        —   0.706    0.698
         Lung_Cancer           0.671          —        —   0.690    0.664
        Osteoporosis           0.670          —        —   0.686    0.666
                 CKD           0.702          —        —   0.685    0.710
         All_Cancers           0.685          —    