## Load

In [1]:
import squigglepy as sq
import numpy as np

from datetime import datetime
from libs import plot_exponential_growth, fmt_worktime
from model_params import doubling_time, reliability_metric, end_year
from model_data import model_data

print("Loaded libraries")

Loaded libraries


## Longer-term METR Predictions

In [2]:
def generate_metr_predictions(ref_model, model_data, doubling_time, n_samples=50000):
    """Generate quarterly METR task horizon predictions for a given reference model."""
    
    reliability_metrics = [
        ('performance_50p', '50%'),
        ('performance_80p', '80%'),
    ]
    
    baseline_launch_date = model_data[ref_model]['launch_date']
    model_label = ref_model.upper()
    
    def end_of_quarter(year, q):
        return datetime(year, [3, 6, 9, 12][q-1], [31, 30, 30, 31][q-1])
    
    quarters = [
        (year, q, end_of_quarter(year, q))
        for year in range(2025, 2031)
        for q in range(4 if year == 2025 else 1, 5)
    ]
    
    tau_samples = sq.sample(doubling_time, n=n_samples)
    all_results = {}
    
    for reliability_metric, reliability_label in reliability_metrics:
        baseline_task_length = model_data[ref_model][reliability_metric]
        
        print(f"METR Task Horizon Predictions ({reliability_label} reliability) - {model_label}")
        print("=" * 70)
        print(f"Baseline: {model_label} at {baseline_task_length:.1f}hr on {baseline_launch_date.strftime('%Y-%m-%d')}")
        print("=" * 70)
        print()
        
        print(f"{'Quarter':<8} {'End Date':<12} {'Mean':<10} {'Median':<10} {'90% CI':<25} {'Days from baseline'}")
        print("-" * 85)
        
        results = []
        for year, q, quarter_date in quarters:
            days_from_baseline = (quarter_date - baseline_launch_date).days
            if days_from_baseline < 0:
                continue
            
            doublings = days_from_baseline / tau_samples
            task_lengths = baseline_task_length * (2 ** doublings)
            
            mean_val, median_val = np.mean(task_lengths), np.median(task_lengths)
            p5, p95 = np.percentile(task_lengths, 5), np.percentile(task_lengths, 95)
            
            ci_str = f"[{fmt_worktime(p5)} - {fmt_worktime(p95)}]"
            print(f"{year}Q{q:<7} {quarter_date.strftime('%Y-%m-%d'):<12} {fmt_worktime(mean_val):<10} {fmt_worktime(median_val):<10} {ci_str:<25} {days_from_baseline:>4}")
            
            results.append({
                "quarter": f"{year}Q{q}", "date": quarter_date,
                "mean": mean_val, "median": median_val, "p5": p5, "p95": p95,
                "days": days_from_baseline, "task_lengths": task_lengths,
            })
            if q == 4:
                print()
        
        # Key milestones
        print("\n" + "=" * 70)
        print(f"Key Milestones (Median Estimates) - {model_label} ({reliability_label})")
        print("=" * 70)
        
        milestones = [(2, "2 hours"), (4, "4 hours"), (8, "1 day"), (16, "2 days"),
                      (24, "3 days"), (40, "1 week"), (174, "1 month")]
        
        for hours, name in milestones:
            for r in results:
                if r["median"] >= hours:
                    prob = np.mean(r["task_lengths"] >= hours) * 100
                    print(f"{name:<12} → {r['quarter']} ({r['date'].strftime('%B %d, %Y')}, {prob:.0f}% of samples)")
                    break
            else:
                print(f"{name:<12} → Not reached by 2030Q4")
        
        all_results[reliability_metric] = results
        print("\n\n")
    
    return all_results

results_gpt5 = generate_metr_predictions('gpt5', model_data, doubling_time)

METR Task Horizon Predictions (50% reliability) - GPT5
Baseline: GPT5 at 2.3hr on 2025-08-07

Quarter  End Date     Mean       Median     90% CI                    Days from baseline
-------------------------------------------------------------------------------------
2025Q4       2025-12-31   4.2hr      3.9hr      [3.1hr - 6.0hr]            146

2026Q1       2026-03-31   6.2hr      5.5hr      [3.8hr - 1.4d]             236
2026Q2       2026-06-30   1.2d       7.7hr      [4.5hr - 2.5d]             327
2026Q3       2026-09-30   1.9d       1.4d       [5.5hr - 4.6d]             419
2026Q4       2026-12-31   3.3d       1.9d       [6.6hr - 1.7wk]            511

2027Q1       2027-03-31   1.2wk      2.6d       [1.0d - 3.0wk]             601
2027Q2       2027-06-30   2.5wk      3.7d       [1.2d - 1.3mo]             692
2027Q3       2027-09-30   1.4mo      1.0wk      [1.5d - 2.3mo]             784
2027Q4       2027-12-31   4.1mo      1.5wk      [1.8d - 4.3mo]             876

2028Q1       2028

In [3]:
results_claude_4p5_opus = generate_metr_predictions('claude_4p5_opus', model_data, doubling_time)

METR Task Horizon Predictions (50% reliability) - CLAUDE_4P5_OPUS
Baseline: CLAUDE_4P5_OPUS at 4.8hr on 2025-11-24

Quarter  End Date     Mean       Median     90% CI                    Days from baseline
-------------------------------------------------------------------------------------
2025Q4       2025-12-31   5.6hr      5.5hr      [5.2hr - 6.1hr]             37

2026Q1       2026-03-31   1.0d       7.7hr      [6.3hr - 1.4d]             127
2026Q2       2026-06-30   1.5d       1.4d       [7.6hr - 2.5d]             218
2026Q3       2026-09-30   2.3d       1.9d       [1.1d - 4.6d]              310
2026Q4       2026-12-31   3.7d       2.7d       [1.4d - 1.7wk]             402

2027Q1       2027-03-31   1.2wk      3.7d       [1.7d - 3.1wk]             492
2027Q2       2027-06-30   2.2wk      1.0wk      [2.0d - 1.3mo]             583
2027Q3       2027-09-30   4.3wk      1.5wk      [2.5d - 2.4mo]             675
2027Q4       2027-12-31   2.3mo      2.1wk      [3.0d - 4.3mo]             

In [4]:
plot_exponential_growth(
    doubling_time_days=doubling_time,
    start_hours=baseline_task_length,
    start_date=baseline_launch_date,
    agi_task_length=100_000,
    shift=0,
    n_quarters=24,
    n_samples=100_000,
    n_traces=200,
    max_task_power=10,
)

NameError: name 'baseline_task_length' is not defined

## Shorter-term predictions

#### Predict Claude 4 Opus

In [None]:
def predict(from_model, for_model, debug=False, verbose=True):
    if isinstance(from_model, str):
        from_model = model_data[from_model]
    if isinstance(for_model, str):
        for_model = model_data[for_model]

    if verbose:
        print(f"PREDICT {from_model['name']} -> {for_model['name']}")
    days_since = (for_model['launch_date'] - from_model['launch_date']).days

    def extrapolation():
        individual_model_idiosyncratic_variation = sq.norm(0.85, 1/0.85, lclip=0.1)
        doublings = days_since / doubling_time
        return from_model['performance_50p'] * (2**doublings) * individual_model_idiosyncratic_variation

    if debug:
        return {'from_model': from_model,
                'for_model': for_model,
                'days_since': days_since,
                'doubling_time': doubling_time,
                'doublings': days_since / doubling_time,
                'from_perf': from_model['performance_50p']}
    else:
        samples = sq.sample(extrapolation, n=50_000, verbose=True)
        mean_ci = sq.get_mean_and_ci(samples, credibility=80)
        if verbose:
            print(f"Predicted: {fmt_worktime(mean_ci['mean'])} (80%CI: {fmt_worktime(mean_ci['ci_low'])} - {fmt_worktime(mean_ci['ci_high'])})")
    
        if for_model.get('performance_50p') and verbose:
            print(f"Actual observed: {for_model['name']} at {for_model['performance_50p']:.1f}hr")
        return samples


def multi_predict(from_models, for_model, verbose=True):
    pred_samples = []
    for from_model in from_models:
        pred_samples.append(predict(from_model, for_model, debug=False, verbose=True))
        if verbose:
            print('')
    if verbose:
            print('- MIXED MODEL -')
    pred_samples = np.concatenate(pred_samples)
    mean_ci = sq.get_mean_and_ci(pred_samples, credibility=80)
    
    if verbose:
        print(f"Predicted: {fmt_worktime(mean_ci['mean'])} (80%CI: {fmt_worktime(mean_ci['ci_low'])} - {fmt_worktime(mean_ci['ci_high'])})")

    bins = [0, 1.5, 2, 2.5, 3, 3.5, 4, 5, 6, 7, 99999]
    labels = ['<1.5h', '1.5h - 2h', '2h - 2.5h', '2.5h - 3h', '3h - 3.5h', 
              '3.5h - 4h', '4h - 5h', '5h - 6h', '6h - 7h', '>7h']
    counts, _ = np.histogram(pred_samples, bins=bins)
    percentages = 100 * counts / len(pred_samples)
    if verbose:
        print('')
        for label, pct in zip(labels, percentages):
            print(f"{label} {pct:.1f}%")
            
    if isinstance(for_model, str):
        for_model = model_data[for_model]
    if for_model.get('performance_50p') and verbose:
        print('')
        print(f"Actual observed: {for_model['name']} at {for_model['performance_50p']:.1f}hr")
        
    return pred_samples

_ = multi_predict(from_models=['o3', 'claude_3p7_sonnet', 'claude_3p5_sonnet_new'], for_model='claude_4_opus')

#### Predict GPT5

In [None]:
_ = multi_predict(from_models=['o1', 'o3'], for_model='gpt5')

#### Predict Claude 4.5 Sonnet

In [None]:
_ = multi_predict(from_models=['gpt5', 'claude_4_opus', 'claude_4p1_opus'], for_model='claude_4p5_sonnet')

#### Predict GPT5.1-Codex-Max

In [None]:
_ = multi_predict(from_models=['o1', 'gpt5'], for_model='gpt5.1-codex-max')

#### Predict Gemini 3

In [None]:
_ = multi_predict(from_models=['gpt5', 'claude_4_opus', 'gemini_2p5_pro'], for_model='gemini_3')

#### Predict Claude 4.5 Opus

In [None]:
_ = multi_predict(from_models=['claude_4_opus', 'claude_4p1_opus', 'claude_4p5_sonnet'], for_model='claude_4p5_opus')

#### Predict GPT5.2

In [None]:
_ = multi_predict(from_models=['gpt5', 'gpt5.1-codex-max', 'claude_4p1_opus'], for_model='gpt5.2')

#### Predict GPT-5.2-Codex

In [None]:
_ = multi_predict(from_models=['gpt5', 'gpt5.1-codex-max', 'claude_4p1_opus'], for_model='gpt5.2codex')

#### Predict today

In [None]:
today = {'name': 'TODAY', 'launch_date': datetime.now()}
_ = multi_predict(from_models=['gpt5', 'gpt5.1-codex-max', 'claude_4p1_opus'], for_model=today)