## Load

In [1]:
import squigglepy as sq
import numpy as np

from datetime import datetime
from libs import plot_exponential_growth, fmt_worktime
from model_params import doubling_time, end_year
from model_data import model_data

print("Loaded libraries")

apply_starting_task_length_modifiers = True
print_step_outputs = False

HACCA mode: True, new_hacca_mode: True, reliability metric: performance_50p, capability area: General, custom_doubling_time: False, custom_start_task_length: None hrs

## START task length (displayed in sec) ##
{1: 1.0,
 5: 5.33,
 10: 20.88,
 20: 87.82,
 30: 206.93,
 40: 400.63,
 50: 733.48,
 60: 1284.78,
 70: 2276.23,
 80: 4365.92,
 90: 10274.4,
 95: 20868.56,
 99: 78857.02}


## HACCA task length (displayed in hrs) ##
{1: 40,
 5: 51,
 10: 80,
 20: 138,
 30: 206,
 40: 289,
 50: 397,
 60: 543,
 70: 767,
 80: 1140,
 90: 1990,
 95: 3159,
 99: 7459}


## DOUBLING TIME (displayed in days) ##
{1: 83,
 5: 105,
 10: 119,
 20: 139,
 30: 156,
 40: 171,
 50: 187,
 60: 204,
 70: 225,
 80: 252,
 90: 294,
 95: 334,
 99: 421}


## SHIFT (displayed in days) ##
{1: 0,
 5: 30,
 10: 56,
 20: 88,
 30: 112,
 40: 132,
 50: 150,
 60: 168,
 70: 188,
 80: 211,
 90: 243,
 95: 270,
 99: 320}
Loaded libraries


## Longer-term METR Predictions

In [2]:
# Use custom starting task length for HACCA

from model_params import elicitation_boost, inference_compute_adj, reliability_multiplier, task_type_penalty, messy_tasks_penalty, experience_penalty, capability_area

def generate_baseline_task_length(current_best):
    if print_step_outputs:
        print(f"Current best: {current_best}")

    # Start with current best, add elicitation boost
    start_task_length = current_best * elicitation_boost
    if print_step_outputs:
        print(f"After elicitation boost: {start_task_length}")
    
    # add inference compute adjustment
    start_task_length = start_task_length * inference_compute_adj
    if print_step_outputs:
        print(f"After inference compute adjustment: {start_task_length}")

    # add reliability penalty
    start_task_length = start_task_length * reliability_multiplier
    if print_step_outputs:
        print(f"After reliability penalty: {start_task_length}")

    # Add task type penalty
    start_task_length *= task_type_penalty
    if print_step_outputs:
        print(f"After task type penalty: {start_task_length}")

    # Add messy tasks penalty
    start_task_length *= messy_tasks_penalty
    if print_step_outputs:
        print(f"After messy tasks penalty: {start_task_length}")
    
    # Add experience penalty
    start_task_length *= experience_penalty
    if print_step_outputs:
        print(f"After experience penalty: {start_task_length}")

    # Add a minimum value of 1sec
    start_task_length = sq.dist_max(1 / 60 / 60, start_task_length)
    if print_step_outputs:
        print(f"After applying minimum value of 1 second: {start_task_length}")
    
    return start_task_length

In [3]:
def generate_metr_predictions(ref_model, model_data, doubling_time, n_samples=50000):
    """Generate quarterly METR task horizon predictions for a given reference model."""
    
    reliability_metrics = [
        ('performance_50p', '50%'),
        # ('performance_80p', '80%'),
    ]
    
    baseline_launch_date = model_data[ref_model]['launch_date']
    model_label = ref_model.upper()
    
    def end_of_quarter(year, q):
        return datetime(year, [3, 6, 9, 12][q-1], [31, 30, 30, 31][q-1])
    
    quarters = [
        (year, q, end_of_quarter(year, q))
        for year in range(2025, end_year)
        for q in range(4 if year == 2025 else 1, 5)
    ]
    
    tau_samples = sq.sample(doubling_time, n=n_samples)
    all_results = {}
    
    for reliability_metric, reliability_label in reliability_metrics:
        # Get baseline start task length for the reference model
        baseline_task_length = model_data[ref_model][reliability_metric]
        
        print(f"METR Task Horizon Predictions (Reliability: {reliability_label},  Model: {model_label}, Capability Area: {capability_area})")
        print("=" * 70)
        print(f"Baseline: {model_label} at {baseline_task_length:.1f}hr on {baseline_launch_date.strftime('%Y-%m-%d')}")

        # Generating updated baseline task length with modifiers and penalties
        if apply_starting_task_length_modifiers:
            baseline_task_length_dist = generate_baseline_task_length(baseline_task_length)
            baseline_task_length_samples = sq.sample(baseline_task_length_dist, n=n_samples)
            if print_step_outputs:
                print(f"\nBaseline distribution w. penalties: {baseline_task_length_dist}\n\nBaseline samples: {baseline_task_length_samples}")
        
        print("=" * 70)
        print()
        print(f"{'Quarter':<8} {'End Date':<12} {'Mean':<10} {'Median':<10} {'90% CI':<25} {'Days from baseline'}")
        print("-" * 85)
        
        results = []
        for year, q, quarter_date in quarters:
            # Calculate days from baseline and no. of doublings
            days_from_baseline = (quarter_date - baseline_launch_date).days
            if days_from_baseline < 0:
                continue
            
            doublings = days_from_baseline / tau_samples
            if apply_starting_task_length_modifiers:
                task_lengths = baseline_task_length_samples * (2 ** doublings)
            else:
                task_lengths = baseline_task_length * (2 ** doublings)
            
            # Calculate task length mean/median and 90% CI
            mean_val, median_val = np.mean(task_lengths), np.median(task_lengths)
            p5, p95 = np.percentile(task_lengths, 5), np.percentile(task_lengths, 95)
            
            ci_str = f"[{fmt_worktime(p5)} - {fmt_worktime(p95)}]"
            print(f"{year}Q{q:<7} {quarter_date.strftime('%Y-%m-%d'):<12} {fmt_worktime(mean_val):<10} {fmt_worktime(median_val):<10} {ci_str:<25} {days_from_baseline:>4}")
            
            results.append({
                "quarter": f"{year}Q{q}", "date": quarter_date,
                "mean": mean_val, "median": median_val, "p5": p5, "p95": p95,
                "days": days_from_baseline, "task_lengths": task_lengths,
            })
            if q == 4:
                print()
        
        # Display key milestones
        print("\n" + "=" * 70)
        print(f"Key Milestones (Median Estimates) - {model_label} ({reliability_label})")
        print("=" * 70)
        
        milestones = [(2, "2 hours"), (4, "4 hours"), (8, "1 day"), (16, "2 days"),
                      (24, "3 days"), (40, "1 week"), (174, "1 month"), (520, "3 months"), (1040, "6 months"),]
        
        for hours, name in milestones:
            for r in results:
                if r["median"] >= hours:
                    prob = np.mean(r["task_lengths"] >= hours) * 100
                    print(f"{name:<12} → {r['quarter']} ({r['date'].strftime('%B %d, %Y')}, {prob:.0f}% of samples)")
                    break
            else:
                print(f"{name:<12} → Not reached by {end_year}Q4")
        
        # Display the 90% CI for the key milestones in terms of the quarters when they are reached
        print("\n" + "=" * 70)
        print("90% Confidence Intervals for Key Milestones - PUBLIC MODELS")
        print("=" * 70)
        for hours, name in milestones:
            p5_date = None
            p95_date = None
            for r in results:
                if r["p5"] >= hours and p5_date is None:
                    # p5_date = r["date"].strftime("%B %d, %Y")
                    p5_date = r["quarter"]
                if r["p95"] >= hours and p95_date is None:
                    # p95_date = r["date"].strftime("%B %d, %Y")
                    p95_date = r["quarter"]
                if p5_date is not None and p95_date is not None:
                    break
            if p5_date is None:
                p5_date = f"Not reached by {end_year}Q4"
            if p95_date is None:
                p95_date = f"Not reached by {end_year}Q4"
            print(f"{name:<12} → [{p95_date} - {p5_date}]")
        
        all_results[reliability_metric] = results
        print("\n\n")
    
    return all_results

results_gpt5 = generate_metr_predictions('gpt5.1-codex-max', model_data, doubling_time)

METR Task Horizon Predictions (Reliability: 50%,  Model: GPT5.1-CODEX-MAX, Capability Area: General)
Baseline: GPT5.1-CODEX-MAX at 2.9hr on 2025-11-19

Quarter  End Date     Mean       Median     90% CI                    Days from baseline
-------------------------------------------------------------------------------------
2025Q4       2025-12-31   1.2hr      8min       [0min - 4.0hr]              42

2026Q1       2026-03-31   1.8hr      12min      [0min - 5.8hr]             132
2026Q2       2026-06-30   2.7hr      17min      [0min - 1.1d]              223
2026Q3       2026-09-30   4.3hr      25min      [0min - 1.6d]              315
2026Q4       2026-12-31   7.0hr      35min      [0min - 2.5d]              407

2027Q1       2027-03-31   1.4d       50min      [0min - 3.8d]              497
2027Q2       2027-06-30   2.5d       1.2hr      [0min - 1.2wk]             588
2027Q3       2027-09-30   4.5d       1.7hr      [0min - 1.9wk]             680
2027Q4       2027-12-31   1.7wk      2.

In [4]:
results_claude_4p5_opus = generate_metr_predictions('claude_4p5_opus', model_data, doubling_time)

METR Task Horizon Predictions (Reliability: 50%,  Model: CLAUDE_4P5_OPUS, Capability Area: General)
Baseline: CLAUDE_4P5_OPUS at 4.8hr on 2025-11-24

Quarter  End Date     Mean       Median     90% CI                    Days from baseline
-------------------------------------------------------------------------------------
2025Q4       2025-12-31   2.0hr      14min      [0min - 6.8hr]              37

2026Q1       2026-03-31   2.9hr      20min      [0min - 1.2d]              127
2026Q2       2026-06-30   4.3hr      28min      [0min - 1.8d]              218
2026Q3       2026-09-30   6.5hr      40min      [0min - 2.7d]              310
2026Q4       2026-12-31   1.3d       58min      [0min - 4.1d]              402

2027Q1       2027-03-31   2.1d       1.4hr      [0min - 1.3wk]             492
2027Q2       2027-06-30   3.5d       1.9hr      [0min - 2.0wk]             583
2027Q3       2027-09-30   1.2wk      2.8hr      [1min - 3.1wk]             675
2027Q4       2027-12-31   2.4wk      3.9h

In [4]:
plot_exponential_growth(
    doubling_time_days=doubling_time,
    start_hours=baseline_task_length,
    start_date=baseline_launch_date,
    agi_task_length=100_000,
    shift=0,
    n_quarters=24,
    n_samples=100_000,
    n_traces=200,
    max_task_power=10,
)

NameError: name 'baseline_task_length' is not defined

## Shorter-term predictions

#### Predict Claude 4 Opus

In [5]:
def predict(from_model, for_model, debug=False, verbose=True):
    if isinstance(from_model, str):
        from_model = model_data[from_model]
    if isinstance(for_model, str):
        for_model = model_data[for_model]

    if verbose:
        print(f"PREDICT {from_model['name']} -> {for_model['name']}")
    days_since = (for_model['launch_date'] - from_model['launch_date']).days

    def extrapolation():
        individual_model_idiosyncratic_variation = sq.norm(0.85, 1/0.85, lclip=0.1)
        doublings = days_since / doubling_time
        return from_model['performance_50p'] * (2**doublings) * individual_model_idiosyncratic_variation

    if debug:
        return {'from_model': from_model,
                'for_model': for_model,
                'days_since': days_since,
                'doubling_time': doubling_time,
                'doublings': days_since / doubling_time,
                'from_perf': from_model['performance_50p']}
    else:
        samples = sq.sample(extrapolation, n=50_000, verbose=True)
        mean_ci = sq.get_mean_and_ci(samples, credibility=80)
        if verbose:
            print(f"Predicted: {fmt_worktime(mean_ci['mean'])} (80%CI: {fmt_worktime(mean_ci['ci_low'])} - {fmt_worktime(mean_ci['ci_high'])})")
    
        if for_model.get('performance_50p') and verbose:
            print(f"Actual observed: {for_model['name']} at {for_model['performance_50p']:.1f}hr")
        return samples

_ = predict(from_model='o3', for_model='claude_4_opus')

PREDICT o3 -> Claude 4 Opus


100%|██████████| 50000/50000 [00:03<00:00, 12889.29it/s]
100%|██████████| 50000/50000 [00:08<00:00, 5998.84it/s]

Predicted: 1.8hr (80%CI: 1.6hr - 2.1hr)
Actual observed: Claude 4 Opus at 1.4hr





In [6]:
_ = predict(from_model='o3', for_model='gpt5')

PREDICT o3 -> GPT5


100%|██████████| 50000/50000 [00:03<00:00, 13048.46it/s]
100%|██████████| 50000/50000 [00:08<00:00, 5948.49it/s]

Predicted: 2.5hr (80%CI: 2.0hr - 3.1hr)
Actual observed: GPT5 at 2.3hr





In [7]:
_ = predict(from_model='o3', for_model={'name': 'TODAY', 'launch_date': datetime.now()})

PREDICT o3 -> TODAY


100%|██████████| 50000/50000 [00:03<00:00, 13286.47it/s]
100%|██████████| 50000/50000 [00:15<00:00, 3269.96it/s]


Predicted: 5.1hr (80%CI: 3.0hr - 7.9hr)


In [8]:
_ = predict(from_model='gpt5', for_model={'name': 'TODAY', 'launch_date': datetime.now()})

PREDICT GPT5 -> TODAY


100%|██████████| 50000/50000 [00:13<00:00, 3657.50it/s]
100%|██████████| 50000/50000 [00:28<00:00, 1743.11it/s]

Predicted: 4.5hr (80%CI: 3.3hr - 6.0hr)





In [9]:
_ = predict(from_model='o3', for_model='claude_4p5_sonnet')

PREDICT o3 -> Claude 4.5 Sonnet


100%|██████████| 50000/50000 [00:13<00:00, 3742.79it/s]
100%|██████████| 50000/50000 [00:28<00:00, 1760.99it/s]

Predicted: 3.2hr (80%CI: 2.3hr - 4.3hr)
Actual observed: Claude 4.5 Sonnet at 2.0hr





In [10]:
c4o_samples = predict(from_model='claude_4_opus', for_model='claude_4p5_sonnet')

PREDICT Claude 4 Opus -> Claude 4.5 Sonnet


100%|██████████| 50000/50000 [00:13<00:00, 3695.34it/s]
100%|██████████| 50000/50000 [00:27<00:00, 1820.65it/s]

Predicted: 2.5hr (80%CI: 1.9hr - 3.2hr)
Actual observed: Claude 4.5 Sonnet at 2.0hr





In [11]:
c4o_samples = predict(from_model='claude_4p1_opus', for_model='claude_4p5_sonnet')

PREDICT Claude 4.1 Opus -> Claude 4.5 Sonnet


100%|██████████| 50000/50000 [00:13<00:00, 3732.40it/s]
100%|██████████| 50000/50000 [00:16<00:00, 3047.22it/s]

Predicted: 2.4hr (80%CI: 2.0hr - 2.8hr)
Actual observed: Claude 4.5 Sonnet at 2.0hr





In [12]:
gpt5_samples = predict(from_model='gpt5', for_model='claude_4p5_sonnet')

PREDICT GPT5 -> Claude 4.5 Sonnet


100%|██████████| 50000/50000 [00:13<00:00, 3755.29it/s]
100%|██████████| 50000/50000 [00:28<00:00, 1739.72it/s]

Predicted: 2.9hr (80%CI: 2.4hr - 3.3hr)
Actual observed: Claude 4.5 Sonnet at 2.0hr





In [13]:
mixed_samples = np.concatenate([c4o_samples, gpt5_samples])
mean_ci = sq.get_mean_and_ci(mixed_samples, credibility=80)
print(f"Predicted: {fmt_worktime(mean_ci['mean'])} (80%CI: {fmt_worktime(mean_ci['ci_low'])} - {fmt_worktime(mean_ci['ci_high'])})")

Predicted: 2.6hr (80%CI: 2.1hr - 3.2hr)


In [14]:
bins = [0, 1.5, 2, 2.5, 3, 3.5, 4, 5, 6, 7]
labels = ['<1.5h', '1.5h - 2h', '2h - 2.5h', '2.5h - 3h', '3h - 3.5h', 
          '3.5h - 4h', '4h - 5h', '5h - 6h', '6h - 7h']

def multi_predict(from_models, for_model, verbose=True):
    pred_samples = []
    for from_model in from_models:
        pred_samples.append(predict(from_model, for_model, debug=False, verbose=True))
        if verbose:
            print('')
    if verbose:
            print('- MIXED MODEL -')
    pred_samples = np.concatenate(pred_samples)
    mean_ci = sq.get_mean_and_ci(pred_samples, credibility=80)
    
    if verbose:
        print(f"Predicted: {fmt_worktime(mean_ci['mean'])} (80%CI: {fmt_worktime(mean_ci['ci_low'])} - {fmt_worktime(mean_ci['ci_high'])})")

    bins = [0, 1.5, 2, 2.5, 3, 3.5, 4, 5, 6, 7, 99999]
    labels = ['<1.5h', '1.5h - 2h', '2h - 2.5h', '2.5h - 3h', '3h - 3.5h', 
              '3.5h - 4h', '4h - 5h', '5h - 6h', '6h - 7h', '>7h']
    counts, _ = np.histogram(pred_samples, bins=bins)
    percentages = 100 * counts / len(pred_samples)
    if verbose:
        print('')
        for label, pct in zip(labels, percentages):
            print(f"{label} {pct:.1f}%")
            
    if isinstance(for_model, str):
        for_model = model_data[for_model]
    if for_model.get('performance_50p') and verbose:
        print('')
        print(f"Actual observed: {for_model['name']} at {for_model['performance_50p']:.1f}hr")
        
    return pred_samples

_ = multi_predict(from_models=['o3', 'claude_3p7_sonnet', 'claude_3p5_sonnet_new'], for_model='claude_4_opus')

PREDICT o3 -> Claude 4 Opus


  0%|          | 0/50000 [00:00<?, ?it/s]

100%|██████████| 50000/50000 [00:14<00:00, 3542.47it/s]
100%|██████████| 50000/50000 [00:27<00:00, 1825.49it/s]


Predicted: 1.8hr (80%CI: 1.6hr - 2.1hr)
Actual observed: Claude 4 Opus at 1.4hr

PREDICT Claude 3.7 Sonnet -> Claude 4 Opus


100%|██████████| 50000/50000 [00:13<00:00, 3671.67it/s]
100%|██████████| 50000/50000 [00:27<00:00, 1845.03it/s]


Predicted: 1.3hr (80%CI: 1.1hr - 1.6hr)
Actual observed: Claude 4 Opus at 1.4hr

PREDICT Claude 3.5 Sonnet (new) -> Claude 4 Opus


100%|██████████| 50000/50000 [00:09<00:00, 5332.25it/s] 
100%|██████████| 50000/50000 [00:07<00:00, 6356.14it/s]

Predicted: 1.2hr (80%CI: 48min - 1.8hr)
Actual observed: Claude 4 Opus at 1.4hr

- MIXED MODEL -
Predicted: 1.5hr (80%CI: 57min - 2.0hr)

<1.5h 55.1%
1.5h - 2h 36.0%
2h - 2.5h 8.0%
2.5h - 3h 0.5%
3h - 3.5h 0.2%
3.5h - 4h 0.1%
4h - 5h 0.1%
5h - 6h 0.0%
6h - 7h 0.0%
>7h 0.0%

Actual observed: Claude 4 Opus at 1.4hr





#### Predict GPT5

In [15]:
_ = multi_predict(from_models=['o1', 'o3'], for_model='gpt5')

PREDICT o1 -> GPT5


100%|██████████| 50000/50000 [00:03<00:00, 13589.07it/s]
100%|██████████| 50000/50000 [00:08<00:00, 5981.24it/s]


Predicted: 2.0hr (80%CI: 1.2hr - 2.9hr)
Actual observed: GPT5 at 2.3hr

PREDICT o3 -> GPT5


100%|██████████| 50000/50000 [00:03<00:00, 13143.12it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6258.78it/s]

Predicted: 2.5hr (80%CI: 2.0hr - 3.2hr)
Actual observed: GPT5 at 2.3hr

- MIXED MODEL -
Predicted: 2.2hr (80%CI: 1.3hr - 3.1hr)

<1.5h 16.7%
1.5h - 2h 22.3%
2h - 2.5h 30.8%
2.5h - 3h 18.5%
3h - 3.5h 6.8%
3.5h - 4h 2.4%
4h - 5h 1.6%
5h - 6h 0.4%
6h - 7h 0.2%
>7h 0.2%

Actual observed: GPT5 at 2.3hr





#### Predict Claude 4.5 Sonnet

In [16]:
_ = multi_predict(from_models=['gpt5', 'claude_4_opus', 'claude_4p1_opus'], for_model='claude_4p5_sonnet')

PREDICT GPT5 -> Claude 4.5 Sonnet


100%|██████████| 50000/50000 [00:03<00:00, 13939.72it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6410.70it/s]


Predicted: 2.9hr (80%CI: 2.4hr - 3.3hr)
Actual observed: Claude 4.5 Sonnet at 2.0hr

PREDICT Claude 4 Opus -> Claude 4.5 Sonnet


100%|██████████| 50000/50000 [00:03<00:00, 13990.76it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6343.81it/s]


Predicted: 2.5hr (80%CI: 1.9hr - 3.2hr)
Actual observed: Claude 4.5 Sonnet at 2.0hr

PREDICT Claude 4.1 Opus -> Claude 4.5 Sonnet


100%|██████████| 50000/50000 [00:03<00:00, 14043.38it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6498.63it/s]


Predicted: 2.4hr (80%CI: 2.0hr - 2.8hr)
Actual observed: Claude 4.5 Sonnet at 2.0hr

- MIXED MODEL -
Predicted: 2.6hr (80%CI: 2.0hr - 3.2hr)

<1.5h 0.1%
1.5h - 2h 8.4%
2h - 2.5h 38.4%
2.5h - 3h 35.9%
3h - 3.5h 13.6%
3.5h - 4h 2.6%
4h - 5h 0.8%
5h - 6h 0.1%
6h - 7h 0.0%
>7h 0.0%

Actual observed: Claude 4.5 Sonnet at 2.0hr


#### Predict GPT5.1-Codex-Max

In [17]:
_ = multi_predict(from_models=['o1', 'gpt5'], for_model='gpt5.1-codex-max')

PREDICT o1 -> GPT5.1-Codex-Max


100%|██████████| 50000/50000 [00:03<00:00, 14230.67it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6342.07it/s]


Predicted: 3.2hr (80%CI: 1.5hr - 5.3hr)
Actual observed: GPT5.1-Codex-Max at 2.9hr

PREDICT GPT5 -> GPT5.1-Codex-Max


100%|██████████| 50000/50000 [00:03<00:00, 14218.35it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6390.07it/s]

Predicted: 3.5hr (80%CI: 2.8hr - 4.4hr)
Actual observed: GPT5.1-Codex-Max at 2.9hr

- MIXED MODEL -
Predicted: 3.4hr (80%CI: 1.8hr - 4.6hr)

<1.5h 4.1%
1.5h - 2h 10.5%
2h - 2.5h 10.7%
2.5h - 3h 16.0%
3h - 3.5h 22.6%
3.5h - 4h 16.3%
4h - 5h 12.4%
5h - 6h 3.4%
6h - 7h 1.4%
>7h 2.6%

Actual observed: GPT5.1-Codex-Max at 2.9hr





#### Predict Gemini 3

In [18]:
_ = multi_predict(from_models=['gpt5', 'claude_4_opus', 'gemini_2p5_pro'], for_model='gemini_3')

PREDICT GPT5 -> Gemini 3


100%|██████████| 50000/50000 [00:03<00:00, 14345.74it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6459.94it/s]


Predicted: 3.5hr (80%CI: 2.8hr - 4.4hr)

PREDICT Claude 4 Opus -> Gemini 3


100%|██████████| 50000/50000 [00:03<00:00, 14366.80it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6537.89it/s]


Predicted: 3.1hr (80%CI: 2.1hr - 4.2hr)

PREDICT Gemini 2.5 Pro -> Gemini 3


100%|██████████| 50000/50000 [00:03<00:00, 14299.86it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6408.98it/s]

Predicted: 1.3hr (80%CI: 57min - 1.8hr)

- MIXED MODEL -
Predicted: 2.6hr (80%CI: 1.1hr - 4.0hr)

<1.5h 25.1%
1.5h - 2h 7.8%
2h - 2.5h 10.3%
2.5h - 3h 16.0%
3h - 3.5h 18.1%
3.5h - 4h 12.0%
4h - 5h 8.1%
5h - 6h 1.6%
6h - 7h 0.5%
>7h 0.3%





#### Predict Claude 4.5 Opus

In [19]:
_ = multi_predict(from_models=['claude_4_opus', 'claude_4p1_opus', 'claude_4p5_sonnet'], for_model='claude_4p5_opus')

PREDICT Claude 4 Opus -> Claude 4.5 Opus


100%|██████████| 50000/50000 [00:03<00:00, 13831.15it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6399.73it/s]


Predicted: 3.1hr (80%CI: 2.2hr - 4.4hr)
Actual observed: Claude 4.5 Opus at 4.8hr

PREDICT Claude 4.1 Opus -> Claude 4.5 Opus


100%|██████████| 50000/50000 [00:03<00:00, 13260.27it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6500.78it/s]


Predicted: 3.0hr (80%CI: 2.4hr - 3.8hr)
Actual observed: Claude 4.5 Opus at 4.8hr

PREDICT Claude 4.5 Sonnet -> Claude 4.5 Opus


100%|██████████| 50000/50000 [00:03<00:00, 14298.66it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6434.88it/s]

Predicted: 2.6hr (80%CI: 2.2hr - 3.0hr)
Actual observed: Claude 4.5 Opus at 4.8hr

- MIXED MODEL -
Predicted: 2.9hr (80%CI: 2.2hr - 3.8hr)

<1.5h 0.0%
1.5h - 2h 2.5%
2h - 2.5h 26.5%
2.5h - 3h 38.0%
3h - 3.5h 18.5%
3.5h - 4h 7.5%
4h - 5h 5.0%
5h - 6h 1.3%
6h - 7h 0.4%
>7h 0.3%

Actual observed: Claude 4.5 Opus at 4.8hr





#### Predict GPT5.2

In [20]:
_ = multi_predict(from_models=['gpt5', 'gpt5.1-codex-max', 'claude_4p1_opus'], for_model='gpt5.2')

PREDICT GPT5 -> GPT 5.2


100%|██████████| 50000/50000 [00:03<00:00, 14244.32it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6506.25it/s]


Predicted: 3.9hr (80%CI: 3.0hr - 4.9hr)

PREDICT GPT5.1-Codex-Max -> GPT 5.2


100%|██████████| 50000/50000 [00:03<00:00, 14079.52it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6430.57it/s]


Predicted: 3.2hr (80%CI: 2.8hr - 3.6hr)

PREDICT Claude 4.1 Opus -> GPT 5.2


100%|██████████| 50000/50000 [00:03<00:00, 13966.06it/s]
100%|██████████| 50000/50000 [00:07<00:00, 6531.04it/s]

Predicted: 3.3hr (80%CI: 2.5hr - 4.2hr)

- MIXED MODEL -
Predicted: 3.4hr (80%CI: 2.7hr - 4.4hr)

<1.5h 0.0%
1.5h - 2h 0.1%
2h - 2.5h 4.1%
2.5h - 3h 22.7%
3h - 3.5h 36.8%
3.5h - 4h 19.7%
4h - 5h 12.6%
5h - 6h 2.9%
6h - 7h 0.8%
>7h 0.4%





#### Predict GPT-5.2-Codex

In [21]:
_ = multi_predict(from_models=['gpt5', 'gpt5.1-codex-max', 'claude_4p1_opus'], for_model='gpt5.2codex')

PREDICT GPT5 -> GPT 5.2 Codex


100%|██████████| 50000/50000 [00:03<00:00, 12891.75it/s]
100%|██████████| 50000/50000 [00:09<00:00, 5249.68it/s]


Predicted: 4.0hr (80%CI: 3.1hr - 5.2hr)

PREDICT GPT5.1-Codex-Max -> GPT 5.2 Codex


100%|██████████| 50000/50000 [00:12<00:00, 3849.08it/s]
100%|██████████| 50000/50000 [00:26<00:00, 1873.55it/s]


Predicted: 3.3hr (80%CI: 2.8hr - 3.7hr)

PREDICT Claude 4.1 Opus -> GPT 5.2 Codex


100%|██████████| 50000/50000 [00:12<00:00, 3935.23it/s]
100%|██████████| 50000/50000 [00:26<00:00, 1895.47it/s]

Predicted: 3.3hr (80%CI: 2.5hr - 4.3hr)

- MIXED MODEL -
Predicted: 3.5hr (80%CI: 2.8hr - 4.5hr)

<1.5h 0.0%
1.5h - 2h 0.1%
2h - 2.5h 3.2%
2.5h - 3h 19.1%
3h - 3.5h 35.4%
3.5h - 4h 22.2%
4h - 5h 14.6%
5h - 6h 3.7%
6h - 7h 1.1%
>7h 0.6%





#### Predict today

In [22]:
today = {'name': 'TODAY', 'launch_date': datetime.now()}
_ = multi_predict(from_models=['gpt5', 'gpt5.1-codex-max', 'claude_4p1_opus'], for_model=today)

PREDICT GPT5 -> TODAY


100%|██████████| 50000/50000 [00:06<00:00, 7682.97it/s] 
100%|██████████| 50000/50000 [00:08<00:00, 5878.27it/s]


Predicted: 4.5hr (80%CI: 3.3hr - 6.0hr)

PREDICT GPT5.1-Codex-Max -> TODAY


100%|██████████| 50000/50000 [00:03<00:00, 13346.98it/s]
100%|██████████| 50000/50000 [00:08<00:00, 6213.91it/s]


Predicted: 3.7hr (80%CI: 3.1hr - 4.3hr)

PREDICT Claude 4.1 Opus -> TODAY


100%|██████████| 50000/50000 [00:03<00:00, 13242.41it/s]
100%|██████████| 50000/50000 [00:13<00:00, 3575.69it/s]

Predicted: 3.8hr (80%CI: 2.7hr - 5.0hr)

- MIXED MODEL -
Predicted: 4.0hr (80%CI: 3.0hr - 5.2hr)

<1.5h 0.0%
1.5h - 2h 0.0%
2h - 2.5h 1.3%
2.5h - 3h 9.2%
3h - 3.5h 24.4%
3.5h - 4h 27.7%
4h - 5h 25.0%
5h - 6h 7.6%
6h - 7h 2.7%
>7h 2.0%



