In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Score tables: one dict per (model, system) pair, keyed by RACE dimension name.
# All values below lie between 0 and 1.
def _race_scores(*, comprehensiveness, insight, instruction_following, readability, overall):
    """Bundle five per-dimension scores into a dict keyed by dimension name."""
    return {
        "Comprehensiveness": comprehensiveness,
        "Insight": insight,
        "Instruction Following": instruction_following,
        "Readability": readability,
        "Overall": overall,
    }


# --- GPT-5 scores ---
gpt5_tickr = _race_scores(
    comprehensiveness=0.9437,
    insight=0.9345,
    instruction_following=0.9940,
    readability=0.9835,
    overall=0.9592,
)
gpt5_web = _race_scores(
    comprehensiveness=0.9310,
    insight=0.9258,
    instruction_following=0.9938,
    readability=0.9845,
    overall=0.9533,
)

# --- Claude Sonnet 4.5 scores ---
sonnet_tickr = _race_scores(
    comprehensiveness=0.7689,
    insight=0.8500,
    instruction_following=0.9217,
    readability=0.8731,
    overall=0.8583,
)
sonnet_web = _race_scores(
    comprehensiveness=0.7428,
    insight=0.8210,
    instruction_following=0.8942,
    readability=0.8505,
    overall=0.8305,
)

# --- Claude Opus 4.1 scores ---
opus_tickr = _race_scores(
    comprehensiveness=0.7845,
    insight=0.8928,
    instruction_following=0.9127,
    readability=0.8722,
    overall=0.8752,
)
opus_web = _race_scores(
    comprehensiveness=0.7323,
    insight=0.8380,
    instruction_following=0.8812,
    readability=0.8630,
    overall=0.8322,
)

In [None]:
# RACE dimensions shown in the chart, in plotting order. "Readability" is
# present in the score dicts but is deliberately left out of the plot.
dims = [
    "Comprehensiveness",
    "Insight",
    "Instruction Following",
    "Overall",
]
# Display names of the models, in the order their bar groups appear on the x-axis.
models = [
    "GPT-5",
    "Claude Sonnet 4.5",
    "Claude Opus 4.1",
]

def err_reduction(web_scores, tickr_scores, dimensions):
    """Return the percent error reduction of Tickr relative to Web per dimension.

    The error for a dimension is ``1 - score``. The reduction is
    ``(web_error - tickr_error) / web_error * 100``: positive when Tickr has
    the smaller error, negative when it has the larger one.

    Args:
        web_scores: Mapping from dimension name to the Web score.
        tickr_scores: Mapping from dimension name to the Tickr score.
        dimensions: Dimension names to evaluate; the result follows this order.

    Returns:
        A list with one percentage per dimension. A dimension whose Web error
        is exactly zero yields ``0.0`` instead of dividing by zero.

    Raises:
        KeyError: If a dimension is missing from either score mapping.
    """
    web_errors = [1 - web_scores[name] for name in dimensions]
    tickr_errors = [1 - tickr_scores[name] for name in dimensions]

    reductions = []
    for base_err, new_err in zip(web_errors, tickr_errors):
        if base_err == 0:
            # No baseline error to reduce; report 0.0 rather than divide by zero.
            reductions.append(0.0)
        else:
            reductions.append((base_err - new_err) / base_err * 100)
    return reductions

# Percent error reduction of Tickr relative to Web for each model; each list
# holds one value per entry of `dims`, in the same order.
gpt5_err   = err_reduction(gpt5_web,   gpt5_tickr,   dims)
sonnet_err = err_reduction(sonnet_web, sonnet_tickr, dims)
opus_err   = err_reduction(opus_web,   opus_tickr,   dims)

# Transposed layout: x-axis groups are models; legend shows RACE dimensions
data = np.array([gpt5_err, sonnet_err, opus_err])  # shape: (models, dims)
x = np.arange(len(models))
n_dims = len(dims)
# The bars of one model share 80% of the unit spacing between groups, so the
# layout stays valid for any number of dimensions (0.2 per bar for four).
width = 0.8 / n_dims

fig, ax = plt.subplots(figsize=(11, 6), dpi=200)
fig.patch.set_alpha(0.0)              # transparent figure background
ax.set_facecolor((1, 1, 1, 0.0))      # transparent axes background

# One shade of red per dimension, opacity rising evenly from 0.30 to 0.90 in
# `dims` order, so the palette always has exactly as many entries as `dims`.
colors = [(1, 0, 0, float(alpha)) for alpha in np.linspace(0.30, 0.90, n_dims)]

bars_by_dim = []
for i, dim in enumerate(dims):
    # Offsets are symmetric around zero so each group is centred on its tick.
    offset = (i - (n_dims - 1) / 2) * width
    bars = ax.bar(x + offset, data[:, i], width, label=dim, color=colors[i])
    bars_by_dim.append(bars)

ax.axhline(0, linewidth=1, color="black")
ax.set_xticks(x)
ax.set_xticklabels(models, fontsize=11)
ax.set_ylabel("Percent (%) Improvement", fontsize=12)
ax.set_title("Error Reduction (Tickr vs Web) Across RACE Dimensions by Model Frontier", pad=25, fontsize=13)
ax.legend(title="RACE Dimension")

# value labels
label_gap = 0.5  # distance between the bar end and its label, in y-axis units
for bars in bars_by_dim:
    for b in bars:
        h = b.get_height()
        # Put the label beyond the end of the bar: above a positive bar, below
        # a negative one, so it never overlaps the bar itself.
        if h >= 0:
            label_y, label_va = h + label_gap, "bottom"
        else:
            label_y, label_va = h - label_gap, "top"
        ax.text(b.get_x() + b.get_width()/2, label_y, f"{h:.1f}%", ha="center", va=label_va, fontsize=8)

In [None]:
# Tighten the layout on the chart's own figure object and render it again.
# NOTE(review): assumes the default inline backend, which draws and closes a
# figure at the end of the cell that created it; pyplot-level calls made from
# a later cell would then act on a new, empty figure instead of the chart.
fig.tight_layout()
fig