## Load data from experiment output

In [1]:
import json
import math
from collections import defaultdict

def process_data(input_data):
    """Group per-problem results by token budget and score each answer.

    Args:
        input_data: Iterable of problem dicts. Each has a ``'correct_answer'``
            convertible with ``int()`` and a ``'results'`` mapping from token
            count to a dict holding ``'answer'``, ``'logprob'``,
            ``'mean_logprob'`` and ``'min_logprob'``.

    Returns:
        A plain dict keyed by integer token count in ascending order. Each
        value is a list with one entry per problem containing ``'accurate'``
        (bool) and the confidences ``exp(logprob)`` (stored under both
        ``'confidence'`` and ``'sum_confidence'``), ``exp(mean_logprob)`` and
        ``exp(min_logprob)``.

    Raises:
        ValueError, TypeError: If a problem's ``'correct_answer'`` cannot be
            converted to ``int``.
        KeyError: If a result lacks one of the expected keys.
    """
    token_results = defaultdict(list)

    for problem in input_data:
        correct_answer = int(problem['correct_answer'])

        for n_tokens, result in problem['results'].items():
            # An answer that is missing (None -> TypeError) or not an integer
            # literal (ValueError) counts as wrong instead of aborting the run.
            try:
                accurate = int(result['answer']) == correct_answer
            except (ValueError, TypeError):
                accurate = False

            # Log-probabilities become probabilities in [0, 1].
            confidence = math.exp(result['logprob'])

            token_results[int(n_tokens)].append({
                'accurate': accurate,
                'sum_confidence': confidence,
                'mean_confidence': math.exp(result['mean_logprob']),
                'min_confidence': math.exp(result['min_logprob']),
                'confidence': confidence,
            })

    # Plain dict, ordered by ascending token count.
    return dict(sorted(token_results.items()))

def load_intermediate_probs(filepath):
    """Load a JSON-lines results file, one problem record per line.

    In each record the ``'correct_answer'`` entry is moved out of the nested
    ``'results'`` dict to the top level, so ``'results'`` is left holding only
    the per-token-count entries.

    Args:
        filepath: Path of the ``.jsonl`` file to read.

    Returns:
        A list of record dicts in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``results['correct_answer']``.
    """
    records = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. trailing newlines) are not records.
            if not line.strip():
                continue
            record = json.loads(line)
            record['correct_answer'] = record['results'].pop('correct_answer')
            records.append(record)
    return records


In [2]:
# main paper result
import json
# Records for each model come from two JSONL files, concatenated in the
# order listed here.
inter_prob_dict = {}

_paths_by_model = {
    "r1": [
        "../incremental/intermediate/probs_24_r1.jsonl",
        "../incremental/intermediate/probs_25_r1.jsonl",
    ],
    "s1": [
        "../incremental/intermediate/probs_24_s1.jsonl",
        "../incremental/intermediate/probs_25_s1.jsonl",
    ],
}

for _model_key, filepaths in _paths_by_model.items():
    # Flatten the per-file record lists into one list per model.
    intermediate_probs = [
        record
        for filepath in filepaths
        for record in load_intermediate_probs(filepath)
    ]
    inter_prob_dict[_model_key] = intermediate_probs


In [3]:
# Score each model's records and make sure every token-count key is an int.
simplified = {}
for model in ("r1", "s1"):
    per_budget = process_data(inter_prob_dict[model])
    simplified[model] = {int(budget): rows for budget, rows in per_budget.items()}


In [4]:
def get_chunk(simplified, indices):
    """Keep only the questions at the given positions, for every model.

    Args:
        simplified: Mapping ``model -> {token_count: [entry, ...]}``.
        indices: Iterable of integer positions to keep. Any iterable works,
            including a one-shot generator.

    Returns:
        A new mapping with the same model and token-count keys, where each
        list holds only the entries whose position is in ``indices``, in their
        original order. The input is not modified.
    """
    # Materialise once: membership tests stay O(1) and a one-shot iterator is
    # not used up by the first few lookups.
    wanted = set(indices)
    return {
        model: {
            k: [q for i, q in enumerate(questions) if i in wanted]
            for k, questions in data.items()
        }
        for model, data in simplified.items()
    }


## Results for Report


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import random

def plot_confidence_trajectories(simplified, dataset_key="r1", tracked_indices=None, include_scatter=False, filename=None):
    """Plot answer confidence against compute budget for one model.

    Draws a cubic fit (in log10 of the budget) of confidence for correct and
    for incorrect answers, optionally every raw point, and a dotted
    trajectory for each tracked question, coloured by whether the answer was
    correct at each budget.

    Args:
        simplified: Mapping ``model -> {budget: [entry, ...]}`` where every
            entry has ``'accurate'`` and ``'confidence'`` keys.
        dataset_key: Which model in ``simplified`` to plot.
        tracked_indices: Positions (within each budget's list) of the
            questions drawn as trajectories; ``None`` draws none.
        include_scatter: Also scatter every (budget, confidence) point.
        filename: If given, save the figure to this path; otherwise show it.

    Side effects:
        Overwrites several global ``plt.rcParams`` font sizes.
    """
    plt.rcParams.update({
        'font.size': 28,
        'axes.titlesize': 40,
        'axes.labelsize': 40,
        'legend.fontsize': 30,
        'xtick.labelsize': 30,
        'ytick.labelsize': 30,
    })

    data = simplified[dataset_key]
    xticks = [500, 1000, 2000, 4000, 8000]
    colors = {
        "correct": "indigo",
        "incorrect": "orange"
    }

    def outcome_color(is_correct):
        # Colour used for a correct / incorrect answer.
        return colors['correct'] if is_correct else colors['incorrect']

    def fit_curve(steps, confidences):
        # Cubic fit in log10(budget), evaluated on 500..8000 and clipped to
        # the valid probability range. None when there is nothing to fit.
        if len(steps) == 0:
            return None
        coeffs = np.polyfit(np.log10(steps), confidences, 3)
        xs = np.logspace(np.log10(500), np.log10(8000), 100)
        return xs, np.clip(np.polyval(coeffs, np.log10(xs)), 0, 1)

    tracked_order = list(tracked_indices) if tracked_indices is not None else []
    tracked = set(tracked_order)
    # Each trajectory is a list of (budget, confidence, is_correct) in budget
    # order. Correctness is stored with the point rather than looked up later
    # by matching float values, which is slow and ambiguous when two answers
    # share the same (budget, confidence).
    trajectories = {idx: [] for idx in tracked_order}

    correct_timesteps, correct_confidences = [], []
    incorrect_timesteps, incorrect_confidences = [], []
    for t in sorted(data.keys()):
        for idx, item in enumerate(data[t]):
            if item['accurate']:
                correct_timesteps.append(t)
                correct_confidences.append(item['confidence'])
            else:
                incorrect_timesteps.append(t)
                incorrect_confidences.append(item['confidence'])

            if idx in tracked:
                trajectories[idx].append((t, item['confidence'], bool(item['accurate'])))

    fig, ax1 = plt.subplots(figsize=(18, 10))

    if include_scatter:
        ax1.scatter(correct_timesteps, correct_confidences, color=colors['correct'], alpha=0.2)
        ax1.scatter(incorrect_timesteps, incorrect_confidences, color=colors['incorrect'], alpha=0.1)

    # Fitted curves; a group with no points is skipped instead of crashing.
    for outcome, label, steps, confidences in (
        ("correct", 'Average Confidence (Correct)', correct_timesteps, correct_confidences),
        ("incorrect", 'Average Confidence (Incorrect)', incorrect_timesteps, incorrect_confidences),
    ):
        curve = fit_curve(steps, confidences)
        if curve is not None:
            ax1.plot(curve[0], curve[1], color=colors[outcome], linewidth=4,
                     label=label, linestyle='-')

    # Dotted trajectories. A segment whose endpoints differ in correctness is
    # split at its midpoint and each half takes the colour of its endpoint.
    for points in trajectories.values():
        for (t1, c1, ok1), (t2, c2, ok2) in zip(points, points[1:]):
            if ok1 == ok2:
                ax1.plot([t1, t2], [c1, c2],
                         color=outcome_color(ok1), linestyle=':', linewidth=2)
            else:
                mid_t = np.sqrt(t1 * t2)  # geometric mean = midpoint on a log axis
                mid_c = (c1 + c2) / 2
                ax1.plot([t1, mid_t], [c1, mid_c],
                         color=outcome_color(ok1), linestyle=':', linewidth=2)
                ax1.plot([mid_t, t2], [mid_c, c2],
                         color=outcome_color(ok2), linestyle=':', linewidth=2)

    # Axis cosmetics
    ax1.set_xlabel('Compute Budget', labelpad=15)
    ax1.set_ylabel('Confidence', labelpad=15)
    ax1.grid(True, alpha=0.3)
    ax1.set_xlim(500, 8000)
    ax1.set_xscale('log')

    ax1.set_xticks(xticks)
    ax1.set_xticklabels([str(x) for x in xticks])

    ax1.legend(loc='lower right')

    plt.tight_layout()
    if filename:
        plt.savefig(filename)
    else:
        plt.show()


# Track 10 randomly chosen questions out of the first 60; the same sample is
# used for both models. No seed is set, so the sample differs between runs.
tracked_indices = random.sample(range(0, 60), 10)
chunk = range(0, 60)
for _dataset_key, _pdf_name in (("r1", "confidence_r1.pdf"), ("s1", "confidence_s1.pdf")):
    plot_confidence_trajectories(
        get_chunk(simplified, chunk),
        dataset_key=_dataset_key,
        tracked_indices=tracked_indices,
        include_scatter=True,
        filename=_pdf_name,
    )



In [6]:
import matplotlib.pyplot as plt
import numpy as np
import random

def plot_confidence_trajectories_side_by_side(simplified, tracked_indices=None, include_scatter=False, filename=None):
    """Plot confidence against compute budget for ``r1`` and ``s1`` side by side.

    Each panel shows a cubic fit (in log10 of the budget) of confidence for
    correct and for incorrect answers, optionally every raw point, and a
    dotted trajectory for each tracked question, coloured by whether the
    answer was correct at each budget. The panels share the y axis; only the
    right panel carries the legend.

    Args:
        simplified: Mapping with ``'r1'`` and ``'s1'`` keys, each a
            ``{budget: [entry, ...]}`` dict whose entries have ``'accurate'``
            and ``'confidence'`` keys.
        tracked_indices: Positions (within each budget's list) of the
            questions drawn as trajectories; ``None`` draws none.
        include_scatter: Also scatter every (budget, confidence) point.
        filename: If given, save the figure to this path; otherwise show it.

    Side effects:
        Overwrites several global ``plt.rcParams`` font sizes.
    """
    plt.rcParams.update({
        'font.size': 32,
        'axes.titlesize': 44,
        'axes.labelsize': 44,
        'legend.fontsize': 34,
        'xtick.labelsize': 34,
        'ytick.labelsize': 34,
    })

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(36, 12), sharey=True)

    axes = [ax1, ax2]
    dataset_keys = ["r1", "s1"]
    xticks = [500, 1000, 2000, 4000, 8000]

    colors = {
        "correct": "indigo",
        "incorrect": "orange"
    }

    def outcome_color(is_correct):
        # Colour used for a correct / incorrect answer.
        return colors['correct'] if is_correct else colors['incorrect']

    def fit_curve(steps, confidences):
        # Cubic fit in log10(budget), evaluated on 500..8000 and clipped to
        # the valid probability range. None when there is nothing to fit.
        if len(steps) == 0:
            return None
        coeffs = np.polyfit(np.log10(steps), confidences, 3)
        xs = np.logspace(np.log10(500), np.log10(8000), 100)
        return xs, np.clip(np.polyval(coeffs, np.log10(xs)), 0, 1)

    tracked_order = list(tracked_indices) if tracked_indices is not None else []
    tracked = set(tracked_order)

    for ax_idx, (ax, dataset_key) in enumerate(zip(axes, dataset_keys)):
        data = simplified[dataset_key]

        # Each trajectory is a list of (budget, confidence, is_correct) in
        # budget order. Correctness is stored with the point rather than
        # looked up later by matching float values, which is slow and
        # ambiguous when two answers share the same (budget, confidence).
        trajectories = {idx: [] for idx in tracked_order}

        correct_timesteps, correct_confidences = [], []
        incorrect_timesteps, incorrect_confidences = [], []
        for t in sorted(data.keys()):
            for idx, item in enumerate(data[t]):
                if item['accurate']:
                    correct_timesteps.append(t)
                    correct_confidences.append(item['confidence'])
                else:
                    incorrect_timesteps.append(t)
                    incorrect_confidences.append(item['confidence'])

                if idx in tracked:
                    trajectories[idx].append((t, item['confidence'], bool(item['accurate'])))

        if include_scatter:
            ax.scatter(correct_timesteps, correct_confidences, color=colors['correct'], alpha=0.2)
            ax.scatter(incorrect_timesteps, incorrect_confidences, color=colors['incorrect'], alpha=0.1)

        # Fitted curves; a group with no points is skipped instead of crashing.
        for outcome, label, steps, confidences in (
            ("correct", 'Confidence (Correct)', correct_timesteps, correct_confidences),
            ("incorrect", 'Confidence (Incorrect)', incorrect_timesteps, incorrect_confidences),
        ):
            curve = fit_curve(steps, confidences)
            if curve is not None:
                ax.plot(curve[0], curve[1], color=colors[outcome], linewidth=4,
                        label=label, linestyle='-')

        # Dotted trajectories. A segment whose endpoints differ in correctness
        # is split at its midpoint and each half takes its endpoint's colour.
        for points in trajectories.values():
            for (t1, c1, ok1), (t2, c2, ok2) in zip(points, points[1:]):
                if ok1 == ok2:
                    ax.plot([t1, t2], [c1, c2],
                            color=outcome_color(ok1), linestyle=':', linewidth=2)
                else:
                    mid_t = np.sqrt(t1 * t2)  # geometric mean = midpoint on a log axis
                    mid_c = (c1 + c2) / 2
                    ax.plot([t1, mid_t], [c1, mid_c],
                            color=outcome_color(ok1), linestyle=':', linewidth=2)
                    ax.plot([mid_t, t2], [mid_c, c2],
                            color=outcome_color(ok2), linestyle=':', linewidth=2)

        # Axis cosmetics
        ax.grid(True, alpha=0.3)
        ax.set_xlim(500, 8000)
        ax.set_xscale('log')
        ax.set_xticks(xticks)
        ax.set_xticklabels([str(x) for x in xticks])
        ax.set_title(f"{dataset_key.upper()}-32B")

        # Only the right panel carries the legend.
        if ax_idx == 1:
            ax.legend(loc='lower right')

    # The y axis is shared, so only the left panel is labelled.
    ax1.set_ylabel('Confidence', labelpad=15)

    # One x label centred under both panels.
    fig.text(0.5, 0.02, 'Compute Budget', ha='center', fontsize=50)

    plt.tight_layout()
    plt.subplots_adjust(bottom=0.15)  # leave room for the centred x label

    if filename:
        plt.savefig(filename)
    else:
        plt.show()


# Combined figure: both models in one PDF, tracking 10 randomly chosen
# questions out of the first 60 (unseeded, so the sample varies per run).
chunk = range(0, 60)
tracked_indices = random.sample(range(0, 60), 10)

_first_sixty = get_chunk(simplified, chunk)
plot_confidence_trajectories_side_by_side(
    _first_sixty,
    tracked_indices=tracked_indices,
    include_scatter=True,
    filename="confidence_combined.pdf",
)

In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

# Font sizes for the utility-vs-budget figure.
plt.rcParams.update({
    'font.size': 20,
    'legend.fontsize': 16,
    'axes.titlesize': 28,
    'axes.labelsize': 28,
    'xtick.labelsize': 20,
    'ytick.labelsize': 20,
})

# Scenario name -> penalty multiplier applied to each confidently wrong answer.
oddsmap = {
    "Exam": 0,
    "Jeopardy": 1,
    "High-Stakes": 20
}
odds = "Jeopardy"

models = ['r1', 's1']

# Restrict every model to the first 60 questions.
chunk = range(0, 60)
data = get_chunk(simplified, chunk)

def calculate_performance(responses, confidence_threshold, penalty_factor=1):
    """Score the answers given at or above a confidence threshold.

    Responses whose confidence is below the threshold count as unanswered and
    contribute nothing. Each answered correct response adds 1; each answered
    incorrect response subtracts ``penalty_factor``.

    Args:
        responses: Iterable of dicts with ``'accurate'`` and ``'confidence'``
            keys. Any iterable works, including a one-shot generator.
        confidence_threshold: Minimum confidence for a response to count as
            answered (inclusive).
        penalty_factor: Cost of one answered incorrect response.

    Returns:
        ``correct - penalty_factor * incorrect`` over the answered responses.
    """
    # Single pass, so a one-shot iterable is not used up before the incorrect
    # answers have been counted.
    correct = incorrect = 0
    for r in responses:
        if r['confidence'] >= confidence_threshold:
            if r['accurate']:
                correct += 1
            else:
                incorrect += 1

    # With no wrong answers the penalty term is skipped entirely, so the
    # result is the plain count whatever penalty_factor is.
    if incorrect == 0:
        return correct
    return correct - penalty_factor * incorrect

# Compute budgets (x-axis values), ascending.
compute_steps = sorted(int(k) for k in data['r1'].keys())

# Utility of every model at every budget, with and without a confidence cut-off.
# NOTE(review): scores are divided by a fixed 30 although `chunk` selects 60
# questions - confirm this is the intended normaliser.
thresholds = [0.0, 0.95]
performance_data = {
    model: {
        threshold: [
            calculate_performance(data[model][step], threshold, penalty_factor=oddsmap[odds]) / 30
            for step in compute_steps
        ]
        for threshold in thresholds
    }
    for model in models
}

plt.figure(figsize=(12, 6))

# One colour per model, one line style per threshold.
colors = {'r1': ['blue', 'blue'], 's1': ['red', 'red'], 's1.1': ['green', 'green']}
styles = {
    str(thresholds[0]): '-',
    str(thresholds[1]): '--'
}
labels = {'r1': 'R1-32B', 's1': 's1-32B', 's1.1': 's1.1-32B'}

for model in models:
    for i, threshold in enumerate(thresholds):
        plt.plot(compute_steps, performance_data[model][threshold],
                 label=f'{labels[model]} (threshold={threshold})',
                 color=colors[model][i],
                 linestyle=styles[str(threshold)],
                 linewidth=2)

plt.xlabel('Compute Budget')
plt.ylabel(f'Utility ({odds})')
plt.grid(True, linestyle='--', alpha=0.7)
plt.legend()

# Only the x axis is logarithmic. Utilities may be zero or negative and are
# plotted as they are, so performance_data keeps the true values.
plt.xscale('log')
plt.xticks([500, 1000, 2000, 4000, 8000], ['500', '1000', '2000', '4000', '8000'])
plt.xlim(500, 8000)

plt.tight_layout()
plt.show()

In [None]:
# Accuracy of the answered questions against compute budget, one panel per
# confidence threshold, one line per model.
thresholds = [0, 0.5, 0.95]
token_limits = list(range(500, 8001, 500))
xticks = [500, 1000, 2000, 4000, 8000]

plt.rcParams.update({
    'font.size': 20,         # Base font size
    'axes.titlesize': 30,    # Title font size
    'axes.labelsize': 24,    # Axis label font size
    'legend.fontsize': 20,   # Legend font size
    'xtick.labelsize': 16,   # X-axis tick labels
    'ytick.labelsize': 16,   # Y-axis tick labels
    'figure.dpi': 300,
})

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

# Colour, marker and legend label for each model.
styles = {
    'r1': {'color': 'blue', 'marker': 'o', 'label': 'R1'},
    's1': {'color': 'red', 'marker': 's', 'label': 'S1'},
}

chunk = range(0, 60)

data = get_chunk(simplified, chunk)

models = ['r1', 's1']

for idx, threshold in enumerate(thresholds):
    ax = axes[idx]

    for dataset_name in models:
        dataset = data[dataset_name]
        accuracies = []

        for token_limit in token_limits:
            # Only questions answered with enough confidence are scored.
            answered = [e for e in dataset[token_limit] if e["confidence"] >= threshold]
            if answered:
                accuracies.append(sum(e["accurate"] for e in answered) / len(answered))
            else:
                # Nothing answered at this budget: plotted as 0.
                accuracies.append(0)

        style = styles[dataset_name]
        ax.plot(token_limits, accuracies,
                color=style['color'],
                marker=style['marker'],
                label=style['label'])

    ax.set_title(f'Confidence Threshold = {threshold}', fontsize=18)

    # All panels use the same y range, so only the first one is labelled.
    if idx == 0:
        ax.set_ylabel('Accuracy of Answered Questions', fontsize=16)
        ax.legend(loc='upper left')
    else:
        ax.set_yticklabels([])

    ax.grid(True, linestyle='--', alpha=0.7)

    # Log-scaled x axis with explicit ticks and no minor ticks.
    ax.set_xscale('log')
    ax.xaxis.set_minor_locator(plt.NullLocator())
    ax.set_xticks(xticks)
    ax.set_xticklabels([str(x) for x in xticks])
    ax.set_xbound(500, 8000)

    ax.set_ylim(-0.02, 1.02)

# One shared x label, added once so that it is not drawn on top of itself.
fig.text(0.5, 0, 'Compute Budget', ha='center', fontsize=18)

plt.show()

In [9]:
# 3-D utility surface: utility as a function of confidence threshold and
# compute budget, coloured by the proportion of questions answered.
penalty_factor = 1  # utility lost per answered-but-wrong question
exhaustive = True

import json
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.interpolate import griddata

chunk = range(0, 60)

model = "r1"  # use "s1" for the other model; the output filename follows it
data = get_chunk(simplified, chunk)[model]


plt.rcParams['font.size'] = 20  # Base font size
plt.rcParams['axes.titlesize'] = 30  # Title font size
plt.rcParams['axes.labelsize'] = 24  # Axis label font size
plt.rcParams['legend.fontsize'] = 20  # Legend font size
plt.rcParams['xtick.labelsize'] = 14  # X-axis tick labels
plt.rcParams['ytick.labelsize'] = 14  # Y-axis tick labels

confidence_thresholds = np.linspace(0, 1, 50)  # 50 thresholds between 0 and 1
exp_times = [500, 1000, 2000, 4000, 8000]  # budgets shown as y tick labels
log_exp_times = np.log2(exp_times)

times = list(range(500, 8001, 500))  # budgets actually evaluated
log_times = np.log2(times)  # the budget axis is plotted in log2

# Finer 200 x 200 mesh for a smooth interpolated surface
X_smooth = np.linspace(confidence_thresholds.min(), confidence_thresholds.max(), 200)
Y_smooth = np.linspace(log_times.min(), log_times.max(), 200)
X_smooth, Y_smooth = np.meshgrid(X_smooth, Y_smooth)

# Measured grid: rows are budgets, columns are thresholds.
# Z_accuracy holds the utility here (see the formula below).
X, Y = np.meshgrid(confidence_thresholds, log_times)
Z_accuracy = np.zeros_like(X)
Z_answered = np.zeros_like(X)

for i, budget in enumerate(times):
    entries = data[budget]

    for j, threshold in enumerate(confidence_thresholds):
        if not exhaustive:
            # NOTE(review): propagate_confident_results is not defined in the
            # visible part of this notebook; this path raises NameError unless
            # it is provided elsewhere.
            propagated = propagate_confident_results(data, threshold)
            entries = propagated[budget]

        filtered = [(e["accurate"], e["confidence"]) for e in entries if e["confidence"] >= threshold]
        if filtered:
            accuracies = [x[0] for x in filtered]
            # (correct - penalty * incorrect) over ALL questions, answered or not
            Z_accuracy[i,j] = (sum(accuracies) - penalty_factor * sum([1-a for a in accuracies])) / len(entries)
            Z_answered[i,j] = len(filtered) / len(entries)
        # Cells where nothing was answered keep their initial value of 0.

# Interpolate the measured grid onto the finer mesh
points = np.column_stack((X.flatten(), Y.flatten()))
Z_accuracy_smooth = griddata(points, Z_accuracy.flatten(), (X_smooth, Y_smooth), method='cubic')
Z_answered_smooth = griddata(points, Z_answered.flatten(), (X_smooth, Y_smooth), method='cubic')

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# Height = utility, face colour = proportion of questions answered
norm = plt.Normalize(Z_answered_smooth.min(), Z_answered_smooth.max())
colormap = plt.cm.RdYlBu_r
colors = colormap(norm(Z_answered_smooth))

surf = ax.plot_surface(X_smooth, Y_smooth, Z_accuracy_smooth,
                      facecolors=colors,
                      alpha=0.9,
                      antialiased=True)

# Black-and-white dashed lines along the surface at thresholds 0.0 and 0.95
for threshold in [0.0, 0.95]:
    # Column of the smooth mesh closest to the requested threshold
    threshold_idx = np.abs(X_smooth[0] - threshold).argmin()

    line_x = np.full_like(Y_smooth[:,0], threshold)
    line_y = Y_smooth[:,0]
    line_z = Z_accuracy_smooth[:,threshold_idx]

    # Same dash pattern at two offsets, so black and white dashes alternate.
    ax.plot(line_x, line_y, line_z, linestyle=(1, (2, 2)), color='k', linewidth=3, zorder=100)
    ax.plot(line_x, line_y, line_z, linestyle=(3, (2, 2)), color='w', linewidth=3, zorder=100)

# Labels and title (no colour bar is drawn on this figure)
ax.set_xlabel('\nConfidence Threshold', fontsize=18)
ax.set_ylabel('\nCompute Budget', fontsize=18)
ax.set_zlabel('Utility\n', fontsize=18, labelpad=30)
ax.set_title('Utility', fontsize=18, x=0.04, y=0.82, rotation=16)
ax.view_init(elev=20, azim=225)

# Label the log2 budget axis with the actual token limits
y_ticks = log_exp_times
y_labels = [str(t) for t in exp_times]

ax.set_yticks(y_ticks)
ax.set_yticklabels(y_labels, fontsize=14)

# Translucent gray plane at z=0, drawn only where the surface dips below zero
xx, yy = np.meshgrid(
    np.linspace(X_smooth.min()-0.05, X_smooth.max(), 100),
    np.linspace(Y_smooth.min(), Y_smooth.max(), 100)
)
zz = np.zeros_like(xx)

# Surface height sampled on the plane's own grid
surface_z = griddata(
    (X_smooth.flatten(), Y_smooth.flatten()),
    Z_accuracy_smooth.flatten(),
    (xx, yy),
    method='cubic'
)

# NaN hides the plane wherever the surface is not below zero
mask = surface_z < 0
zz_masked = np.where(mask, zz, np.nan)

ax.plot_surface(xx, yy, zz_masked,
                color='gray',
                alpha=0.5,  # Makes it translucent
                shade=False)  # Disable shading for uniform color

# The filename follows `model`, so the file is always named after the data in it.
plt.savefig(f'utility_{model}.pdf')

In [10]:
# 3-D accuracy surface: accuracy of the answered questions as a function of
# confidence threshold and compute budget, coloured by the proportion of
# questions answered.
penalty_factor = 1  # not used by the accuracy formula in this cell
exhaustive = True

import json
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy.interpolate import griddata

chunk = range(0, 60)

model = "r1"  # use "s1" for the other model; the output filename follows it

data = get_chunk(simplified, chunk)[model]


plt.rcParams['font.size'] = 20  # Base font size
plt.rcParams['axes.titlesize'] = 30  # Title font size
plt.rcParams['axes.labelsize'] = 24  # Axis label font size
plt.rcParams['legend.fontsize'] = 20  # Legend font size
plt.rcParams['xtick.labelsize'] = 14  # X-axis tick labels
plt.rcParams['ytick.labelsize'] = 14  # Y-axis tick labels

confidence_thresholds = np.linspace(0, 0.95, 99)  # 99 thresholds between 0 and 0.95

times = list(range(500, 8001, 500))  # budgets actually evaluated

exp_times = [500, 1000, 2000, 4000, 8000]  # budgets shown as y tick labels
log_exp_times = np.log2(exp_times)
log_times = np.log2(times)  # the budget axis is plotted in log2

# Finer 200 x 200 mesh for a smooth interpolated surface
X_smooth = np.linspace(confidence_thresholds.min(), confidence_thresholds.max(), 200)
Y_smooth = np.linspace(log_times.min(), log_times.max(), 200)
X_smooth, Y_smooth = np.meshgrid(X_smooth, Y_smooth)

# Measured grid: rows are budgets, columns are thresholds
X, Y = np.meshgrid(confidence_thresholds, log_times)
Z_accuracy = np.zeros_like(X)
Z_answered = np.zeros_like(X)

for i, budget in enumerate(times):
    entries = data[budget]

    for j, threshold in enumerate(confidence_thresholds):
        if not exhaustive:
            # NOTE(review): propagate_confident_results is not defined in the
            # visible part of this notebook; this path raises NameError unless
            # it is provided elsewhere.
            propagated = propagate_confident_results(data, threshold)
            entries = propagated[budget]

        filtered = [(e["accurate"], e["confidence"]) for e in entries if e["confidence"] >= threshold]

        if filtered:
            accuracies = [x[0] for x in filtered]
            Z_accuracy[i,j] = sum(accuracies) / len(accuracies)
            Z_answered[i,j] = len(filtered) / len(entries)
        # Cells where nothing was answered keep their initial value of 0.

# Interpolate the measured grid onto the finer mesh
points = np.column_stack((X.flatten(), Y.flatten()))
Z_accuracy_smooth = griddata(points, Z_accuracy.flatten(), (X_smooth, Y_smooth), method='cubic')
# Cubic interpolation can overshoot; accuracy must stay within [0, 1]
Z_accuracy_smooth = np.clip(Z_accuracy_smooth, 0, 1)

Z_answered_smooth = griddata(points, Z_answered.flatten(), (X_smooth, Y_smooth), method='cubic')

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# Height = accuracy, face colour = proportion of questions answered
norm = plt.Normalize(Z_answered_smooth.min(), Z_answered_smooth.max())
colormap = plt.cm.RdYlBu_r
colors = colormap(norm(Z_answered_smooth))

surf = ax.plot_surface(X_smooth, Y_smooth, Z_accuracy_smooth,
                      facecolors=colors,
                      alpha=0.9,
                      antialiased=True)

# Black-and-white dashed lines along the surface at thresholds 0.0, 0.5 and 0.95
for threshold in [0.0, 0.5, 0.95]:
    # Column of the smooth mesh closest to the requested threshold
    threshold_idx = np.abs(X_smooth[0] - threshold).argmin()

    line_x = np.full_like(Y_smooth[:,0], threshold)
    line_y = Y_smooth[:,0]
    line_z = Z_accuracy_smooth[:,threshold_idx]

    # Same dash pattern at two offsets, so black and white dashes alternate.
    ax.plot(line_x, line_y, line_z, linestyle=(1, (2, 2)), color='k', linewidth=2, zorder=100)
    ax.plot(line_x, line_y, line_z, linestyle=(3, (2, 2)), color='w', linewidth=2, zorder=100)

# Colour bar for the answered proportion
sm = plt.cm.ScalarMappable(cmap=colormap, norm=norm)
sm.set_array([])
cbar = fig.colorbar(sm, ax=ax, shrink=0.9, anchor=(0, 0.2))  # Lower anchor point moves it down
cbar.set_label('Proportion of Questions Answered', fontsize=18)

# Labels and title
ax.set_xlabel('\nConfidence Threshold', fontsize=18)
ax.set_ylabel('\nCompute Budget', fontsize=18)
ax.set_title('Accuracy', fontsize=18, x=0.04, y=0.82, rotation=16)
ax.view_init(elev=20, azim=225)

# Label the log2 budget axis with the actual token limits
y_ticks = log_exp_times
y_labels = [str(t) for t in exp_times]

ax.set_yticks(y_ticks)
ax.set_yticklabels(y_labels, fontsize=14)

# No z=0 reference plane here: the surface is clipped to [0, 1] and never
# drops below zero.

plt.tight_layout()

# The filename follows `model`, so the file is always named after the data in it.
plt.savefig(f'accuracy_{model}.pdf')

In [11]:
# Re-draws the accuracy surface with a horizontal colour bar above the plot.
# Reuses X_smooth, Y_smooth, Z_accuracy_smooth, Z_answered_smooth,
# log_exp_times, exp_times and model from the cell that computed the surface;
# run that cell first.

fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')

# Height = accuracy, face colour = proportion of questions answered
norm = plt.Normalize(Z_answered_smooth.min(), Z_answered_smooth.max())
colormap = plt.cm.RdYlBu_r
colors = colormap(norm(Z_answered_smooth))

surf = ax.plot_surface(X_smooth, Y_smooth, Z_accuracy_smooth,
                      facecolors=colors,
                      alpha=0.9,
                      antialiased=True)

# Black-and-white dashed lines along the surface at thresholds 0.0, 0.5 and 0.95
for threshold in [0.0, 0.5, 0.95]:
    threshold_idx = np.abs(X_smooth[0] - threshold).argmin()
    line_x = np.full_like(Y_smooth[:,0], threshold)
    line_y = Y_smooth[:,0]
    line_z = Z_accuracy_smooth[:,threshold_idx]

    # Same dash pattern at two offsets, so black and white dashes alternate.
    ax.plot(line_x, line_y, line_z, linestyle=(1, (2, 2)), color='k', linewidth=2, zorder=100)
    ax.plot(line_x, line_y, line_z, linestyle=(3, (2, 2)), color='w', linewidth=2, zorder=100)

# Horizontal colour bar in its own axes near the top of the figure
sm = plt.cm.ScalarMappable(cmap=colormap, norm=norm)
sm.set_array([])

cax = fig.add_axes([0.26, 0.80, 0.5, 0.03])  # [left, bottom, width, height]

cbar = fig.colorbar(sm, cax=cax, orientation='horizontal')
cbar.set_label('Proportion of Questions Answered', fontsize=18)
cbar.ax.xaxis.set_label_position('top')
cbar.ax.xaxis.set_ticks_position('top')

# Labels and title
ax.set_xlabel('\nConfidence Threshold', fontsize=18)
ax.set_ylabel('\nCompute Budget', fontsize=18)
ax.set_title('Accuracy', fontsize=18, x=0.04, y=0.82, rotation=16)

ax.view_init(elev=20, azim=225)

# Label the log2 budget axis with the actual token limits
y_ticks = log_exp_times
y_labels = [str(t) for t in exp_times]
ax.set_yticks(y_ticks)
ax.set_yticklabels(y_labels, fontsize=14)

# Leave room at the top for the colour bar
plt.subplots_adjust(top=0.85)

# The plotted arrays belong to whichever `model` the surface cell used, so the
# filename follows it. NOTE(review): this is the same path the surface cell
# saves to; this layout replaces that file.
plt.savefig(f'accuracy_{model}.pdf')