In [None]:
import ast
import os

import pandas as pd


In [None]:
# Palette shared by the figures in this notebook.
colors = ['#28abaf','#66ccff','#EE0000','#006666','#FFFF00','#0080FF']
import os
import matplotlib.pyplot as plt
from matplotlib.font_manager import fontManager

# Directory holding the Roboto Condensed .ttf files. The default is the
# original location; set ROBOTO_CONDENSED_DIR to use a different checkout.
p = os.environ.get('ROBOTO_CONDENSED_DIR', '/home/kent/fonts/Roboto_Condensed/static')

# Register every .ttf file found there. A missing directory is tolerated so
# the notebook does not stop with FileNotFoundError on another machine.
all_font_flie_names = os.listdir(p) if os.path.isdir(p) else []
all_ttf_file_names = [n for n in all_font_flie_names if n.endswith('.ttf')]
for ttf_file_name in all_ttf_file_names:
    fontManager.addfont(path=os.path.join(p, ttf_file_name))
if not all_ttf_file_names:
    print(f'No .ttf files found in {p!r}; no extra fonts were registered.')

# Global text style for all figures.
plt.rcParams['font.family'] = ['sans-serif']
plt.rcParams['font.sans-serif'] = ['Roboto Condensed']
plt.rcParams['font.size'] = 13

In [None]:
from config import IDEA_MODELS, CRITIC_MODELS

In [None]:
def clean_model_name(name):
    """Return a shorter display name for a model identifier.

    Everything up to and including the first "/" is dropped. If the remainder
    contains ":" and the text after the first ":" is exactly 'Q4_K_M' or
    'free', that suffix (and the colon) is dropped as well; any other suffix
    is kept as part of the name.
    """
    droppable_suffixes = ['Q4_K_M', 'free']

    # Strip the part before the first "/", when there is one
    _, slash, after_slash = name.partition('/')
    if slash:
        name = after_slash

    # Only the listed suffixes are removed; others stay attached
    stem, colon, suffix = name.partition(':')
    if colon and suffix in droppable_suffixes:
        return stem
    return name

In [None]:
# Load the view table; the first CSV column becomes the DataFrame index.
df = pd.read_csv(
    './csvs/view.csv',
    index_col=0,
)
df

In [None]:
# # del all rows with anthropic/claude-3-haiku
# df = df[df['idea_model'] != 'anthropic/claude-3-haiku']

In [None]:
# Character length of every idea (values are stringified before measuring).
df['idea_length_in_char'] = df['idea'].apply(lambda x: len(str(x)))

# Only ideas with more than 100 and fewer than 2500 characters are valid.
# .copy() detaches the filtered rows from the unfiltered frame, so adding the
# word-count column below does not raise pandas' SettingWithCopyWarning.
is_valid_length = (df['idea_length_in_char'] > 100) & (df['idea_length_in_char'] < 2500)
df = df[is_valid_length].copy()

# Word count = number of whitespace-separated tokens in the idea text.
df['idea_length_in_words'] = df['idea'].apply(lambda x: len(str(x).split()))
df['idea_length_in_words'].describe()

In [None]:
# Make sure the figure output directory exists before any savefig call;
# exist_ok=True turns this into a no-op when './figs/' is already there.
os.makedirs('./figs/', exist_ok=True)

In [None]:
# Limit to ideas with < 200 words. .copy() makes the result an independent
# frame, so columns added to `df` in later cells do not raise
# SettingWithCopyWarning.
df = df[df['idea_length_in_words'] < 200].copy()

# Histogram of idea length, drawn on an explicit Figure/Axes pair
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(5, 3))
ax.hist(df['idea_length_in_words'], bins=100, color='#28abaf')
ax.set_xlabel('idea length in words')
ax.set_ylabel('number of ideas')
fig.tight_layout()  # keep the axis labels inside the saved page
fig.savefig('figs/idea_length_in_words.pdf')
plt.show()

In [None]:
# Number of distinct keywords (missing values are not counted)
n = len(df['keywords'].dropna().unique())
print(n)

In [None]:
# Row count for every (keyword, idea model) pair that occurs in df.
# groupby().size() lists only pairs that are present, so a pair with no rows
# at all never appears here with a count of 0; it is reported separately below.
agg = df.groupby(['keywords', 'idea_model']).size()
print(agg)
# print(results_df.groupby(['kw', 'idea_model']).size())

max_count = agg.max()
min_count = agg.min()
print('Current maximum count:', max_count)
print('Current minimum count (needs to be supplemented):', min_count)

# How many pairs sit exactly at the minimum count
print('Pairs at the minimum count:', int((agg == min_count).sum()))

# Pairs that are absent altogether: all keyword/model combinations minus the
# combinations that actually occur.
n_possible_pairs = df['keywords'].nunique() * df['idea_model'].nunique()
print('Pairs with no rows at all:', n_possible_pairs - len(agg))

# Previously this name was assigned the maximum and then the minimum; it is
# kept, holding the final (minimum) value, for any code that still reads it.
last_run_index = min_count

In [None]:
import seaborn as sns

# Distribution of the per-(keyword, idea model) row counts computed in `agg`.
# The axes are labelled so the figure can be read on its own.
ax = sns.histplot(agg.values)
ax.set_xlabel('rows per (keyword, idea model) pair')
ax.set_ylabel('number of pairs');

In [None]:
# Print the raw critique at position 2 as a quick look at the text format
string = list(df['raw_critique'])
print(string[2])

In [None]:
# Parse the first row's score string as a Python literal. ast.literal_eval
# accepts literals only, so text read from the CSV is never executed as code.
score_dict = ast.literal_eval(df['parsed_scores'].iloc[0])
print(score_dict)
# A cell displays only its last expression, so the three metrics are shown
# together instead of as three separate bare expressions.
{metric: score_dict[metric] for metric in ('originality', 'feasibility', 'clarity')}

In [None]:
# Parse a score string; return None when it cannot be parsed
def safe_eval(x):
    """Parse ``x`` as a Python literal and return the resulting object.

    Uses ``ast.literal_eval`` rather than ``eval`` so that text coming from
    the data file can only produce literals (dicts, lists, numbers, strings,
    ...) and is never executed as code.

    Args:
        x: The text to parse. Non-string values (for example a NaN read from
            an empty CSV field) are accepted and yield None.

    Returns:
        The parsed object, or None when ``x`` is not a valid Python literal.
    """
    try:
        return ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # These are the failures literal_eval raises for malformed or
        # non-literal input; anything else is a real bug and should surface.
        return None

# Parse every score string and drop the rows whose string could not be parsed.
# .copy() makes the result an independent frame, so the columns added in the
# following cells do not raise SettingWithCopyWarning.
df['scores'] = df['parsed_scores'].apply(safe_eval)
df = df.dropna(subset=['scores']).copy()

In [None]:
# Display the parsed score objects held in the 'scores' column
df['scores']

In [None]:
# Copy each metric out of the parsed score dict into a column of its own
for metric_key in ('originality', 'feasibility', 'clarity'):
    df[metric_key] = df['scores'].apply(lambda scores, key=metric_key: scores[key])

In [None]:
# Tell whether a scores dictionary holds at least one 'N/A' value
def has_na(scores_dict):
    """Return True if any value of ``scores_dict`` equals the string 'N/A'."""
    for score in scores_dict.values():
        if score == 'N/A':
            return True
    return False

# Remove rows whose score dict contains 'N/A'. .copy() makes the result an
# independent frame so the two columns added below do not raise
# SettingWithCopyWarning.
df = df[~df['scores'].apply(has_na)].copy()

# Convert every score dict to a list of floats once, then derive both
# aggregates from it. An empty dict yields 0 for the mean and for the minimum.
score_values = df['scores'].apply(lambda x: [float(val) for val in x.values()])
df['mean_score'] = score_values.apply(lambda vals: sum(vals) / len(vals) if len(vals) != 0 else 0)
df['min_score'] = score_values.apply(lambda vals: min(vals) if len(vals) != 0 else 0)

In [None]:

# Average idea length (in words) per idea model, longest first
mean_words_by_model = df.groupby('idea_model')['idea_length_in_words'].mean()
mean_words_by_model.sort_values(ascending=False)

In [None]:
# Per-model mean / std / count of the idea score and of the idea length,
# ordered from the highest to the lowest average score.
# df.groupby('idea_model')['mean_score'].mean().sort_values(ascending=False)

summary_funcs = ['mean', 'std', 'count']
df_view = (
    df.groupby('idea_model')
    .agg({'mean_score': summary_funcs, 'idea_length_in_words': summary_funcs})
    .sort_values(('mean_score', 'mean'), ascending=False)
)
# df_view

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

def calculate_confidence_interval(mean, std, count, confidence=0.95):
    """Student-t confidence interval for a mean, from summary statistics.

    Args:
        mean: Sample mean (centre of the interval).
        std: Sample standard deviation.
        count: Number of observations; ``count - 1`` degrees of freedom are used.
        confidence: Confidence level, 0.95 by default.

    Returns:
        The ``(lower, upper)`` bounds given by ``scipy.stats.t.interval``.
    """
    degrees_of_freedom = count - 1
    standard_error = std / np.sqrt(count)
    lower, upper = stats.t.interval(
        confidence, degrees_of_freedom, loc=mean, scale=standard_error
    )
    return lower, upper

# Build one summary row per model from the aggregated df_view table

# Columns of df_view, pulled out once
score_means = df_view[('mean_score', 'mean')]
score_stds = df_view[('mean_score', 'std')]
score_counts = df_view[('mean_score', 'count')]
length_means = df_view[('idea_length_in_words', 'mean')]
length_stds = df_view[('idea_length_in_words', 'std')]
length_counts = df_view[('idea_length_in_words', 'count')]

results = []
per_model_stats = zip(
    df_view.index,
    score_means, score_stds, score_counts,
    length_means, length_stds, length_counts,
)
for model, s_mean, s_std, s_count, l_mean, l_std, l_count in per_model_stats:
    # Confidence intervals for the score and for the length
    score_lower, score_upper = calculate_confidence_interval(s_mean, s_std, s_count)
    length_lower, length_upper = calculate_confidence_interval(l_mean, l_std, l_count)

    results.append({
        'model': model,
        'score_mean': s_mean,
        'score_ci_lower': score_lower,
        'score_ci_upper': score_upper,
        'score_ci_range': score_upper - score_lower,
        'length_mean': l_mean,
        'length_ci_lower': length_lower,
        'length_ci_upper': length_upper,
        'length_ci_range': length_upper - length_lower,
    })

result_df = pd.DataFrame(results)

# Show floats with four decimals from here on
pd.set_option('display.float_format', '{:.4f}'.format)
# result_df

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import matplotlib.patheffects as path_effects

# Set seaborn's default color palette to "husl"
sns.set_palette("husl")

# Multiplier applied to the figsize of the plt.subplots calls further down in this cell
scale = 0.8

# --- Part 1: Prepare data ---
# Assuming the df already has these three fields:
# df['originality'] = df['scores'].apply(lambda x: x['originality'])
# df['feasibility'] = df['scores'].apply(lambda x: x['feasibility'])
# df['clarity'] = df['scores'].apply(lambda x: x['clarity'])

# 1. Calculate mean and confidence interval
def calculate_confidence_interval(data, confidence=0.95):
    """Mean and Student-t confidence interval of a sample.

    Args:
        data: One-dimensional collection of numeric observations (list,
            array or pandas Series); it is converted to a float array.
        confidence: Confidence level, 0.95 by default.

    Returns:
        A tuple ``(mean, ci_lower, ci_upper, ci_width)``. With fewer than two
        observations no interval can be estimated, so the three interval
        entries are NaN (and the mean is NaN for empty input).
    """
    values = np.asarray(data, dtype=float)
    n = values.size
    if n < 2:
        mean = values[0] if n == 1 else np.nan
        return mean, np.nan, np.nan, np.nan

    mean = np.mean(values)
    # ddof=1 gives the sample standard deviation, which is what the standard
    # error of the mean with n-1 degrees of freedom calls for. np.std's
    # default (ddof=0) understates it and makes the interval too narrow.
    std = np.std(values, ddof=1)
    sem = std / np.sqrt(n)
    ci = stats.t.interval(confidence, n - 1, loc=mean, scale=sem)
    return mean, ci[0], ci[1], ci[1] - ci[0]

# Per-model statistics: mean and confidence interval for each metric, for the
# overall score and for the idea length
metric_names = ('originality', 'feasibility', 'clarity')
ci_suffixes = ('mean', 'ci_lower', 'ci_upper', 'ci_range')

result_data = []
for model_name, group in df.groupby('idea_model'):
    # (mean, lower, upper, range) for each of the three metrics
    per_metric = {name: calculate_confidence_interval(group[name]) for name in metric_names}
    orig_ci, feas_ci, clar_ci = (per_metric[name] for name in metric_names)

    # Overall score: average of the three metric means. Its interval bounds
    # are a simplification: the three lower (upper) bounds are averaged.
    score_mean, score_ci_lower, score_ci_upper = (
        (o + f + c) / 3 for o, f, c in zip(orig_ci[:3], feas_ci[:3], clar_ci[:3])
    )
    score_ci_range = score_ci_upper - score_ci_lower

    # Length statistics
    length_ci = calculate_confidence_interval(group['idea_length_in_words'])

    record = {
        'idea_model': model_name,
        'score_mean': score_mean,
        'score_ci_lower': score_ci_lower,
        'score_ci_upper': score_ci_upper,
        'score_ci_range': score_ci_range,
    }
    # Append the four fields of every remaining quantity, in column order
    for prefix, values in (
        ('length', length_ci),
        ('originality', orig_ci),
        ('feasibility', feas_ci),
        ('clarity', clar_ci),
    ):
        for suffix, value in zip(ci_suffixes, values):
            record[f'{prefix}_{suffix}'] = value
    result_data.append(record)

result_df = pd.DataFrame(result_data)

# Marker area passed as `s` to every scatter call below
scatter_size = 30

# Shortened display names, produced by clean_model_name
result_df['short_name'] = result_df['idea_model'].apply(lambda x: clean_model_name(x))

# Font size used for the y-axis tick labels (model names)
y_label_fontsize = 11

# --- First plot: two subplots arranged horizontally ---
# Left panel: average score per model; right panel: average idea length per model.
fig1, (ax1, ax2) = plt.subplots(1, 2, figsize=(16*scale, 9*scale))  # 1 row, 2 columns

# Sort by score_mean in ascending order: row i is drawn at y = i, so the
# model with the highest score_mean gets the largest y position.
sorted_df = result_df.sort_values('score_mean', ascending=True)

# Create y-coordinates (one integer position per model)
y_pos = np.arange(len(sorted_df))

# Colors: #52ccc3 for the average score, grey for the length
avg_score_color = '#52ccc3'
length_color = 'grey'

# Plot score chart - left subplot (one dot per model at its mean score)
ax1.scatter(sorted_df['score_mean'], y_pos, color=avg_score_color, s=scatter_size, zorder=3)

# Add round-capped interval bars and "mean±half-range" value labels
for i, row in enumerate(sorted_df.itertuples()):
    ax1.hlines(y=i, xmin=row.score_ci_lower, xmax=row.score_ci_upper,
              color=avg_score_color, alpha=0.9, linewidth=3.0, capstyle='round')
    # The label shows the mean and half of the interval range
    label_text = f"{row.score_mean:.2f}±{row.score_ci_range/2:.2f}"
    # Place the label just right of the interval's upper end
    text = ax1.text(row.score_ci_upper + 0.05, i, label_text, 
            va='center', ha='left', fontsize=9, color=avg_score_color)
    # White stroke around the glyphs keeps the label readable over grid lines
    text.set_path_effects([path_effects.withStroke(linewidth=4, foreground='white')])

# Set y-axis tick labels to the short model names
ax1.set_yticks(y_pos)
ax1.set_yticklabels(sorted_df['short_name'], fontsize=y_label_fontsize)

# Set title and labels
ax1.set_title('Idea Score Avg. by Model: $\\frac{1}{3}$ (Orig. + Feas. + Clar.)')
ax1.set_xlabel('Score')
# The "Model" y-label is intentionally left out
# ax1.set_ylabel('Model')

# Set grid lines
ax1.grid(True, axis='both', linestyle='--', alpha=0.7)

# Extend the right x-limit past the largest upper bound to leave room for the labels
x_max = sorted_df['score_ci_upper'].max() + 0.3
ax1.set_xlim(right=x_max)

# Keep only left and bottom border lines
ax1.spines['top'].set_visible(False)
ax1.spines['right'].set_visible(False)

# Plot length chart - right subplot (same row order as the left panel)
ax2.scatter(sorted_df['length_mean'], y_pos, color=length_color, s=scatter_size, zorder=3)

# Add round-capped interval bars and "mean±half-range" value labels
for i, row in enumerate(sorted_df.itertuples()):
    ax2.hlines(y=i, xmin=row.length_ci_lower, xmax=row.length_ci_upper,
              color=length_color, alpha=0.9, linewidth=5.0, capstyle='round')
    # The label shows the mean and half of the interval range
    label_text = f"{row.length_mean:.2f}±{row.length_ci_range/2:.2f}"
    # Place the label just right of the interval's upper end
    text = ax2.text(row.length_ci_upper + 5, i, label_text, 
            va='center', ha='left', fontsize=9, color=length_color)
    # White stroke around the glyphs keeps the label readable over grid lines
    text.set_path_effects([path_effects.withStroke(linewidth=4, foreground='white')])

# Set y-axis tick labels to the short model names
ax2.set_yticks(y_pos)
ax2.set_yticklabels(sorted_df['short_name'], fontsize=y_label_fontsize)

# Set title and labels
ax2.set_title('Idea Length Avg. by Model')
ax2.set_xlabel('Length (Words)')
# The "Model" y-label is intentionally left out
# ax2.set_ylabel('Model')

# Set grid lines
ax2.grid(True, axis='both', linestyle='--', alpha=0.7)

# Extend the right x-limit past the largest upper bound to leave room for the labels
x_max = sorted_df['length_ci_upper'].max() + 30
ax2.set_xlim(right=x_max)

# Keep only left and bottom border lines
ax2.spines['top'].set_visible(False)
ax2.spines['right'].set_visible(False)

# Adjust layout, save the figure as PDF, then display it
plt.tight_layout()
plt.savefig('figs/mean_score_and_length_rank.pdf')
plt.show()

# --- Second plot: one panel per metric, three panels side by side ---
fig2, axes = plt.subplots(nrows=1, ncols=3, figsize=(20*scale, 9*scale))

# Metric column prefix, drawing color and panel title for each panel
metrics = [
    {'name': 'originality', 'color': '#66ccff', 'title': 'Originality Score'},
    {'name': 'feasibility', 'color': '#5092e4', 'title': 'Feasibility Score'},
    {'name': 'clarity', 'color': '#ffa500', 'title': 'Clarity Score'},
]

for ax, metric in zip(axes, metrics):
    metric_name = metric['name']
    metric_color = metric['color']
    mean_col = f'{metric_name}_mean'
    lower_col = f'{metric_name}_ci_lower'
    upper_col = f'{metric_name}_ci_upper'
    range_col = f'{metric_name}_ci_range'

    # Ascending sort: row j is drawn at y = j, so the best model ends up last
    sorted_df = result_df.sort_values(mean_col, ascending=True)
    y_pos = np.arange(len(sorted_df))

    # One dot per model at its mean
    ax.scatter(sorted_df[mean_col], y_pos,
               color=metric_color, s=scatter_size, zorder=3)

    # Round-capped interval bar plus a "mean±half-range" label per model
    interval_rows = zip(
        sorted_df[mean_col], sorted_df[lower_col],
        sorted_df[upper_col], sorted_df[range_col],
    )
    for j, (mean_val, ci_lower, ci_upper, ci_range) in enumerate(interval_rows):
        ax.hlines(y=j, xmin=ci_lower, xmax=ci_upper,
                  color=metric_color, alpha=0.9, linewidth=3.0, capstyle='round')

        label = ax.text(ci_upper + 0.05, j, f"{mean_val:.2f}±{ci_range/2:.2f}",
                        va='center', ha='left', fontsize=9, color=metric_color)
        # White outline keeps the label legible over the grid
        label.set_path_effects([path_effects.withStroke(linewidth=4, foreground='white')])

    # Model names on the y axis
    ax.set_yticks(y_pos)
    ax.set_yticklabels(sorted_df['short_name'], fontsize=y_label_fontsize)

    # Title and x label
    ax.set_title(f'LiveIdeaBench {metric["title"]}')
    ax.set_xlabel('Score')

    # Leave room right of the largest upper bound for the labels
    x_max = sorted_df[upper_col].max() + 0.5
    ax.set_xlim(right=x_max)

    # Dashed grid; hide the top and right borders
    ax.grid(True, axis='both', linestyle='--', alpha=0.7)
    for side in ('top', 'right'):
        ax.spines[side].set_visible(False)

# Lay out, save as PDF and display
plt.tight_layout()
plt.savefig('figs/originality_feasibility_clarity_scores.pdf')
plt.show()

# Text table with the mean and interval range of each metric, one row per
# model, ordered by originality (highest first)
detailed_cols = ['short_name'] + [
    f'{metric_name}_{field}'
    for metric_name in ('originality', 'feasibility', 'clarity')
    for field in ('mean', 'ci_range')
]

detailed_df = result_df[detailed_cols].sort_values('originality_mean', ascending=False)
print("\nDetailed Metrics Table (sorted by originality_mean):")
print(detailed_df.to_string(index=False))

In [None]:
# Import the required libraries
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
from scipy import stats
import numpy as np
import matplotlib.ticker as ticker

# Joint view of idea length against the per-row mean score: a hexbin density
# with a regression line in the centre, marginal histograms on the top and
# right, and the color bar in a narrow column on the left.
fig = plt.figure(figsize=(10, 8))
gs = gridspec.GridSpec(4, 5, width_ratios=[0.3, 1.6, 1.6, 1.6, 1])  # first column: narrow color-bar slot

# Create the main plot and marginal distribution plots
ax_main = plt.subplot(gs[1:4, 1:4])  # Main plot
ax_x = plt.subplot(gs[0, 1:4])       # X-axis marginal distribution (top)
ax_y = plt.subplot(gs[1:4, 4])       # Y-axis marginal distribution (right)
cbar_ax = plt.subplot(gs[1:4, 0])    # Color axis (left, narrower)

# Plot the hexbin on the main plot (empty cells are left blank via mincnt=1)
hb = ax_main.hexbin(df['idea_length_in_words'], df['mean_score'],
                    gridsize=30, cmap='Blues', mincnt=1, bins='log')

# Draw the regression line on top of the hexbin
sns.regplot(x='idea_length_in_words', y='mean_score', data=df,
            scatter=False, color='crimson', line_kws={'linewidth': 2}, ax=ax_main)

# Calculate the Pearson correlation coefficient and its p-value
corr, p_value = stats.pearsonr(df['idea_length_in_words'], df['mean_score'])

# Report the p-value as the tightest conventional threshold it falls under,
# or as an exact value when it is not below 0.05
if p_value < 0.0001:
    p_text = 'P < 0.0001'
elif p_value < 0.001:
    p_text = 'P < 0.001'
elif p_value < 0.01:
    p_text = 'P < 0.01'
elif p_value < 0.05:
    p_text = 'P < 0.05'
else:
    p_text = f'P = {p_value:.3f}'

# Correlation annotation in the top-right corner of the main plot
ax_main.text(0.95, 0.95, f'r = {corr:.3f}\n{p_text}',
             horizontalalignment='right',
             verticalalignment='top',
             transform=ax_main.transAxes,
             fontsize=14,
             bbox=dict(facecolor='white', alpha=0.8, edgecolor='none', boxstyle='round,pad=0.5'))

# Plot the X-axis marginal distribution (histogram)
sns.histplot(df['idea_length_in_words'], ax=ax_x, kde=False, color='#3484bf', alpha=0.7,
             stat='density', linewidth=0, binwidth=5)
ax_x.set_xlim(0, 200)
ax_x.set_ylabel('Density', fontsize=12)
ax_x.spines['top'].set_visible(False)
ax_x.spines['right'].set_visible(False)
ax_x.set_xlabel('')  # the main plot already carries the x label
ax_x.tick_params(axis='x', labelbottom=False)  # hide the x tick labels

# Ticks of the X-axis histogram
ax_x.xaxis.set_major_locator(ticker.MultipleLocator(25))  # Major ticks at multiples of 25
ax_x.xaxis.set_minor_locator(ticker.MultipleLocator(5))   # Minor ticks at multiples of 5 (consistent with binwidth)

# Plot the Y-axis marginal distribution (histogram)
sns.histplot(y=df['mean_score'], ax=ax_y, kde=False, color='#3484bf', alpha=0.7,
             stat='density', linewidth=0, binwidth=1/3)
ax_y.set_ylim(ax_main.get_ylim())  # Keep the y-axis range consistent with the main plot
ax_y.set_xlabel('Density', fontsize=12)
ax_y.spines['top'].set_visible(False)
ax_y.spines['right'].set_visible(False)
ax_y.set_ylabel('')  # the main plot already carries the y label
ax_y.tick_params(axis='y', labelleft=False)  # hide the y tick labels

# Ticks of the Y-axis histogram
ax_y.yaxis.set_major_locator(ticker.MultipleLocator(2))   # Major ticks at multiples of 2
ax_y.yaxis.set_minor_locator(ticker.MultipleLocator(1/3)) # Minor ticks at multiples of 1/3 (consistent with binwidth)

# Beautify the main plot and set the ticks
ax_main.spines['top'].set_visible(False)
ax_main.spines['right'].set_visible(False)
ax_main.set_xlim(0, 200)

# Synchronize the ticks of the main plot with the marginal plots
ax_main.xaxis.set_major_locator(ticker.MultipleLocator(25))  # Major ticks at multiples of 25
ax_main.xaxis.set_minor_locator(ticker.MultipleLocator(5))   # Minor ticks at multiples of 5
ax_main.yaxis.set_major_locator(ticker.MultipleLocator(2))   # Major ticks at multiples of 2
ax_main.yaxis.set_minor_locator(ticker.MultipleLocator(1/3)) # Minor ticks at multiples of 1/3

# Set the axis labels of the main plot
ax_main.set_xlabel('Idea Length (words)', fontsize=14)
ax_main.set_ylabel('Mean Score of Each Idea, $\\frac{1}{3}$ (Orig.+Feas.+Clar.)', fontsize=14)
# plt.tick_params acts on the *current* axes, which here is cbar_ax (the last
# subplot created), so the main plot never received this tick size. Apply it
# explicitly to the main plot, and to cbar_ax to keep its previous setting.
for tick_target in (ax_main, cbar_ax):
    tick_target.tick_params(axis='both', which='major', labelsize=12)

# Add the color bar on the left - narrower style
cbar = plt.colorbar(hb, cax=cbar_ax)
cbar.set_label('log(count)', fontsize=12)
cbar_ax.yaxis.set_label_position('left')  # Place the label on the left side
cbar_ax.yaxis.tick_left()  # Place the ticks on the left side

# Adjust the layout, save as PDF and display
plt.tight_layout()
plt.savefig('figs/idea_length_mean_score_corr.pdf', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Distinct keywords and idea models, in order of first appearance
all_keywords = df['keywords'].drop_duplicates().tolist()
all_idea_models = df['idea_model'].drop_duplicates().tolist()
# all_keywords
all_idea_models

In [None]:
import matplotlib.pyplot as plt
import pandas.core.common as com

# 30th-percentile mean_score of every idea model, computed with one groupby.
# The earlier per-model loop stored its temporary slice in `df_view`, which
# overwrote the per-model summary table of the same name; no temporary frame
# is needed here, so that name is left alone.
df_thresholds = (
    df.groupby('idea_model')['mean_score']
    .quantile(0.3)
    .rename('threshold')
    .reset_index()
)
# Same display names as the ranking figures (clean_model_name), instead of a
# separate split('/') rule that kept suffixes such as ':free'.
df_thresholds['model'] = df_thresholds['idea_model'].map(clean_model_name)
# Ascending order: the bar with the largest threshold is drawn last (on top)
df_thresholds = df_thresholds.sort_values('threshold', ascending=True)

# Horizontal bars at integer positions; the names go on as tick labels so two
# models that share a display name still get separate bars.
fig, ax = plt.subplots(figsize=(12, 5))
bar_positions = list(range(len(df_thresholds)))
bars = ax.barh(bar_positions, df_thresholds['threshold'])
ax.set_yticks(bar_positions)
ax.set_yticklabels(df_thresholds['model'])
ax.set_xlabel('30th Percentile Score')
ax.set_title('Model Performance Comparison (30th Percentile)')

# Value label at the end of each bar (the bar width is the threshold)
for bar in bars:
    ax.text(bar.get_width(), bar.get_y() + bar.get_height()/2,
            f'{bar.get_width():.1f}',
            va='center')

# Adjust the layout to avoid label truncation
fig.tight_layout()
plt.show()

In [None]:
# Average the scores per (idea model, idea, keywords) group, best mean score
# first, and keep only the groups that have exactly three critic rows
per_idea_aggregations = {
    'originality': ['mean'],
    'feasibility': ['mean'],
    'clarity': ['mean'],
    'mean_score': ['mean'],
    'critic_model': ['count'],
}
df_view = (
    df.groupby(['idea_model', 'idea', 'keywords'])
    .agg(per_idea_aggregations)
    .sort_values(('mean_score', 'mean'), ascending=False)
)
critic_rows = df_view[('critic_model', 'count')]
df_view = df_view[critic_rows == 3]
display(df_view)

In [None]:
import pandas as pd


# Persist the processed frame. The target directory is created first, in the
# same way './figs/' is, because writing fails when './data/' does not exist.
os.makedirs('./data/', exist_ok=True)
df.to_parquet('./data/data.parquet')


# Model rejection rate statistics

In [None]:
# Share of all rows whose first attempt was rejected, as a percentage
n_first_rejected = df['first_was_rejected'].sum()
overall_reject_rate = n_first_rejected / len(df) * 100
print(f'The Overall rejection rate is {overall_reject_rate:.4f}%')

In [None]:
# A row is a "true rejection" when its first attempt was rejected and the
# recorded rejection response is longer than two characters
was_rejected = df['first_was_rejected'].eq(True)
has_response_text = df['first_reject_response'].str.len().gt(2)
df['true_rejection'] = was_rejected & has_response_text

In [None]:
# Spot-check up to 20 rows flagged as true rejections. The sample size is
# capped at the number of available rows (sample(20) raises when fewer exist)
# and random_state makes the selection repeatable across runs.
true_rejection_rows = df.loc[df['true_rejection'], ['keywords','idea_model','first_reject_response']]
true_rejection_rows.sample(n=min(20, len(true_rejection_rows)), random_state=42)

In [None]:
# Total number of rows for each idea model
model_counts = df['idea_model'].value_counts()

# Rows flagged as true rejections, per model. Reindexing onto model_counts
# gives a model without any rejection an explicit 0; otherwise it is missing
# from these counts and its count and rate appear as NaN in the table.
true_rejected_counts = (
    df.loc[df['true_rejection'] == True, 'idea_model']
    .value_counts()
    .reindex(model_counts.index, fill_value=0)
)

# True rejection rate in percent, highest first
true_rejection_rates = (true_rejected_counts / model_counts * 100).sort_values(ascending=False)

# One row per model: total rows, rejected rows and rejection rate
true_rejection_stats = pd.DataFrame({
    'Total Samples': model_counts,
    'Truly Rejected Samples': true_rejected_counts,
    'True Rejection Rate (%)': true_rejection_rates.round(2)
})

# Sort the table by true rejection rate in descending order
true_rejection_stats = true_rejection_stats.sort_values('True Rejection Rate (%)', ascending=False)

# Display the rejection rate statistics
print("Model Rejection Rate Ranking:")
display(true_rejection_stats)

In [None]:
# Total rows per idea model
model_counts = df['idea_model'].value_counts()

# True-rejection rows per model, aligned to model_counts so that a model with
# no rejection is counted as 0 rather than being absent (which made its count
# and rate NaN in the table below).
true_rejected_counts = (
    df.loc[df['true_rejection'] == True, 'idea_model']
    .value_counts()
    .reindex(model_counts.index, fill_value=0)
)

# Rejection rate in percent, highest first
true_rejection_rates = (true_rejected_counts / model_counts * 100).sort_values(ascending=False)

# One row per model: total rows, rejected rows and rejection rate
rejection_stats = pd.DataFrame({
    'Total Samples': model_counts,
    'Rejected Samples': true_rejected_counts,
    'Rejection Rate (%)': true_rejection_rates.round(2)
})

# Sort by rejection rate in descending order
rejection_stats = rejection_stats.sort_values('Rejection Rate (%)', ascending=False)

# Models whose (rounded) rejection rate is above zero; used by the bar chart
non_zero_rejection_stats = rejection_stats[rejection_stats['Rejection Rate (%)'] > 0]

# Display rejection rate statistics for all models
print("Model Rejection Rates Ranking (All Models):")
display(rejection_stats)

# Display only non-zero rejection models
print("\nModel Rejection Rates Ranking (Non-Zero Only):")
display(non_zero_rejection_stats)


# Bar chart of the rejection rate for every model with a non-zero rate
plt.figure(figsize=(10, 6), dpi=100)

model_labels = non_zero_rejection_stats.index
rejection_rates = non_zero_rejection_stats['Rejection Rate (%)']

# One bar per model at integer x positions
x = np.arange(len(model_labels))
bars = plt.bar(x, rejection_rates, color='skyblue', width=0.6)

# Percentage label just above each bar
for x_center, bar in zip(x, bars):
    height = bar.get_height()
    plt.text(x_center, height + 0.005, f'{height:.2f}%',
             ha='center', va='bottom', fontsize=10)

# Model names as rotated tick labels under the bar centers
plt.xticks(x, model_labels, rotation=45, ha='right', fontsize=9)

# Title and axis labels
plt.title('Rejection Rates by Model (Non-Zero Only)', fontsize=14, fontweight='bold', pad=20)
plt.xlabel('Model', fontsize=12, labelpad=10)
plt.ylabel('Rejection Rate (%)', fontsize=12)
plt.yticks(fontsize=9)

# Light dashed horizontal grid
plt.grid(axis='y', linestyle='--', alpha=0.3, zorder=0)

# Thin baseline at y = 0
plt.axhline(y=0, color='black', linewidth=0.8)

# Lay out, then reserve extra room at the bottom for the rotated labels
plt.tight_layout()
plt.subplots_adjust(bottom=0.2)
plt.savefig('figs/model_rejection_rates.pdf')
# Display the chart
plt.show()