In [None]:
# Mount Google Drive at /content/drive; the results cell later writes its CSV
# files under /content/drive/MyDrive/.
from google.colab import drive
# NOTE(review): going by the parameter name, timeout_ms is in milliseconds, so
# 128000000 would be roughly 35.5 hours -- confirm such a long timeout is intended.
drive.mount('/content/drive', timeout_ms=128000000)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# @title
# Install required packages
!pip install openai anthropic pandas matplotlib seaborn

# Import necessary libraries
import os
import time
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import anthropic
from openai import OpenAI
from datetime import datetime
import json

# Setup API clients.
# Keys are read from the environment so that real credentials never have to be
# pasted into (and saved with) the notebook. The original "api-key" placeholder
# is kept as the fallback, so behaviour is unchanged when the variables are unset.

# Setup Anthropic client (key taken from ANTHROPIC_API_KEY)
anthropic_client = anthropic.Anthropic(
    api_key=os.environ.get("ANTHROPIC_API_KEY", "api-key"),
)

# Setup OpenAI client (key taken from OPENAI_API_KEY)
openai_client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY", "api-key"),
)



In [None]:
# @title
# Configuration parameters
# These are module-level knobs read by later cells: MAX_IMPROVEMENTS bounds the
# improvement loop in run_experiment_iterations, and NUM_EXPERIMENTS is passed
# to run_all_experiments (whose own default is 3) by the run cell.
MAX_ITERATIONS = 6  # Maximum number of iterations per experiment (including baseline)
MAX_IMPROVEMENTS = MAX_ITERATIONS - 1  # Maximum number of improvements after baseline
NUM_EXPERIMENTS = 10  # Number of experiments to run per configuration

# Helper function to extract text from Claude's response
def extract_text_from_content(content):
    """
    Collapse a model response payload into a single string.

    Args:
        content: A plain string, a list of blocks, or any other object.

    Returns:
        str: ``content`` itself when it is a string; the concatenation of the
        per-block text when it is a list; ``str(content)`` otherwise.
    """
    def _block_text(block):
        # Resolution order matters: a `.text` attribute wins, then a 'text'
        # dict key, then raw strings, and finally the generic str() fallback.
        if hasattr(block, 'text'):
            return block.text
        if isinstance(block, dict) and 'text' in block:
            return block['text']
        return block if isinstance(block, str) else str(block)

    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        # Anything that is neither a string nor a list is stringified as-is.
        return str(content)
    return "".join([_block_text(block) for block in content])

In [None]:
# @title
def generate_idea_with_claude(temperature=1.0, max_tokens=5000):
    """
    Ask Claude 3.7 Sonnet to "Write a unique idea.", print the reply and return it.

    Args:
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: Value forwarded as the request's ``max_tokens``.

    Returns:
        The reply text as produced by ``extract_text_from_content``.
    """
    request = {
        "model": "claude-3-7-sonnet-20250219",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [{"role": "user", "content": "Write a unique idea."}],
    }
    reply = anthropic_client.messages.create(**request)

    # The reply content is structured, so flatten it to plain text first.
    idea_text = extract_text_from_content(reply.content)
    print(idea_text)
    return idea_text

def generate_idea_with_gpt(temperature=1.0, max_tokens=5000):
    """
    Ask the GPT-4.5 preview model to "Write a unique idea.", print the reply and return it.

    Args:
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: Value forwarded as ``max_completion_tokens``.

    Returns:
        The message content of the first returned choice.
    """
    request = {
        "model": "gpt-4.5-preview-2025-02-27",
        "messages": [{"role": "user", "content": "Write a unique idea."}],
        "response_format": {"type": "text"},
        "temperature": temperature,
        "max_completion_tokens": max_tokens,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai_client.chat.completions.create(**request)

    idea_text = completion.choices[0].message.content
    print(idea_text)
    return idea_text

In [None]:
# @title
def evaluate_idea_with_claude(idea, temperature=1.0, max_tokens=5000):
    """
    Ask Claude 3.7 Sonnet whether an idea is entirely unique.

    The judge is instructed to end its explanation with ``<true>`` or
    ``<false>``. The verdict is taken from whichever of the two tags appears
    LAST in the reply, so an explanation that merely mentions ``<true>`` but
    concludes with ``<false>`` is not counted as an acceptance.

    Args:
        idea: Text of the idea to judge.
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: Value forwarded as the request's ``max_tokens``.

    Returns:
        tuple: (evaluation_text, is_novel) where is_novel is a boolean; it is
        False when the reply contains neither tag.
    """
    prompt = """
Is this idea entirely unique?
1. Explain why or why not.
2. If it is entirely unique, end your explanation with <true>.
If it is not entirely unique, end your explanation with <false>.
"""

    message = anthropic_client.messages.create(
        model="claude-3-7-sonnet-20250219",
        max_tokens=max_tokens,
        temperature=temperature,
        messages=[{"role": "user", "content": f"{idea}\n\n{prompt}"}]
    )

    # Properly handle structured response
    response_text = extract_text_from_content(message.content)
    # rfind returns -1 for a missing tag, so "only <true>" -> True,
    # "only <false>" or "neither" -> False, and "both" -> the later tag wins.
    is_novel = response_text.rfind('<true>') > response_text.rfind('<false>')
    print(response_text)
    return response_text, is_novel

def evaluate_idea_with_gpt(idea, temperature=1.0, max_tokens=5000):
    """
    Ask the GPT-4.5 preview model whether an idea is entirely unique.

    The judge is instructed to end its explanation with ``<true>`` or
    ``<false>``. The verdict is taken from whichever of the two tags appears
    LAST in the reply, so an explanation that merely mentions ``<true>`` but
    concludes with ``<false>`` is not counted as an acceptance.

    Args:
        idea: Text of the idea to judge.
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: Value forwarded as ``max_completion_tokens``.

    Returns:
        tuple: (evaluation_text, is_novel) where is_novel is a boolean; it is
        False when the reply contains neither tag.
    """
    prompt = """
Is this idea entirely unique?
1. Explain why or why not.
2. If it is entirely unique, end your explanation with <true>.
If it is not entirely unique, end your explanation with <false>.
"""

    response = openai_client.chat.completions.create(
        model="gpt-4.5-preview-2025-02-27",
        messages=[
            {"role": "user", "content": f"{idea}\n\n{prompt}"}
        ],
        response_format={"type": "text"},
        temperature=temperature,
        max_completion_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    response_text = response.choices[0].message.content
    print(response_text)
    # rfind returns -1 for a missing tag, so "only <true>" -> True,
    # "only <false>" or "neither" -> False, and "both" -> the later tag wins.
    is_novel = response_text.rfind('<true>') > response_text.rfind('<false>')

    return response_text, is_novel

In [None]:
# @title
def improve_idea_with_claude(idea, temperature=1.0, max_tokens=5000):
    """
    Ask Claude 3.7 Sonnet to rewrite an idea so it is more unique.

    Args:
        idea: Text of the idea to rewrite.
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: Value forwarded as the request's ``max_tokens``.

    Returns:
        The rewritten idea as produced by ``extract_text_from_content``
        (it is also printed).
    """
    request = {
        "model": "claude-3-7-sonnet-20250219",
        "max_tokens": max_tokens,
        "temperature": temperature,
        "messages": [
            {"role": "user", "content": f"Rewrite this idea to make it more unique: {idea}"}
        ],
    }
    reply = anthropic_client.messages.create(**request)

    # The reply content is structured, so flatten it to plain text first.
    improved_idea = extract_text_from_content(reply.content)
    print(improved_idea)
    return improved_idea

def improve_idea_with_gpt(idea, temperature=1.0, max_tokens=5000):
    """
    Ask the GPT-4.5 preview model to rewrite an idea so it is more unique.

    Args:
        idea: Text of the idea to rewrite.
        temperature: Sampling temperature forwarded to the API call.
        max_tokens: Value forwarded as ``max_completion_tokens``.

    Returns:
        The message content of the first returned choice (it is also printed).
    """
    request = {
        "model": "gpt-4.5-preview-2025-02-27",
        "messages": [
            {"role": "user", "content": f"Rewrite this idea to make it more unique: {idea}"}
        ],
        "response_format": {"type": "text"},
        "temperature": temperature,
        "max_completion_tokens": max_tokens,
        "top_p": 1,
        "frequency_penalty": 0,
        "presence_penalty": 0,
    }
    completion = openai_client.chat.completions.create(**request)

    improved_idea = completion.choices[0].message.content
    print(improved_idea)
    return improved_idea

In [None]:
# @title
def run_experiment_iterations(writer_model, judge_model, temperature=1.0, idea_max_tokens=5000, eval_max_tokens=5000):
    """
    Run one writer/judge experiment: a baseline idea plus up to
    MAX_IMPROVEMENTS rewrite rounds, stopping as soon as the judge accepts.

    Args:
        writer_model: 'claude' or 'gpt' -- model that writes and rewrites the idea
        judge_model: 'claude' or 'gpt' -- model that evaluates each idea
        temperature: Temperature passed to every generation and evaluation call
        idea_max_tokens: Maximum tokens for idea generation / rewriting
        eval_max_tokens: Maximum tokens for evaluation

    Returns:
        dict: Results of the experiment. If an exception is raised inside the
        run, a dict with ``error_occurred`` set to True is returned instead;
        it carries the partial ideas/evaluations when at least one idea was
        produced, and only the model and error fields otherwise.
    """
    def stamp():
        # Wall-clock timestamp attached to every recorded item.
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def judge(idea_text):
        # The evaluator is looked up at call time; anything other than
        # 'claude' is routed to GPT.
        evaluate = evaluate_idea_with_claude if judge_model == 'claude' else evaluate_idea_with_gpt
        return evaluate(idea_text, temperature, eval_max_tokens)

    def rewrite(idea_text):
        # Same routing rule as judge(), applied to the writer model.
        improve = improve_idea_with_claude if writer_model == 'claude' else improve_idea_with_gpt
        return improve(idea_text, temperature, idea_max_tokens)

    # Tracking state
    baseline_iteration = 0
    improvement_count = 0
    max_improvements = MAX_IMPROVEMENTS  # Max improvements after baseline
    all_ideas, all_evaluations = [], []
    is_accepted = False

    # Human-readable model details stored alongside the results
    model_info = {
        'claude': {
            'name': 'Claude 3.7 Sonnet',
            'version': 'claude-3-7-sonnet-20250219'
        },
        'gpt': {
            'name': 'GPT-4.5',
            'version': 'gpt-4.5-preview-2025-02-27'
        }
    }
    writer_details = model_info[writer_model]
    judge_details = model_info[judge_model]

    # Leading keys shared by every result dict this function can return
    identity = {
        "writer_model": writer_model,
        "judge_model": judge_model,
        "writer_details": writer_details,
        "judge_details": judge_details,
    }
    settings = {
        "temperature": temperature,
        "idea_max_tokens": idea_max_tokens,
        "eval_max_tokens": eval_max_tokens,
    }

    try:
        # BASELINE: generate and record the initial idea
        print(f"  💻Generating baseline idea with {writer_model}...💻")
        generate = generate_idea_with_claude if writer_model == 'claude' else generate_idea_with_gpt
        current_idea = generate(temperature, idea_max_tokens)
        all_ideas.append({
            "iteration": baseline_iteration,
            "type": "baseline",
            "idea": current_idea,
            "timestamp": stamp()
        })

        # BASELINE EVALUATION
        print(f"  ❓Evaluating baseline idea with {judge_model}...❓")
        evaluation, is_accepted = judge(current_idea)
        all_evaluations.append({
            "iteration": baseline_iteration,
            "type": "baseline",
            "evaluation": evaluation,
            "is_accepted": is_accepted,
            "timestamp": stamp()
        })

        print(f"  Baseline idea {'accepted⭐' if is_accepted else '❌rejected'}")

        if is_accepted:
            print("  ⭐Baseline idea was accepted. No improvements needed.")
        else:
            # IMPROVEMENT LOOP: rewrite and re-evaluate until accepted or the
            # budget runs out. improvement_count keeps its last value after
            # the loop (and stays 0 when the budget is empty).
            for improvement_count in range(1, max_improvements + 1):
                current_iteration = baseline_iteration + improvement_count

                print(f"  ⬆️Improvement {improvement_count}/{max_improvements}: ⬆️Improving idea with {writer_model}...")
                current_idea = rewrite(current_idea)
                all_ideas.append({
                    "iteration": current_iteration,
                    "type": "improvement",
                    "improvement_number": improvement_count,
                    "idea": current_idea,
                    "timestamp": stamp()
                })

                print(f"  ⬆️Improvement {improvement_count}/{max_improvements}: ❓Evaluating idea with {judge_model}...")
                evaluation, is_accepted = judge(current_idea)
                all_evaluations.append({
                    "iteration": current_iteration,
                    "type": "improvement",
                    "improvement_number": improvement_count,
                    "evaluation": evaluation,
                    "is_accepted": is_accepted,
                    "timestamp": stamp()
                })

                print(f"  ⬆️Improvement {improvement_count}/{max_improvements}: Idea {'⭐accepted' if is_accepted else '❌rejected'}")

                if is_accepted:
                    print(f"  ⭐Idea accepted after {improvement_count} improvements.")
                    break
            else:
                # Loop finished without a break, i.e. no round was accepted.
                print(f"  ❌Reached maximum number of improvements ({max_improvements}) without acceptance.")

        # Compile results
        return {
            **identity,
            **settings,
            "improvements_to_acceptance": improvement_count if is_accepted else -1,
            "total_iterations": improvement_count + 1,  # Baseline counts as one iteration
            "was_accepted": is_accepted,
            "baseline_was_accepted": is_accepted and improvement_count == 0,
            "all_ideas": all_ideas,
            "all_evaluations": all_evaluations,
            "error_occurred": False,
            "error_message": None,
            "timestamp": stamp()
        }

    except Exception as e:
        error_message = str(e)
        print(f"  Error in experiment: {error_message}")

        failure = dict(identity)
        if len(all_ideas) > 0:
            # At least the baseline idea exists: return the partial run.
            failure.update(settings)
            failure.update({
                "improvements_to_acceptance": -1,
                "total_iterations": baseline_iteration + improvement_count + 1,
                "was_accepted": False,
                "baseline_was_accepted": False,
                "all_ideas": all_ideas,
                "all_evaluations": all_evaluations,
            })
        failure.update({
            "error_occurred": True,
            "error_message": error_message,
            "timestamp": stamp()
        })
        return failure

In [None]:
# @title


In [None]:
# @title
def run_all_experiments(num_experiments=3, temperature=1.0, idea_max_tokens=5000, eval_max_tokens=5000, delay_seconds=3):
    """
    Run experiments for all four writer/judge configurations.

    Args:
        num_experiments: Number of experiments per configuration
        temperature: Temperature for generation
        idea_max_tokens: Maximum tokens for idea generation
        eval_max_tokens: Maximum tokens for evaluation
        delay_seconds: Pause inserted before every experiment except the first
            to reduce the chance of rate limiting; a value <= 0 disables it

    Returns:
        list: Result dicts of all experiments, in execution order. Empty when
        ``num_experiments`` is 0.
    """
    all_results = []
    failed_experiments = 0
    configurations = [
        ("claude", "gpt"),    # Claude 3.7 as writer, GPT-4.5 as judge
        ("gpt", "claude"),    # GPT-4.5 as writer, Claude 3.7 as judge
        ("claude", "claude"), # Claude 3.7 as writer, Claude 3.7 as judge
        ("gpt", "gpt")        # GPT-4.5 as writer, GPT-4.5 as judge
    ]

    total_experiments = len(configurations) * num_experiments
    experiment_count = 0

    for writer, judge in configurations:
        print(f"\nRunning experiments for 💻{writer.upper()} as writer and ❓{judge.upper()} as judge...")

        config_failed = 0

        for i in range(num_experiments):
            experiment_count += 1
            # Safe: this loop body only runs when total_experiments > 0.
            progress = (experiment_count / total_experiments) * 100
            print(f"Experiment {i+1}/{num_experiments} (Overall progress: {progress:.1f}%)...")

            # Add a small delay to avoid rate limiting
            if experiment_count > 1 and delay_seconds > 0:
                print(f"Waiting {delay_seconds} seconds to avoid rate limiting...")
                time.sleep(delay_seconds)

            result = run_experiment_iterations(
                writer, judge, temperature, idea_max_tokens, eval_max_tokens
            )

            if result is None:
                # Defensive: kept in case the runner ever returns nothing.
                failed_experiments += 1
                config_failed += 1
                print("Experiment failed completely. Skipping.")
                continue

            all_results.append(result)

            if result.get("error_occurred", False):
                failed_experiments += 1
                config_failed += 1
                print(f"Result: FAILED - {result.get('error_message', 'Unknown error')}")
            elif result["was_accepted"]:
                if result.get("baseline_was_accepted", False):
                    print("Result: ACCEPTED at baseline (no improvements needed)")
                else:
                    improvements = result["improvements_to_acceptance"]
                    print(f"Result: ACCEPTED after {improvements} improvements")
            else:
                print("Result: NOT ACCEPTED after maximum improvements")

        print(f"{config_failed}/{num_experiments} experiments failed for {writer}/{judge} configuration")

    # Guard the percentage: with num_experiments=0 there is nothing to divide by.
    failure_pct = (failed_experiments / total_experiments * 100) if total_experiments else 0.0
    print(f"\nTotal failed experiments: {failed_experiments}/{total_experiments} ({failure_pct:.1f}%)")
    return all_results

In [None]:
# @title
def create_results_dataframe(all_results):
    """
    Turn the list of raw experiment result dicts into pandas DataFrames.

    Args:
        all_results: List of experiment results; the list index becomes the
            ``experiment_id``

    Returns:
        tuple: (df_main, df_ideas, df_evaluations, df_flat) where
            df_main has one row per experiment,
            df_ideas / df_evaluations have one row per recorded idea / evaluation,
            df_flat has one CSV-friendly row per experiment iteration, or a
            single ``type == "error"`` row for an experiment flagged as failed.
    """
    # Experiment-level columns copied verbatim from df_main into every flat row.
    shared_columns = (
        "writer_model", "judge_model",
        "writer_name", "judge_name",
        "writer_version", "judge_version",
        "temperature", "idea_max_tokens", "eval_max_tokens",
        "improvements_to_acceptance", "total_iterations",
        "was_accepted", "baseline_was_accepted",
    )

    def _has_details(result):
        # A result without both lists comes from a run that failed before
        # producing any idea.
        return "all_ideas" in result and "all_evaluations" in result

    def _select(frame, column, value):
        # Rows of `frame` where `column` equals `value`; an empty frame (which
        # may lack the column entirely) yields a fresh empty DataFrame.
        return frame[frame[column] == value] if not frame.empty else pd.DataFrame()

    def _first(frame, column, default):
        # First value of `column`, or `default` when there are no rows.
        return frame[column].iloc[0] if not frame.empty else default

    def _last_iteration(frame):
        return frame['iteration'].max() if not frame.empty else -1

    # ---- one row per experiment -------------------------------------------
    main_data = []
    for experiment_id, result in enumerate(all_results):
        if _has_details(result):
            record = {
                "experiment_id": experiment_id,
                "writer_model": result["writer_model"],
                "judge_model": result["judge_model"],
                "writer_name": result["writer_details"]["name"],
                "judge_name": result["judge_details"]["name"],
                "writer_version": result["writer_details"]["version"],
                "judge_version": result["judge_details"]["version"],
                "temperature": result["temperature"],
                "idea_max_tokens": result["idea_max_tokens"],
                "eval_max_tokens": result["eval_max_tokens"],
                "improvements_to_acceptance": result["improvements_to_acceptance"],
                "total_iterations": result["total_iterations"],
                "was_accepted": result["was_accepted"],
                "baseline_was_accepted": result.get("baseline_was_accepted", False),
                "error_occurred": result.get("error_occurred", False),
                "error_message": result.get("error_message", None),
                "timestamp": result["timestamp"]
            }
        else:
            # Completely failed experiment: fill in neutral defaults.
            record = {
                "experiment_id": experiment_id,
                "writer_model": result.get("writer_model", "unknown"),
                "judge_model": result.get("judge_model", "unknown"),
                "writer_name": result.get("writer_details", {}).get("name", "unknown"),
                "judge_name": result.get("judge_details", {}).get("name", "unknown"),
                "writer_version": result.get("writer_details", {}).get("version", "unknown"),
                "judge_version": result.get("judge_details", {}).get("version", "unknown"),
                "temperature": result.get("temperature", 0),
                "idea_max_tokens": result.get("idea_max_tokens", 0),
                "eval_max_tokens": result.get("eval_max_tokens", 0),
                "improvements_to_acceptance": -2,  # Use -2 to indicate failure
                "total_iterations": 0,
                "was_accepted": False,
                "baseline_was_accepted": False,
                "error_occurred": result.get("error_occurred", True),
                "error_message": result.get("error_message", "Unknown error"),
                "timestamp": result.get("timestamp", datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
            }
        main_data.append(record)

    df_main = pd.DataFrame(main_data)

    # ---- one row per idea / per evaluation --------------------------------
    idea_rows, eval_rows = [], []
    for experiment_id, result in enumerate(all_results):
        if not _has_details(result):
            continue

        writer, judge = result["writer_model"], result["judge_model"]

        for idea in result["all_ideas"]:
            kind = idea.get("type", "unknown")
            idea_rows.append({
                "experiment_id": experiment_id,
                "writer_model": writer,
                "judge_model": judge,
                "iteration": idea["iteration"],
                "type": kind,
                # Only improvement entries carry a meaningful number.
                "improvement_number": idea.get("improvement_number", 0) if kind == "improvement" else 0,
                "idea_text": idea["idea"],
                "timestamp": idea["timestamp"]
            })

        for evaluation in result["all_evaluations"]:
            kind = evaluation.get("type", "unknown")
            eval_rows.append({
                "experiment_id": experiment_id,
                "writer_model": writer,
                "judge_model": judge,
                "iteration": evaluation["iteration"],
                "type": kind,
                "improvement_number": evaluation.get("improvement_number", 0) if kind == "improvement" else 0,
                "evaluation_text": evaluation["evaluation"],
                "is_accepted": evaluation["is_accepted"],
                "timestamp": evaluation["timestamp"]
            })

    df_ideas = pd.DataFrame(idea_rows)
    df_evaluations = pd.DataFrame(eval_rows)

    # ---- flattened, CSV-friendly view -------------------------------------
    flat_rows = []
    for _, row in df_main.iterrows():
        exp_id = row['experiment_id']
        shared = {column: row[column] for column in shared_columns}

        # Experiments flagged as failed collapse into a single "error" row.
        if row.get('error_occurred', False):
            flat_rows.append({
                "experiment_id": exp_id,
                "iteration": -1,
                "type": "error",
                **shared,
                "experiment_timestamp": row['timestamp'],
                "idea_text": "",
                "evaluation_text": "",
                "is_iteration_accepted": False,
                "error_occurred": True,
                "error_message": row.get('error_message', "Unknown error")
            })
            continue

        exp_ideas = _select(df_ideas, 'experiment_id', exp_id)
        exp_evals = _select(df_evaluations, 'experiment_id', exp_id)

        # Nothing recorded for this experiment: no flat rows.
        if exp_ideas.empty and exp_evals.empty:
            continue

        max_iter = max(_last_iteration(exp_ideas), _last_iteration(exp_evals))

        # Pair the idea and the evaluation that share an iteration number;
        # a missing half is filled with defaults.
        for iter_num in range(max_iter + 1):
            idea_row = _select(exp_ideas, 'iteration', iter_num)
            eval_row = _select(exp_evals, 'iteration', iter_num)

            flat_rows.append({
                "experiment_id": exp_id,
                "iteration": iter_num,
                "type": _first(idea_row, 'type', "unknown"),
                "improvement_number": _first(idea_row, 'improvement_number', 0),
                **shared,
                "experiment_timestamp": row['timestamp'],
                "idea_text": _first(idea_row, 'idea_text', ""),
                "evaluation_text": _first(eval_row, 'evaluation_text', ""),
                "is_iteration_accepted": _first(eval_row, 'is_accepted', False),
                "error_occurred": False,
                "error_message": None
            })

    df_flat = pd.DataFrame(flat_rows)

    return df_main, df_ideas, df_evaluations, df_flat

In [None]:
# @title
# Run all experiments and persist the results
print("Starting all experiments...")
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
print(f"Experiment run timestamp: {timestamp}")

# Record parameters
print(f"\nExperiment parameters:")
print(f"Number of experiments per configuration: {NUM_EXPERIMENTS}")
print(f"Maximum iterations per experiment: {MAX_ITERATIONS} (1 baseline + up to {MAX_IMPROVEMENTS} improvements)")

# Make sure the destination folder exists BEFORE spending time (and API calls)
# on the experiments, so a missing directory cannot cost us the whole run.
output_dir = "/content/drive/MyDrive/Novelty_experiments/uniqueness_evaluation_using_llm_generated_baselines_entirely_unique_"
os.makedirs(output_dir, exist_ok=True)

# Run experiments
all_results = run_all_experiments(
    num_experiments=NUM_EXPERIMENTS
)

# Save raw results to JSON first: if the DataFrame conversion or a CSV write
# fails afterwards, the raw data is already on disk.
# NOTE(review): this file goes to the current working directory, not the Drive
# folder used for the CSVs -- confirm that is intended.
raw_json_path = f"novelty_experiments_raw_{timestamp}.json"
with open(raw_json_path, 'w') as f:
    json.dump(all_results, f, indent=2, default=str)

# Create dataframes for analysis
df_main, df_ideas, df_evaluations, df_flat = create_results_dataframe(all_results)

# Save to CSV files (file names unchanged from the original cell)
csv_outputs = [
    (df_main, f"{output_dir}/novelty_experiments_main_{timestamp}.csv"),
    (df_ideas, f"{output_dir}/novelty_experiments_ideas_{timestamp}.csv"),
    (df_evaluations, f"{output_dir}/novelty_experiments_evaluations_{timestamp}.csv"),
    (df_flat, f"{output_dir}/novelty_experiments_all_data_llm-baseline_{timestamp}.csv"),
]
for frame, csv_path in csv_outputs:
    frame.to_csv(csv_path, index=False)

# Report the paths that were actually written (the previous listing named a
# non-existent "all_data_<timestamp>.csv" file).
print(f"\nResults saved with timestamp: {timestamp}")
print(f"Files created:")
for _, csv_path in csv_outputs:
    print(f"  - {csv_path}")
print(f"  - {raw_json_path}")

Starting all experiments...
Experiment run timestamp: 20250302_045210

Experiment parameters:
Number of experiments per configuration: 10
Maximum iterations per experiment: 6 (1 baseline + up to 5 improvements)

Running experiments for 💻CLAUDE as writer and ❓GPT as judge...
Experiment 1/10 (Overall progress: 2.5%)...
  💻Generating baseline idea with claude...💻
# The Digital Time Capsule Museum

Imagine a museum where nothing is physically displayed. Instead, visitors wear AR glasses to view "time capsules" - digital collections preserved exactly as they were at specific moments in history. 

Each capsule contains a person's complete digital footprint from a particular day: their social media posts, search history, photos taken, music streamed, articles read, and messages sent (with privacy protections).

The museum would display capsules from diverse individuals across decades - from the early internet days through our present. Visitors could experience how digital life evolved, see hi

In [None]:
# @title
# Split the per-experiment table into failed and successful runs
failed_experiments = df_main.loc[df_main['error_occurred'].eq(True)]
successful_experiments = df_main.loc[df_main['error_occurred'].eq(False)]

# Headline counts, each with its share of all experiments
print("\nBasic Analysis:")
print(f"Total experiments: {len(df_main)}")
for label, subset in (("Failed", failed_experiments), ("Successful", successful_experiments)):
    print(f"{label} experiments: {len(subset)} ({len(subset) / len(df_main) * 100:.1f}%)")

if len(successful_experiments) > 0:
    # Assuming all_results, df_main, df_ideas, df_evaluations, and df_flat are already available

    # Fix the summary creation
    print("\nSummary by configuration:")

    # Create summary statistics (using only successful experiments)
    successful_experiments = df_main[df_main['error_occurred'] == False]
    summary = successful_experiments.groupby(['writer_model', 'judge_model']).agg({
        'was_accepted': 'mean',
        'baseline_was_accepted': 'mean',
        'improvements_to_acceptance': ['mean', 'median', 'count']
    })

    # Reset index to make groupby keys regular columns
    summary = summary.reset_index()

    # Flatten multi-level columns correctly
    if isinstance(summary.columns, pd.MultiIndex):
        # Create new column names
        new_cols = []
        for col in summary.columns:
            if isinstance(col, tuple):
                if col[0] in ['was_accepted', 'baseline_was_accepted', 'improvements_to_acceptance']:
                    new_cols.append(f"{col[0]}_{col[1]}")
                else:
                    # For groupby keys, use the name without underscore
                    new_cols.append(col[0])
            else:
                new_cols.append(col)

        summary.columns = new_cols

    # Calculate acceptance rates
    summary['acceptance_rate'] = summary['was_accepted_mean'] * 100
    summary['baseline_acceptance_rate'] = summary['baseline_was_accepted_mean'] * 100

    # Handle improved accepted data
    accepted_df = successful_experiments[successful_experiments['was_accepted'] == True]
    improved_accepted_df = accepted_df[accepted_df['baseline_was_accepted'] == False]

    if not improved_accepted_df.empty:
        improved_accepted_only = improved_accepted_df.groupby(
            ['writer_model', 'judge_model']
        )['improvements_to_acceptance'].mean().reset_index()

        improved_accepted_only = improved_accepted_only.rename(
            columns={'improvements_to_acceptance': 'avg_improvements_when_needed'}
        )

        summary = summary.merge(
            improved_accepted_only,
            on=['writer_model', 'judge_model'],
            how='left'
        )
    else:
        summary['avg_improvements_when_needed'] = None

    print(summary)

# Now you can continue with the plotting code as provided in my previous response

    # ----------------- Main Plot -----------------
    # 2x2 overview figure. Panels 1-3 are the same grouped bar chart over different
    # summary columns, so they are driven from one table:
    # (subplot position, summary column, title, y label, clamp y to 0-100, fallback text)
    plt.figure(figsize=(15, 12))

    # NOTE(review): 'improvements_to_acceptance_count' is a count of
    # improvements_to_acceptance values, which skips missing entries — confirm it
    # really equals the number of experiments per configuration.
    bar_panels = [
        (1, 'acceptance_rate', 'Overall Acceptance Rate by Configuration',
         'Acceptance Rate (%)', True, "No acceptance data available"),
        (2, 'baseline_acceptance_rate', 'Baseline Idea Acceptance Rate',
         'Baseline Acceptance Rate (%)', True, "No baseline acceptance data"),
        (3, 'improvements_to_acceptance_count', 'Number of Experiments by Configuration',
         'Count', False, "No experiment count data available"),
    ]

    for position, column, panel_title, y_label, percent_axis, fallback_text in bar_panels:
        plt.subplot(2, 2, position)
        if column in summary.columns:
            sns.barplot(x='writer_model', y=column, hue='judge_model', data=summary)
            plt.xlabel('Writer Model')
            plt.ylabel(y_label)
            if percent_axis:
                plt.ylim(0, 100)
        else:
            plt.text(0.5, 0.5, fallback_text, horizontalalignment='center', verticalalignment='center')
        plt.title(panel_title)

    # Panel 4: average improvements needed when the baseline was rejected but the
    # idea was eventually accepted
    plt.subplot(2, 2, 4)
    needed_column = 'avg_improvements_when_needed'
    has_needed_data = needed_column in summary.columns and not summary[needed_column].isna().all()
    if has_needed_data:
        sns.barplot(x='writer_model', y=needed_column, hue='judge_model', data=summary)
        plt.xlabel('Writer Model')
        plt.ylabel('Avg Improvements')
    else:
        # Pick the notice that explains why there is nothing to draw
        if len(accepted_df) == 0:
            notice = "No ideas were accepted in this run"
        elif accepted_df['baseline_was_accepted'].sum() == len(accepted_df):
            notice = "All accepted ideas were accepted at baseline\n(no improvements needed)"
        else:
            notice = "No data available for improvements when needed"
        plt.text(0.5, 0.5, notice,
                 horizontalalignment='center', verticalalignment='center')
    plt.title('Avg Improvements When Needed\n(Baseline Rejected but Eventually Accepted)')

    # Save before showing so the file is written from the fully drawn figure
    plt.tight_layout()
    plt.savefig(f"novelty_experiments_analysis_{timestamp}.png")
    plt.show()

    # ----------------- Detailed Plot -----------------
    plt.figure(figsize=(15, 10))

    # Detailed Plot: Distribution of improvements needed for acceptance
    if len(accepted_df) > 0:
        plt.subplot(2, 1, 1)
        # Number of accepted ideas per (writer, judge, improvements needed)
        improvements_dist = accepted_df.groupby(['writer_model', 'judge_model', 'improvements_to_acceptance']).size().reset_index(name='count')
        # The chart is keyed by improvements x writer only. Passing the per-judge rows
        # straight to barplot would make seaborn reduce them with its default estimator
        # (the mean) and draw error bars, so the "Count" axis would show an average
        # rather than a count. Total the rows across judges first.
        improvements_totals = (
            improvements_dist
            .groupby(['writer_model', 'improvements_to_acceptance'], as_index=False)['count']
            .sum()
        )
        sns.barplot(data=improvements_totals, x='improvements_to_acceptance', y='count', hue='writer_model')
        plt.title('Distribution of Improvements Needed for Acceptance')
        plt.xlabel('Number of Improvements')
        plt.ylabel('Count')
    else:
        # Nothing was accepted: show a notice instead of an empty chart
        plt.text(0.5, 0.5, "No accepted ideas to plot detailed improvements distribution", horizontalalignment='center', verticalalignment='center')
        plt.title('Distribution of Improvements Needed for Acceptance')

    # Save before showing so the file is written from the fully drawn figure
    plt.tight_layout()
    plt.savefig(f"novelty_experiments_detailed_{timestamp}.png")
    plt.show()

In [None]:
# @title
# Make sure we have the right columns in our summary DataFrame
print("Summary columns before adding acceptance stats:", summary.columns.tolist())

# Absolute numbers per configuration: how many experiments ran and how many were
# accepted. Named aggregation assigns the final column names directly, so there is
# no MultiIndex to flatten afterwards.
acceptance_counts = (
    successful_experiments
    .groupby(['writer_model', 'judge_model'])
    .agg(
        total_experiments=('was_accepted', 'count'),
        absolute_acceptances=('was_accepted', 'sum'),
    )
    .reset_index()
)

print("Acceptance counts columns:", acceptance_counts.columns.tolist())
print("Acceptance counts data:", acceptance_counts)

# Merge with summary. Columns left behind by an earlier run of this cell are dropped
# first: otherwise the merge would rename the overlapping columns with _x/_y suffixes
# and the lookups below would raise KeyError on re-run.
stale_columns = ['total_experiments', 'absolute_acceptances', 'acceptance_stats']
summary = (
    summary
    .drop(columns=stale_columns, errors='ignore')
    .merge(acceptance_counts, on=['writer_model', 'judge_model'], how='left')
)

# Add a column for presenting the acceptance statistic in a more readable format,
# e.g. "7 / 10 (70.0%)"
summary['acceptance_stats'] = (
    summary['absolute_acceptances'].astype(int).astype(str)
    + ' / '
    + summary['total_experiments'].astype(int).astype(str)
    + ' ('
    + summary['acceptance_rate'].round(1).astype(str)
    + '%)'
)

print("Summary columns after adding acceptance stats:", summary.columns.tolist())
print("\nSummary of acceptances by configuration:")
print(summary[['writer_model', 'judge_model', 'absolute_acceptances', 'total_experiments', 'acceptance_rate', 'acceptance_stats']])

# Now create the plots with the updated summary DataFrame
# ----------------- Absolute Acceptances Plot -----------------
def _annotate_acceptance_counts(ax, data, value_column, y_offset):
    """Write an "accepted/total" label above each bar of a grouped bar chart.

    Args:
        ax: Axes holding a bar chart of ``data`` with x='writer_model' and
            hue='judge_model'.
        data: DataFrame with 'writer_model', 'judge_model', 'absolute_acceptances',
            'total_experiments' and ``value_column`` columns.
        value_column: Column plotted as bar height; the label sits just above it.
        y_offset: Vertical gap between the bar top and the label, in data units.
    """
    # Category and hue order, taken as order of first appearance in the data
    writer_order = list(data['writer_model'].unique())
    judge_order = list(data['judge_model'].unique())
    if not judge_order:
        return  # nothing plotted, nothing to label

    # Assumes seaborn's default layout: each writer group is 0.8 wide, centred on its
    # integer position and split evenly between the judge bars. With two judges this
    # gives the -0.2 / +0.2 offsets; it also stays aligned for one or three-plus.
    group_width = 0.8
    bar_width = group_width / len(judge_order)

    for _, row in data.iterrows():
        writer_idx = writer_order.index(row['writer_model'])
        judge_idx = judge_order.index(row['judge_model'])
        x_pos = writer_idx - group_width / 2 + (judge_idx + 0.5) * bar_width

        ax.text(x_pos, row[value_column] + y_offset,
                f"{int(row['absolute_acceptances'])}/{int(row['total_experiments'])}",
                ha='center', va='bottom')


acceptances_fig, (counts_ax, rates_ax) = plt.subplots(1, 2, figsize=(15, 8))

# Plot absolute acceptances by configuration
sns.barplot(x='writer_model', y='absolute_acceptances', hue='judge_model', data=summary, ax=counts_ax)
counts_ax.set_title('Absolute Number of Accepted Ideas by Configuration')
counts_ax.set_xlabel('Writer Model')
counts_ax.set_ylabel('Number of Accepted Ideas')

# Add text labels on top of bars
_annotate_acceptance_counts(counts_ax, summary, 'absolute_acceptances', 0.1)

# Plot acceptance rates (percentages) for comparison
sns.barplot(x='writer_model', y='acceptance_rate', hue='judge_model', data=summary, ax=rates_ax)
rates_ax.set_title('Acceptance Rate by Configuration (%)')
rates_ax.set_xlabel('Writer Model')
rates_ax.set_ylabel('Acceptance Rate (%)')
rates_ax.set_ylim(0, 100)

# Add text annotations with the absolute numbers
_annotate_acceptance_counts(rates_ax, summary, 'acceptance_rate', 2)

# Save before showing so the file is written from the fully drawn figure
acceptances_fig.tight_layout()
acceptances_fig.savefig(f"novelty_experiments_acceptances_{timestamp}.png")
plt.show()

# When running on Google Colab, also send the acceptances figure to the browser
try:
    from google.colab import files
    acceptances_png = f"novelty_experiments_acceptances_{timestamp}.png"
    files.download(acceptances_png)
except ImportError:
    # google.colab cannot be imported here, so there is nothing to download
    pass

In [None]:
# Add Google Colab file download functionality.
# The whole cell is best effort: outside Colab the import fails and only a notice
# is printed.
try:
    from google.colab import files

    # Download the comprehensive CSV file
    print("\nDownloading the comprehensive CSV file...")
    files.download(f"/content/drive/MyDrive/Novelty_experiments/uniqueness_evaluation_using_llm_generated_baselines_entirely_unique_/novelty_experiments_all_data_{timestamp}.csv")

    # Download the analysis plots
    print("Downloading the analysis plots...")
    files.download(f"novelty_experiments_analysis_{timestamp}.png")
    try:
        files.download(f"novelty_experiments_detailed_{timestamp}.png")
    except Exception:
        # The detailed plot is optional. Catch Exception rather than using a bare
        # except so KeyboardInterrupt / SystemExit still propagate.
        print("Detailed analysis plot not available")

    print("Files downloaded successfully")
except ImportError:
    # The original line ended with a stray `.")` after the call, a SyntaxError that
    # stopped the whole cell from running.
    print("Google Colab environment not detected. Files saved to local directory.")