In [14]:
import glob
import json
import os
import random
import re
import time
import uuid
import warnings
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from tqdm.notebook import tqdm
# Suppress every Python warning from here on: keeps cell output short, but also hides deprecation notices.
warnings.filterwarnings("ignore")

In [15]:
# Make sure the folders that receive result files and figures exist before anything is written
for output_dir in ("results", "images"):
    os.makedirs(output_dir, exist_ok=True)

In [17]:
# Function to check if Ollama server is running
def check_ollama_server():
    """Report whether the local Ollama server answers on its version endpoint.

    Sends a GET request (5 second timeout) to ``/api/version`` on
    ``localhost:11434`` and prints a one-line status message.

    Returns:
        bool: True when the server replies with HTTP 200, False for any other
        status code or when the request itself fails.
    """
    try:
        reply = requests.get("http://localhost:11434/api/version", timeout=5)

        # Anything other than 200 is treated as "not usable"
        if reply.status_code != 200:
            print(f"Ollama server returned unexpected status code: {reply.status_code}")
            return False

        version = reply.json().get('version', 'unknown')
        print(f"Ollama server is running. Version: {version}")
        return True
    except requests.exceptions.RequestException as err:
        print(f"Cannot connect to Ollama server: {err}")
        print("Please start Ollama server with 'ollama serve' in a terminal")
        return False

In [18]:
# Function to list available models
def list_available_models():
    """Fetch the names of the models known to the local Ollama server.

    Sends a GET request (5 second timeout) to ``/api/tags`` on
    ``localhost:11434`` and prints the comma-separated model names.

    Returns:
        list: The ``name`` field of every entry under ``models`` in the reply,
        or an empty list when the status code is not 200 or the request fails.
    """
    try:
        reply = requests.get("http://localhost:11434/api/tags", timeout=5)

        if reply.status_code != 200:
            print(f"Failed to list models. Status code: {reply.status_code}")
            return []

        models = []
        for entry in reply.json().get('models', []):
            models.append(entry.get('name'))

        print(f"Available models: {', '.join(models)}")
        return models
    except requests.exceptions.RequestException as err:
        print(f"Cannot connect to Ollama server: {err}")
        return []

In [19]:
# Keyword patterns for the free-text fallback. The short tokens are matched as whole
# words so that "no" does not fire on "not"/"know"/"cannot" and "yes" does not fire on
# "eyes"; the longer keywords keep plain substring matching.
_IMPLICATE_PATTERN = re.compile(r"implicate|betray|\b(?:yes|true)\b")
_SILENT_PATTERN = re.compile(r"silent|silence|cooperate|\b(?:no|false)\b")
# Flat (non-nested) JSON objects embedded in surrounding text, e.g. inside a code fence.
_JSON_OBJECT_PATTERN = re.compile(r"\{[^{}]*\}")


def _decision_from_json(response_text):
    """Return True/False when the text holds a JSON object with a decisive "response" value, else None."""
    candidates = [response_text.strip()]
    candidates.extend(_JSON_OBJECT_PATTERN.findall(response_text))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except ValueError:  # json.JSONDecodeError is a ValueError subclass
            continue
        if not isinstance(parsed, dict):
            continue

        value = parsed.get("response")
        if isinstance(value, bool):
            # {"response": true} decodes to a Python bool, not a string
            return value
        if isinstance(value, str):
            normalized = value.strip().lower()
            if normalized in ("true", "implicate"):
                return True
            if normalized in ("false", "silence"):
                return False
    return None


def _decision_from_text(response_text):
    """Keyword fallback: True for implicate-type wording, False for silent-type wording, else None."""
    lowered = response_text.strip().lower()
    # Implicate keywords take precedence when both kinds of wording appear
    if _IMPLICATE_PATTERN.search(lowered):
        return True
    if _SILENT_PATTERN.search(lowered):
        return False
    return None


def get_boolean_answer(model, messages, temperature=0.7, timeout=300):
    """Ask an Ollama chat model for a decision and reduce the reply to a boolean.

    The reply is first searched for a JSON object whose ``"response"`` value is
    ``true``/``"implicate"`` or ``false``/``"silence"``. If none is found, the
    whole reply is scanned for keywords instead.

    Args:
        model: Name of the Ollama model to query.
        messages: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts. The list is not modified; when it contains no user message,
            a "What would you do?" user turn is added to a copy.
        temperature: Sampling temperature sent in the request options.
        timeout: Value passed to ``requests.post`` as ``timeout`` (seconds), so
            an unresponsive server cannot stall the experiment indefinitely.

    Returns:
        tuple: ``(decision, text)``. ``decision`` is True (implicate), False
        (stay silent) or None when the reply is unclear or the call failed.
        ``text`` is the raw reply, or the error message when the call failed.
    """
    url = "http://localhost:11434/api/chat"

    # Add the user message if it's not already included
    if not any(msg.get("role") == "user" for msg in messages):
        messages = messages + [{"role": "user", "content": "What would you do?"}]

    payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "options": {
            "temperature": temperature,
            # Add a random seed to ensure independence between calls
            "seed": random.randint(1, 10000)
        }
    }

    try:
        response = requests.post(url, json=payload, timeout=timeout)
        response.raise_for_status()
        response_text = response.json()["message"]["content"]
    except requests.exceptions.RequestException as e:
        print(f"Error calling Ollama API: {e}")
        return None, str(e)
    except (KeyError, TypeError, ValueError) as e:
        # A reply without message/content must not abort the whole experiment loop
        print(f"Unexpected response format from Ollama API: {e!r}")
        return None, str(e)

    if not isinstance(response_text, str):
        print(f"Unexpected response format from Ollama API: {response_text!r}")
        return None, str(response_text)

    decision = _decision_from_json(response_text)
    if decision is None:
        # No decisive JSON answer: fall back to keyword analysis of the full reply
        decision = _decision_from_text(response_text)

    if decision is None:
        # Still unclear: log the response so it can be inspected
        print(f"Unclear boolean response: {response_text}")

    return decision, response_text

In [20]:
def run_experiment_for_prompt(model, prompt_file, iterations=1):
    """Query one model repeatedly with one prompt file, then tally, save and plot the answers.

    Args:
        model: Name of the model handed to ``get_boolean_answer``.
        prompt_file: Path of a JSON file; its ``"prompt"`` entry is sent to the
            model and its optional ``"persona"`` entry is recorded per answer.
        iterations: Number of times the prompt is sent.

    Returns:
        dict: ``{"raw_data": [...], "analysis": {...}}`` where ``raw_data`` has
        one record per iteration and ``analysis`` holds the counts, the
        percentages (computed over clear answers only) and the mean call time.

    Side effects:
        Writes ``results/results_<prompt_id>_<model>.json`` and calls
        ``visualize_prompt_result`` with the analysis.
    """
    with open(prompt_file, 'r') as prompt_handle:
        prompt_data = json.load(prompt_handle)

    prompt_id = os.path.basename(prompt_file).replace('.json', '')
    print(f"Running experiment for prompt {prompt_id} with model: {model}")

    prompt_results = []

    for _ in tqdm(range(iterations), desc=f"{model} - {prompt_id}"):
        # Every iteration gets its own identifier and wall-clock stamp
        iteration_id = str(uuid.uuid4())
        asked_at = datetime.now()

        # Time only the model call itself
        call_started = time.time()
        result, response_text = get_boolean_answer(model, prompt_data["prompt"])
        elapsed = time.time() - call_started

        prompt_results.append({
            "id": iteration_id,
            "model": model,
            "prompt": prompt_id,
            "persona": prompt_data.get("persona", "unknown"),
            "date": asked_at.strftime("%Y-%m-%d"),
            "time": asked_at.strftime("%H:%M:%S"),
            "timestamp": asked_at.isoformat(),
            "result": result,
            "response_text": response_text,
            "execution_time": elapsed
        })

        # Brief pause between calls to avoid rate limiting
        time.sleep(0.1)

    # Tally the three possible outcomes (True / False / None)
    outcomes = [entry["result"] for entry in prompt_results]
    yes_count = sum(outcome is True for outcome in outcomes)
    no_count = sum(outcome is False for outcome in outcomes)
    none_count = sum(outcome is None for outcome in outcomes)

    # Percentages are relative to the clear answers; unclear ones are left out
    total_valid = yes_count + no_count
    if total_valid > 0:
        yes_percentage = yes_count / total_valid * 100
        no_percentage = no_count / total_valid * 100
    else:
        yes_percentage = 0
        no_percentage = 0

    if prompt_results:
        avg_execution_time = sum(entry["execution_time"] for entry in prompt_results) / len(prompt_results)
    else:
        avg_execution_time = 0

    analysis = {
        "yes_count": yes_count,
        "no_count": no_count,
        "none_count": none_count,
        "yes_percentage": yes_percentage,
        "no_percentage": no_percentage,
        "avg_execution_time": avg_execution_time
    }

    # Persist the raw records together with the summary
    result_file = f"results/results_{prompt_id}_{model}.json"
    with open(result_file, 'w') as out_handle:
        json.dump({
            "raw_data": prompt_results,
            "analysis": analysis,
            "model": model,
            "prompt_id": prompt_id,
            "timestamp": datetime.now().isoformat()
        }, out_handle, indent=2)

    print(f"Results saved to {result_file}")

    # Bar chart for this model/prompt pair
    visualize_prompt_result(model, prompt_id, analysis)

    return {
        "raw_data": prompt_results,
        "analysis": analysis
    }

In [21]:

# Function to run experiments for all prompts
def run_all_experiments(models, prompt_files, iterations=10):
    """Run every prompt file against every model and combine the outcomes.

    Args:
        models: Model names; each one is run against all prompt files in turn.
        prompt_files: Paths of the prompt JSON files.
        iterations: Number of repetitions forwarded to ``run_experiment_for_prompt``.

    Returns:
        dict: Combined results with ``raw_data`` and ``analysis`` keyed by
        model and then by prompt id, plus ``timestamp``, ``models`` and
        ``iterations_per_prompt``.

    Side effects:
        Writes ``results/all_results.json`` and calls ``visualize_all_results``.
    """
    raw_by_model = {}
    analysis_by_model = {}
    all_results = {
        "raw_data": raw_by_model,
        "analysis": analysis_by_model,
        "timestamp": datetime.now().isoformat(),
        "models": models,
        "iterations_per_prompt": iterations
    }

    for model in models:
        print(f"\nRunning experiments for model: {model}")

        # Collect the full outcome per prompt first, then split it into raw data and analysis
        outcomes = {}
        for prompt_file in prompt_files:
            prompt_id = os.path.basename(prompt_file).replace('.json', '')
            outcomes[prompt_id] = run_experiment_for_prompt(model, prompt_file, iterations)

        raw_by_model[model] = {pid: outcome["raw_data"] for pid, outcome in outcomes.items()}
        analysis_by_model[model] = {pid: outcome["analysis"] for pid, outcome in outcomes.items()}

    # Persist the combined results in one file
    with open("results/all_results.json", 'w') as out_handle:
        json.dump(all_results, out_handle, indent=2)

    print("\nAll results saved to results/all_results.json")

    # Charts across all prompts and models
    visualize_all_results(all_results)

    return all_results


In [22]:
# Function to visualize results for a single prompt
def visualize_prompt_result(model, prompt_id, analysis):
    """Draw and save a two-bar chart of the implicate/silent split for one prompt.

    Args:
        model: Model name, used in the chart title and the file name.
        prompt_id: Prompt identifier, used in the chart title and the file name.
        analysis: Mapping that provides ``"yes_percentage"`` and ``"no_percentage"``.

    Returns:
        str: Path of the saved image, ``images/<model>_prompt_<prompt_id>.png``.
    """
    shares = [analysis["yes_percentage"], analysis["no_percentage"]]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.bar(['Implicate', 'Silent'], shares, color=['#ff9999', '#66b3ff'])

    ax.set_xlabel('Decision')
    ax.set_ylabel('Percentage')
    ax.set_title(f"Prisoner's Dilemma Results - {model} - Prompt {prompt_id}")

    # Print each percentage just above its bar
    for position, share in enumerate(shares):
        ax.text(position, share + 1, f"{share:.2f}%", ha='center')

    # Leave headroom above 100% so the labels are not clipped
    ax.set_ylim(0, 110)

    image_path = f"images/{model}_prompt_{prompt_id}.png"
    fig.savefig(image_path)
    plt.close(fig)

    print(f"Visualization saved to {image_path}")

    return image_path

In [23]:
# Function to visualize combined results
def _annotate_bars(ax, bars, offset):
    """Write each bar's height as a percentage label slightly above the bar."""
    for bar in bars:
        bar_height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2,
            bar_height + offset,    # offset above the bar
            f"{bar_height:.2f}%",    # percentage with 2 decimals
            ha='center', va='bottom'
        )


def visualize_all_results(results):
    """Save one grouped bar chart per model plus an execution-time comparison chart.

    Args:
        results: Combined results dict; only ``results["analysis"]`` is read.
            It maps model name -> prompt id -> analysis dict with the keys
            ``"yes_percentage"``, ``"no_percentage"`` and ``"avg_execution_time"``.

    Side effects:
        Writes ``images/<model>_all_results.png`` for every model and
        ``images/execution_time_comparison.png``. Returns nothing.
    """
    # One row per (model, prompt) pair for easier plotting
    rows = [
        {
            "model": model_name,
            "prompt": prompt_id,
            "implicate_percentage": prompt_analysis["yes_percentage"],
            "silent_percentage": prompt_analysis["no_percentage"],
            "avg_execution_time": prompt_analysis["avg_execution_time"]
        }
        for model_name, model_analysis in results["analysis"].items()
        for prompt_id, prompt_analysis in model_analysis.items()
    ]
    df = pd.DataFrame(rows)
    model_names = df["model"].unique()

    # Grouped bar chart (implicate next to silent) for each model
    bar_width = 0.35
    for model_name in model_names:
        model_df = df[df["model"] == model_name]
        left_positions = list(range(len(model_df)))
        right_positions = [pos + bar_width for pos in left_positions]

        fig, ax = plt.subplots(figsize=(12, 8))

        implicate_bars = ax.bar(left_positions, model_df["implicate_percentage"], bar_width, label="Implicate")
        silent_bars = ax.bar(right_positions, model_df["silent_percentage"], bar_width, label="Silent")

        ax.set_xlabel("Prompt")
        ax.set_ylabel("Percentage")
        ax.set_title(f"Prisoner's Dilemma Results - {model_name}")

        # Centre each tick between the two bars; rotate labels 45 degrees, right-aligned
        ax.set_xticks([pos + bar_width / 2 for pos in left_positions])
        ax.set_xticklabels(model_df["prompt"], rotation=45, ha='right')

        ax.legend()

        _annotate_bars(ax, implicate_bars, 0.5)
        _annotate_bars(ax, silent_bars, 0.7)

        # Automatically adjust spacing to accommodate labels
        fig.tight_layout()

        fig.savefig(f"images/{model_name}_all_results.png")
        plt.close(fig)

    # Line chart comparing average execution time across models
    fig, ax = plt.subplots(figsize=(12, 8))

    for model_name in model_names:
        model_df = df[df["model"] == model_name]
        ax.plot(model_df["prompt"], model_df["avg_execution_time"], marker='o', label=model_name)

    ax.set_xlabel("Prompt")
    ax.set_ylabel("Average Execution Time (seconds)")
    ax.set_title("Execution Time Comparison")
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    ax.legend()
    fig.tight_layout()

    fig.savefig("images/execution_time_comparison.png")
    plt.close(fig)

    print("Combined visualizations saved to images/ directory")

In [24]:
# Main function to run the entire experiment
def main():
    import random  # Import here to avoid issues
    
    if not check_ollama_server():
        print("Cannot proceed without Ollama server running")
        return
    
    available_models = list_available_models()
    
    # Define models to use
    target_models = ["llama3.2:latest", "mistral:latest"]
    models_to_use = [model for model in target_models if any(model in avail_model for avail_model in available_models)]
    
    if not models_to_use:
        print(f"None of the target models {target_models} are available. Please pull them with 'ollama pull <model>'")
        return
    
    # Find all prompt files
    prompt_files = glob.glob("../prompts/prompt_*.json")
    
    if not prompt_files:
        print("No prompt files found in the current directory.")
        return
    
    print(f"Found {len(prompt_files)} prompt files.")
    
    # Set number of iterations
    iterations = 100 # You can adjust this
    
    # Run experiments
    all_results = run_all_experiments(models_to_use, prompt_files, iterations)
    
    # Print summary
    print("\nSummary of Results:")
    for model_name, model_analysis in all_results["analysis"].items():
        print(f"\nModel: {model_name}")
        for prompt_id, prompt_analysis in model_analysis.items():
            print(f"  Prompt: {prompt_id}")
            print(f"    Implicate: {prompt_analysis['yes_percentage']:.1f}%")
            print(f"    Silent: {prompt_analysis['no_percentage']:.1f}%")
            print(f"    Avg Execution Time: {prompt_analysis['avg_execution_time']:.2f} seconds")

In [25]:
# Run the experiment end to end; the captured output below comes from this call.
main()

Ollama server is running. Version: 0.6.2
Available models: yesno:latest, mistral:latest, llama3.2:latest
Found 35 prompt files.

Running experiments for model: llama3.2:latest
Running experiment for prompt prompt_3 with model: llama3.2:latest


llama3.2:latest - prompt_3:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_3_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_3.png
Running experiment for prompt prompt_11 with model: llama3.2:latest


llama3.2:latest - prompt_11:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_11_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_11.png
Running experiment for prompt prompt_31 with model: llama3.2:latest


llama3.2:latest - prompt_31:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_31_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_31.png
Running experiment for prompt prompt_27 with model: llama3.2:latest


llama3.2:latest - prompt_27:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_27_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_27.png
Running experiment for prompt prompt_26 with model: llama3.2:latest


llama3.2:latest - prompt_26:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_26_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_26.png
Running experiment for prompt prompt_30 with model: llama3.2:latest


llama3.2:latest - prompt_30:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_30_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_30.png
Running experiment for prompt prompt_10 with model: llama3.2:latest


llama3.2:latest - prompt_10:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_10_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_10.png
Running experiment for prompt prompt_2 with model: llama3.2:latest


llama3.2:latest - prompt_2:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_2_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_2.png
Running experiment for prompt prompt_9 with model: llama3.2:latest


llama3.2:latest - prompt_9:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_9_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_9.png
Running experiment for prompt prompt_21 with model: llama3.2:latest


llama3.2:latest - prompt_21:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_21_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_21.png
Running experiment for prompt prompt_5 with model: llama3.2:latest


llama3.2:latest - prompt_5:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_5_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_5.png
Running experiment for prompt prompt_17 with model: llama3.2:latest


llama3.2:latest - prompt_17:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_17_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_17.png
Running experiment for prompt prompt_16 with model: llama3.2:latest


llama3.2:latest - prompt_16:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_16_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_16.png
Running experiment for prompt prompt_4 with model: llama3.2:latest


llama3.2:latest - prompt_4:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_4_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_4.png
Running experiment for prompt prompt_20 with model: llama3.2:latest


llama3.2:latest - prompt_20:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_20_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_20.png
Running experiment for prompt prompt_8 with model: llama3.2:latest


llama3.2:latest - prompt_8:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_8_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_8.png
Running experiment for prompt prompt_23 with model: llama3.2:latest


llama3.2:latest - prompt_23:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_23_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_23.png
Running experiment for prompt prompt_19 with model: llama3.2:latest


llama3.2:latest - prompt_19:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_19_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_19.png
Running experiment for prompt prompt_15 with model: llama3.2:latest


llama3.2:latest - prompt_15:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_15_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_15.png
Running experiment for prompt prompt_7 with model: llama3.2:latest


llama3.2:latest - prompt_7:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_7_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_7.png
Running experiment for prompt prompt_6 with model: llama3.2:latest


llama3.2:latest - prompt_6:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_6_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_6.png
Running experiment for prompt prompt_14 with model: llama3.2:latest


llama3.2:latest - prompt_14:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_14_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_14.png
Running experiment for prompt prompt_18 with model: llama3.2:latest


llama3.2:latest - prompt_18:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_18_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_18.png
Running experiment for prompt prompt_34 with model: llama3.2:latest


llama3.2:latest - prompt_34:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_34_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_34.png
Running experiment for prompt prompt_22 with model: llama3.2:latest


llama3.2:latest - prompt_22:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_22_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_22.png
Running experiment for prompt prompt_29 with model: llama3.2:latest


llama3.2:latest - prompt_29:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_29_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_29.png
Running experiment for prompt prompt_13 with model: llama3.2:latest


llama3.2:latest - prompt_13:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_13_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_13.png
Running experiment for prompt prompt_1 with model: llama3.2:latest


llama3.2:latest - prompt_1:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_1_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_1.png
Running experiment for prompt prompt_25 with model: llama3.2:latest


llama3.2:latest - prompt_25:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_25_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_25.png
Running experiment for prompt prompt_33 with model: llama3.2:latest


llama3.2:latest - prompt_33:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_33_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_33.png
Running experiment for prompt prompt_32 with model: llama3.2:latest


llama3.2:latest - prompt_32:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_32_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_32.png
Running experiment for prompt prompt_24 with model: llama3.2:latest


llama3.2:latest - prompt_24:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_24_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_24.png
Running experiment for prompt prompt_0 with model: llama3.2:latest


llama3.2:latest - prompt_0:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_0_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_0.png
Running experiment for prompt prompt_12 with model: llama3.2:latest


llama3.2:latest - prompt_12:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_12_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_12.png
Running experiment for prompt prompt_28 with model: llama3.2:latest


llama3.2:latest - prompt_28:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_28_llama3.2:latest.json
Visualization saved to images/llama3.2:latest_prompt_prompt_28.png

Running experiments for model: mistral:latest
Running experiment for prompt prompt_3 with model: mistral:latest


mistral:latest - prompt_3:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_3_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_3.png
Running experiment for prompt prompt_11 with model: mistral:latest


mistral:latest - prompt_11:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_11_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_11.png
Running experiment for prompt prompt_31 with model: mistral:latest


mistral:latest - prompt_31:   0%|          | 0/100 [00:00<?, ?it/s]

Unclear boolean response:  {
      "response": "It is impossible for me to make a decision as I don't have the ability to predict or control my partner's actions."
   }
Results saved to results/results_prompt_31_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_31.png
Running experiment for prompt prompt_27 with model: mistral:latest


mistral:latest - prompt_27:   0%|          | 0/100 [00:00<?, ?it/s]

Unclear boolean response:  {
      "response": "It is impossible to make a definitive decision as I don't have personal feelings, desires, or the ability to predict my partner's actions. However, if a rational approach is considered, cooperating might lead to a shorter sentence for both parties (if they also choose cooperation), but this assumes that your partner shares the same reasoning."
   }
Results saved to results/results_prompt_27_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_27.png
Running experiment for prompt prompt_26 with model: mistral:latest


mistral:latest - prompt_26:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_26_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_26.png
Running experiment for prompt prompt_30 with model: mistral:latest


mistral:latest - prompt_30:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_30_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_30.png
Running experiment for prompt prompt_10 with model: mistral:latest


mistral:latest - prompt_10:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_10_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_10.png
Running experiment for prompt prompt_2 with model: mistral:latest


mistral:latest - prompt_2:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_2_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_2.png
Running experiment for prompt prompt_9 with model: mistral:latest


mistral:latest - prompt_9:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_9_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_9.png
Running experiment for prompt prompt_21 with model: mistral:latest


mistral:latest - prompt_21:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_21_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_21.png
Running experiment for prompt prompt_5 with model: mistral:latest


mistral:latest - prompt_5:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_5_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_5.png
Running experiment for prompt prompt_17 with model: mistral:latest


mistral:latest - prompt_17:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_17_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_17.png
Running experiment for prompt prompt_16 with model: mistral:latest


mistral:latest - prompt_16:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_16_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_16.png
Running experiment for prompt prompt_4 with model: mistral:latest


mistral:latest - prompt_4:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_4_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_4.png
Running experiment for prompt prompt_20 with model: mistral:latest


mistral:latest - prompt_20:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_20_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_20.png
Running experiment for prompt prompt_8 with model: mistral:latest


mistral:latest - prompt_8:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_8_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_8.png
Running experiment for prompt prompt_23 with model: mistral:latest


mistral:latest - prompt_23:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_23_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_23.png
Running experiment for prompt prompt_19 with model: mistral:latest


mistral:latest - prompt_19:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_19_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_19.png
Running experiment for prompt prompt_15 with model: mistral:latest


mistral:latest - prompt_15:   0%|          | 0/100 [00:00<?, ?it/s]

Unclear boolean response:  {
      "response": "It is impossible for me to make a decision as I don't have the ability to interact or communicate with your partner. This scenario involves human emotions and unpredictability, which are beyond my capabilities."
   }
Results saved to results/results_prompt_15_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_15.png
Running experiment for prompt prompt_7 with model: mistral:latest


mistral:latest - prompt_7:   0%|          | 0/100 [00:00<?, ?it/s]

Unclear boolean response:  {
     "response": "It is impossible for me to provide an accurate answer as I am lacking the ability to predict your actions. However, in such a situation, it might be wise to consider the potential consequences of each action and make a decision based on that."
   }
Results saved to results/results_prompt_7_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_7.png
Running experiment for prompt prompt_6 with model: mistral:latest


mistral:latest - prompt_6:   0%|          | 0/100 [00:00<?, ?it/s]

Unclear boolean response:  {
      "response": "It is impossible for me to make a decision as I don't have personal feelings, emotions, or the ability to communicate with your partner. However, if this were a human making the decision, they might consider cooperating to reduce their own sentence. But remember, this is just a hypothetical scenario and actual decisions should be made with careful thought and legal advice."
   }
Results saved to results/results_prompt_6_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_6.png
Running experiment for prompt prompt_14 with model: mistral:latest


mistral:latest - prompt_14:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_14_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_14.png
Running experiment for prompt prompt_18 with model: mistral:latest


mistral:latest - prompt_18:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_18_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_18.png
Running experiment for prompt prompt_34 with model: mistral:latest


mistral:latest - prompt_34:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_34_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_34.png
Running experiment for prompt prompt_22 with model: mistral:latest


mistral:latest - prompt_22:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_22_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_22.png
Running experiment for prompt prompt_29 with model: mistral:latest


mistral:latest - prompt_29:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_29_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_29.png
Running experiment for prompt prompt_13 with model: mistral:latest


mistral:latest - prompt_13:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_13_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_13.png
Running experiment for prompt prompt_1 with model: mistral:latest


mistral:latest - prompt_1:   0%|          | 0/100 [00:00<?, ?it/s]

Unclear boolean response:  {
      "response": "It is impossible for me to make a decision as I don't have the ability to interact with or predict the actions of other entities."
   }
Results saved to results/results_prompt_1_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_1.png
Running experiment for prompt prompt_25 with model: mistral:latest


mistral:latest - prompt_25:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_25_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_25.png
Running experiment for prompt prompt_33 with model: mistral:latest


mistral:latest - prompt_33:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_33_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_33.png
Running experiment for prompt prompt_32 with model: mistral:latest


mistral:latest - prompt_32:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_32_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_32.png
Running experiment for prompt prompt_24 with model: mistral:latest


mistral:latest - prompt_24:   0%|          | 0/100 [00:00<?, ?it/s]

Unclear boolean response:  {
      "response": "It is impossible to make a definitive decision as I don't have the ability to predict your actions. However, a rational approach might be to consider the potential consequences of each action and communicate with your partner to coordinate a strategy."
   }
Results saved to results/results_prompt_24_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_24.png
Running experiment for prompt prompt_0 with model: mistral:latest


mistral:latest - prompt_0:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_0_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_0.png
Running experiment for prompt prompt_12 with model: mistral:latest


mistral:latest - prompt_12:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_12_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_12.png
Running experiment for prompt prompt_28 with model: mistral:latest


mistral:latest - prompt_28:   0%|          | 0/100 [00:00<?, ?it/s]

Results saved to results/results_prompt_28_mistral:latest.json
Visualization saved to images/mistral:latest_prompt_prompt_28.png

All results saved to results/all_results.json
Combined visualizations saved to images/ directory

Summary of Results:

Model: llama3.2:latest
  Prompt: prompt_3
    Implicate: 53.0%
    Silent: 47.0%
    Avg Execution Time: 0.21 seconds
  Prompt: prompt_11
    Implicate: 63.0%
    Silent: 37.0%
    Avg Execution Time: 0.21 seconds
  Prompt: prompt_31
    Implicate: 67.0%
    Silent: 33.0%
    Avg Execution Time: 0.19 seconds
  Prompt: prompt_27
    Implicate: 58.0%
    Silent: 42.0%
    Avg Execution Time: 0.17 seconds
  Prompt: prompt_26
    Implicate: 61.0%
    Silent: 39.0%
    Avg Execution Time: 0.18 seconds
  Prompt: prompt_30
    Implicate: 59.0%
    Silent: 41.0%
    Avg Execution Time: 0.20 seconds
  Prompt: prompt_10
    Implicate: 55.0%
    Silent: 45.0%
    Avg Execution Time: 0.19 seconds
  Prompt: prompt_2
    Implicate: 61.0%
    Silent: 39.0%