This notebook contains all code necessary to run the experiments and produce the graphs in the paper "Conditional and Modal Reasoning in Large Language Models" (https://arxiv.org/abs/2401.17169) by Wesley H. Holliday (wesholliday@berkeley.edu) and Matthew Mandelkern (mandelkern@nyu.edu).

The notebook was created using Python 3.11.6 and Seaborn 0.12.0.

To collect responses from one of the open-source LLMs, use LM Studio (https://lmstudio.ai), v0.2.11 or later, to set up a local inference server with base_url=http://localhost:1234/v1.

In [None]:
from openai import OpenAI
import json
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import FuncFormatter
import seaborn as sns
import os
import re
import numpy as np
from tqdm.notebook import tqdm

# Outline

### 1. Setup

### 2. Collect model response data

### 3. Create dataframe

### 4. Create figures for paper

### 5. Further analysis mentioned in text

# 1. Setup

### Models

In [None]:
# Registry of the models under study, keyed by an arbitrary integer id.
# Each value is a triple:
#   (short name used in data file paths, display name used in the dataframe and figures, full model name recorded with the data)
# Commented-out entries were also tested but are not part of the paper's analysis.
model_dict = {
    0: ("code_llama_7B", "Code Llama 7B", "code llama instruct 7B Q6_K gguf"),
    1: ("code_llama_13B", "Code Llama 13B", "code llama instruct 13B Q6_K gguf"),
    2: ("code_llama_34B", "Code Llama 34B", "code llama instruct 34B Q6_K gguf"),
    3: ("gpt_3_5_turbo_0613", "GPT-3.5 Turbo (0613)", "gpt-3.5-turbo-0613"),  # This is the model that 'gpt-3.5-turbo' points to in the Open AI API as of January 24, 2024
    # 4: ("gpt_3_5_turbo_1106", "GPT-3.5 Turbo (1106)", "gpt-3.5-turbo-1106"),  # We also tested this model, but it is not included in the paper
    # 5: ("gpt_4_0314", "GPT-4 (0314)", "gpt-4-0314"),  # We also tested this model, but it is not included in the paper
    6: ("gpt_4_0613", "GPT-4 (0613)", "gpt-4-0613"),  # This is the model that 'gpt-4' points to in the Open AI API as of January 24, 2024
    # 7: ("gpt_4_turbo_1106", "GPT-4 Turbo", "gpt-4-1106-preview"),  # We also tested this model, but it is not included in the paper except in a footnote
    8: ("llama_2_chat_7B", "Llama 2 Chat 7B", "llama 2 chat 7B Q6_K gguf"),
    9: ("llama_2_chat_13B", "Llama 2 Chat 13B", "llama 2 chat 13B Q6_K gguf"),
    10: ("llama_2_chat_70B", "Llama 2 Chat 70B", "llama 2 chat 70B Q6_K gguf"),
    11: ("mistral_instruct", "Mistral 7B", "mistral instruct v0 2 7B Q6_K gguf"),
    12: ("mixtral_instruct", "Mixtral 8x7B", "mixtral 8x instruct v0 1 7B Q6_K gguf"),
    13: ("phi_2", "Phi-2", "phi 2 3B Q6_K gguf"),
    14: ("yi_chat_34B", "Yi Chat 34B", "yi chat 34B Q6_K gguf"),
}

### Select a model to test

In [None]:
# Choose one entry of model_dict and unpack its (short, display, full) names
model_num = 0 # This is the model that will be used in Section 2
model_short_name, model_display_name, model_full_name = model_dict[model_num]
print(model_short_name)

### Select prompts

In [None]:
# Prompt classes to load; a JSON file in 'prompts' is used only if its base name appears in this list
selected_prompt_classes = ['MT','MTx','uMT','vMT','MTo','MTox', # versions of Modus Tollens
                           'MP','MPx','uMP','vMP','MPo','MPox', # versions of Modus Ponens

                           'AC','ACx','uAC','vAC','ACo','ACox', # versions of Affirming the Consequent
                           'LAC','cAC','LcAC','dAC','LdAC','sAC','snAC','gAC','gnAC', # more versions of Affirming the Consequent with altered phrasing
                           'CONV','CONVx','uCONV','vCONV', # versions of Conversion

                           'DA','DAx','uDA','vDA','DAo','DAox', # versions of Denying the Antecedent
                           'INV','INVx','uINV','vINV', # versions of Inversion

                           'DS','DSx','uDS','vDS','DSo','DSox','DSf','DSfx','DSfo','DSfox', # versions of Disjunctive Syllogism

                           'CT','CTx','CTn','CTnx','CTnd','uCTnd','vCTnd', # versions of Contraposition
                           'ASd','ASx','uASd','vASd', # versions of Antecedent Strengthening

                           'MiN','MiNx','uMiN','vMiN','NMu','NMux','uNMu','vNMu', # versions of Duality inferences

                           'MTmu','MTmux','uMTmu','vMTmu','MTmuo','MTmuox', # versions of Modus Tollens Must
                           'MTmi','MTmix','uMTmi','vMTmi','MTmio','MTmiox', # versions of Modus Tollens Might

                           'DSmu','DSmux','uDSmu','vDSmu','DSmuo','DSmuox','DSmuf','DSmufx','DSmufo','DSmufox', # versions of Disjunctive Syllogism Must
                           'DSmi','DSmix','uDSmi','vDSmi','DSmio','DSmiox','DSmif','DSmifx','DSmifo','DSmifox', # versions of Disjunctive Syllogism Might
]

# user_prompts accumulates the entries of every selected prompt file
# (each entry is later unpacked as prompt_class, prompt_num, user_prompt);
# prompt_classes records which selected classes were actually found on disk
user_prompts = list()
prompt_classes = list()

# os.listdir returns entries in arbitrary order, so sort them: this makes the order of
# user_prompts (and of everything derived from it) the same on every machine and run
for file in sorted(os.listdir('prompts')):
    if not file.endswith('.json'):
        continue

    prompt_class = file[:-5] # file name without the '.json' extension
    if prompt_class in selected_prompt_classes:
        prompt_classes.append(prompt_class)
        with open(os.path.join('prompts', file)) as f:
            user_prompts += json.load(f)
    else:
        print('Skipping', prompt_class)

print(sorted(prompt_classes))

In [None]:
# System message sent along with every user prompt
system_prompt = "Answer only with 'yes' or 'no' and nothing else."

# Sampling temperatures at which every prompt is posed
temperatures = [0, 0.5, 1]

# Maximum number of times a prompt is posed at each temperature
num_prompts_per_temp = {
    0: 1,
    0.5: 20,
    1: 20,
}

### Helper functions

A number of the LLMs often disobey the system prompt to say *nothing else* except 'yes' or 'no'. Thus, we extract 'yes' or 'no' from their response using regular expressions.

In [None]:
def contains_yes(response):
    """Report whether the standalone word 'yes' occurs anywhere in `response` (case-insensitive)."""
    # The \b anchors require word boundaries around 'yes', so it is not found inside a longer word;
    # the optional [.,;:-] admits a period, comma, semicolon, colon, or dash right after it.
    yes_pattern = re.compile(r'\byes\b[.,;:-]?', re.IGNORECASE)
    found = yes_pattern.search(response)
    return found is not None

def contains_no(response):
    """Report whether the standalone word 'no' occurs anywhere in `response` (case-insensitive)."""
    # The \b anchors require word boundaries around 'no', so it is not found inside a longer word;
    # the optional [.,;:-] admits a period, comma, semicolon, colon, or dash right after it.
    no_pattern = re.compile(r'\bno\b[.,;:-]?', re.IGNORECASE)
    found = no_pattern.search(response)
    return found is not None


# 2. Collect model response data

We are interested in not only whether an LLM operating at zero temperature gives an affirmative answer or negative answer to a logical inference question, but also the frequency with which an LLM operating at non-zero temperature gives an affirmative answer or negative answer to a logical inference question. Elsewhere we are doing an analysis with log probabilities, but here we take a frequentist approach.

In [None]:
get_responses = False # Set to True to get responses from the API. Set to False to just analyze the data that has already been collected.

if get_responses:
    print("Model:", model_short_name)
    print("")

    # GPT models are queried through the OpenAI API (at most 4 tokens per reply);
    # every other model is queried through the local inference server (at most 8 tokens per reply)
    uses_openai_api = model_short_name in ['gpt_4_turbo_1106', 'gpt_4_0613', 'gpt_4_0314', 'gpt_3_5_turbo_1106', 'gpt_3_5_turbo_0613']

    # The early-stopping checkpoint is reached after 1/impatience of the maximum number of queries
    if model_short_name in ['gpt_4_turbo_1106', 'gpt_4_0613', 'gpt_4_0314']:
        impatience = 4
    else:
        impatience = 2

    # The client is created on the first request and then reused, instead of being rebuilt for every request
    client = None

    for up in tqdm(user_prompts):
        prompt_class, prompt_num, user_prompt = up

        for temperature in temperatures:

            data_dir = f'data/{model_short_name}/{prompt_class}'
            data_path = f'{data_dir}/{model_short_name}_{prompt_class}_{prompt_num}_{temperature}.json'

            # if the data file already exists, continue
            if os.path.exists(data_path):
                continue

            print("Prompt:", user_prompt)
            print("Temperature:", temperature)
            print("")

            data = {
                "user_prompt": user_prompt,
                "system_prompt": system_prompt,
                "model": model_full_name,
                "temperature": temperature,
                "responses": []
            }

            for n in range(num_prompts_per_temp[temperature]):

                if client is None:
                    if uses_openai_api:
                        client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
                    else:
                        client = OpenAI(base_url="http://localhost:1234/v1", api_key="not-needed")

                completion = client.chat.completions.create(
                    model=model_full_name if uses_openai_api else "local-model",
                    messages=[
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_prompt}],
                    temperature=temperature,
                    max_tokens=4 if uses_openai_api else 8,
                )

                print(completion.choices[0].message)

                # get a JSON representation of the message
                message_json = completion.choices[0].message.model_dump_json()
                data["responses"].append(json.loads(message_json))

                # if all the responses are 'yes' or all responses are 'no' by the checkpoint iteration, then break
                if n == (num_prompts_per_temp[temperature] // impatience) - 1:
                    # Inspect every response collected so far, INCLUDING the one just appended
                    # (the previous range(n) silently left the latest response out of the check).
                    # A missing (None) content is treated as an empty answer rather than crashing the regex helpers.
                    contents = [(r["content"] or "") for r in data["responses"]]
                    all_yes = all(contains_yes(c) for c in contents)
                    all_no = all(contains_no(c) for c in contents)
                    any_yes = any(contains_yes(c) for c in contents)
                    any_no = any(contains_no(c) for c in contents)

                    if (all_yes and not any_no) or (all_no and not any_yes):
                        break

            print("")

            # create the appropriate directory if it doesn't already exist
            os.makedirs(data_dir, exist_ok=True)

            # Save the data to a JSON file
            with open(data_path, 'w') as json_file:
                json.dump(data, json_file, indent=4)

# 3. Create dataframe

### Additional helper functions

We noticed that models sometimes responded with 'we can' instead of 'yes' and 'we cannot' instead of 'no', so we allow these responses (but flag them below).

In [None]:
def contains_can(response):
    """Report whether `response` contains an affirmative 'can' (case-insensitive)."""
    # \bcan\b finds the standalone word 'can';
    # the negative lookahead (?!not\b|'t\b| be\b) rejects a 'can' that is followed by 'not', "'t", or ' be',
    # so "can't" and "can be" do not count;
    # the optional [.,;:-] admits a period, comma, semicolon, colon, or dash right after it.
    can_pattern = re.compile(r'\bcan\b(?!not\b|\'t\b| be\b)[.,;:-]?', re.IGNORECASE)
    found = can_pattern.search(response)
    return found is not None

def contains_cannot(response):
    """Report whether `response` contains 'cannot' or "can't" (case-insensitive)."""
    # (?:cannot|can't) is a non-capturing group matching either spelling, anchored by \b on both sides;
    # the optional [.,;:-] admits a period, comma, semicolon, colon, or dash right after it.
    cannot_pattern = re.compile(r'\b(?:cannot|can\'t)\b[.,;:-]?', re.IGNORECASE)
    found = cannot_pattern.search(response)
    return found is not None

# Quick demonstration: run both detectors over the same four sample responses.
# Only the first sample contains an affirmative 'can'; only the second and third contain "can't"/'cannot'.
_sample_responses = (
    "We can infer that",
    "We can't infer that",
    "We cannot infer that",
    "The argument's structure can be represented...",
)

for _sample in _sample_responses:
    print(contains_can(_sample))
print("")
for _sample in _sample_responses:
    print(contains_cannot(_sample))


### Process data to create dataframe

In [None]:
verbose = False # Set to True to print out warnings and errors

df_columns = ["prompt_class", "prompt_num", "prompt", "model", "temperature", "yes_count", "no_count", "num_trials", "yes_percent", "no_percent"]

model_display_names = [model_dict[mod][1] for mod in model_dict.keys()]

# Per-model counters, keyed by display name
queries_per_model = {model_display_name: 0 for model_display_name in model_display_names}

error_both_response = {model_display_name: 0 for model_display_name in model_display_names}
error_no_response = {model_display_name: 0 for model_display_name in model_display_names}

can_or_cannot_response = {model_display_name: 0 for model_display_name in model_display_names}
can_and_cannot_response = {model_display_name: 0 for model_display_name in model_display_names}


def _print_flagged_response(headline, model_name, temperature, prompt, content):
    """Print one warning/error report about a response that was not a plain 'yes' or 'no'."""
    print(headline)
    print("Model:", model_name)
    print("Temperature:", temperature)
    print("Prompt:", prompt)
    print("Response:", content)
    print("")


# Rows are collected in a list and turned into a dataframe once at the end;
# growing a dataframe with pd.concat inside the loop copies all previous rows on every iteration
records = []

for up in tqdm(user_prompts):
    prompt_class, prompt_num, user_prompt = up
    for mod in model_dict.keys():
        # NOTE(review): these two assignments rebind the notebook-level names set in "Select a model to test";
        # after this cell has run, re-run that cell before collecting responses in Section 2 again
        model_short_name = model_dict[mod][0]
        model_display_name = model_dict[mod][1]
        for temperature in temperatures:
            with open(f'data/{model_short_name}/{prompt_class}/{model_short_name}_{prompt_class}_{prompt_num}_{temperature}.json', 'r') as json_file:
                data = json.load(json_file)

            # count the number of times the models response contained "yes" or "no" (case insensitive)
            yes_count = 0
            no_count = 0

            for response in data["responses"]:

                queries_per_model[model_display_name] += 1

                # A missing (None) content is treated as an empty answer rather than crashing the regex helpers
                content = response["content"] or ""

                # Evaluate each detector once per response
                has_yes = contains_yes(content)
                has_no = contains_no(content)

                if has_yes and not has_no:
                    yes_count += 1

                elif has_no and not has_yes:
                    no_count += 1

                elif has_yes and has_no:
                    error_both_response[model_display_name] += 1

                    if verbose:
                        _print_flagged_response("ERROR: response contains both 'yes' and 'no'.",
                                                model_display_name, temperature, user_prompt, response["content"])

                else:
                    # Neither 'yes' nor 'no': fall back to 'can' / 'cannot' (these cases are counted separately)
                    has_can = contains_can(content)
                    has_cannot = contains_cannot(content)

                    if has_can and not has_cannot:
                        yes_count += 1
                        can_or_cannot_response[model_display_name] += 1

                        if verbose:
                            _print_flagged_response("WARNING: response contains 'can' instead of 'yes'.",
                                                    model_display_name, temperature, user_prompt, response["content"])

                    elif has_cannot and not has_can:
                        no_count += 1
                        can_or_cannot_response[model_display_name] += 1

                        if verbose:
                            _print_flagged_response("WARNING: response contains 'cannot' instead of 'no'.",
                                                    model_display_name, temperature, user_prompt, response["content"])

                    elif has_can and has_cannot:
                        error_both_response[model_display_name] += 1
                        can_and_cannot_response[model_display_name] += 1

                        if verbose:
                            _print_flagged_response("ERROR: response contains both 'can' and 'cannot'.",
                                                    model_display_name, temperature, user_prompt, response["content"])

                    else:
                        error_no_response[model_display_name] += 1

                        if verbose:
                            _print_flagged_response("ERROR: could not extract an answer.",
                                                    model_display_name, temperature, user_prompt, response["content"])

            num_trials = len(data["responses"])

            # one row per (prompt, model, temperature)
            records.append({"prompt_class": prompt_class,
                            "prompt_num": prompt_num,
                            "prompt": user_prompt,
                            "model": model_display_name,
                            "temperature": temperature,
                            "yes_count": yes_count,
                            "no_count": no_count,
                            "num_trials": num_trials,
                            "yes_percent": yes_count/num_trials,
                            "no_percent": - no_count/num_trials # stored as a negative number; downstream code takes abs()
                            })

df = pd.DataFrame(records, columns=df_columns)

if not os.path.exists('results'):
    os.makedirs('results')

# save the dataframe to a csv file
df.to_csv("results/results.csv")

# save the dataframe as a json file
df.to_json("results/results.json", orient="records")

print("Total number of queries:", queries_per_model)
print("Both responses errors:", error_both_response)
print("Neither response errors:", error_no_response)
print("Can or cannot response:", can_or_cannot_response)
print("Can and cannot response:", can_and_cannot_response)

print("")

df

# 4. Create figures for paper

In [None]:
# Dictionary with the correct answers (counting the expert human judgments from the cited literature as correct in the modal cases)
# Every selected prompt class needs an entry here: the figure code looks up correct_answers[prompt_class] directly.
correct_answers = {
    'AC': 'no',
    'ACo': 'no',
    'ACox': 'no',
    'ACx': 'no',
    'ASd': 'no',
    'ASx': 'no',
    'cAC': 'no',
    'CONV': 'no',
    'CONVx': 'no',
    'CT': 'no', # According to the modal analysis of conditionals, CT is not a valid form of inference, but the instances we tested are reasonable inferences
    'CTx': 'no',
    'CTn': 'no', # According to the modal analysis of conditionals, CTn is not a valid form of inference, but the instances we tested are reasonable inferences
    'CTnd': 'no',
    'CTnx': 'no',
    'DA': 'no',
    'dAC': 'no',
    'DAx': 'no',
    'DAo': 'no',
    'DAox': 'no',
    'DS': 'yes',
    'DSf': 'yes',
    'DSfo': 'yes',
    'DSfox': 'yes',
    'DSfx': 'yes',
    'DSmi': 'no',
    'DSmif': 'no',
    'DSmifo': 'no',
    'DSmifox': 'no',
    'DSmifx': 'no',
    'DSmio': 'no',
    'DSmiox': 'no',
    'DSmix': 'no',
    'DSmu': 'no',
    'DSmuf': 'no',
    'DSmufo': 'no',
    'DSmufx': 'no',
    'DSmufox': 'no',
    'DSmuo': 'no',
    'DSmuox': 'no',
    'DSmux': 'no',
    'DSo': 'yes',
    'DSox': 'yes',
    'DSx': 'yes',
    'gAC': 'no',
    'gnAC': 'no',
    'INV': 'no',
    'INVx': 'no',
    'LAC': 'no',
    'LcAC': 'no',
    'LdAC': 'no',
    'MiN': 'yes',
    'MiNx': 'yes',
    'MP': 'yes',
    'MPo': 'yes',
    'MPox': 'yes',
    'MPx': 'yes',
    'MT': 'yes',
    'MTmi': 'no',
    'MTmio': 'no',
    'MTmiox': 'no',
    'MTmix': 'no',
    'MTmu': 'no',
    'MTmuo': 'no',
    'MTmuox': 'no',
    'MTmux': 'no',
    'MTo': 'yes',
    'MTox': 'yes',
    'MTx': 'yes',
    'mvMiN': 'yes',
    'mvNmu': 'yes',
    'NMu': 'yes',
    'NMux': 'yes',
    'sAC': 'no',
    'snAC': 'no',
    'uAC': 'no',
    'uASd': 'no',
    'uCONV': 'no',
    'uCTnd': 'no', # NOTE(review): this entry was missing; 'no' is assumed from CTnd and vCTnd - confirm
    'uDA': 'no',
    'uDS': 'yes',
    'uDSmi': 'no',
    'uDSmu': 'no',
    'uINV': 'no',
    'uMiN': 'yes',
    'uMP': 'yes',
    'uMT': 'yes',
    'uMTmi': 'no',
    'uMTmu': 'no',
    'uNMu': 'yes',
    'vAC': 'no',
    'vASd': 'no',
    'vCONV': 'no',
    'vCTnd': 'no',
    'vDA': 'no',
    'vDS': 'yes',
    'vDSmi': 'no',
    'vDSmu': 'no',
    'vINV': 'no',
    'vMiN': 'yes',
    'vMP': 'yes',
    'vMT': 'yes',
    'vMTmi': 'no',
    'vMTmu': 'no',
    'vNMu': 'yes',
}

## Fig. 1: Performance summary

In [None]:
# Prompt classes that enter the performance summary:
# five inference families, each in its plain, 'x', 'o' and 'ox' version
prompt_classes_for_summary = [
    family + version
    for family in ('MT', 'MP', 'DS', 'AC', 'DA')
    for version in ('', 'x', 'o', 'ox')
]

# Function to calculate the correct response percentage
def correct_response_percentage(row):
    """Return, on a 0-100 scale, how often the row's responses matched the correct answer for its prompt class."""
    # 'yes_percent' or 'no_percent', depending on which answer is correct for this prompt class
    answer_column = correct_answers[row['prompt_class']] + '_percent'
    # no_percent is stored as a negative number, hence the absolute value
    return 100 * abs(row[answer_column])

# Score every row of the dataframe
df['correct_response_percentage'] = df.apply(correct_response_percentage, axis=1)

# Keep only the prompt classes chosen for the summary
filtered_df = df[df['prompt_class'].isin(prompt_classes_for_summary)]

# Average score per model, best model first
mean_correct_response = (
    filtered_df.groupby('model')
    .correct_response_percentage.mean()
    .reset_index()
    .sort_values('correct_response_percentage', ascending=False)
)

# Bounds used to place each model on the colormap
min_performance = mean_correct_response['correct_response_percentage'].min()
max_performance = mean_correct_response['correct_response_percentage'].max()
adjusted_lower_bound = 0.1
more_adjusted_upper_bound = 0.325


def _palette_position(score):
    """Rescale a score linearly from [min_performance, max_performance] and clamp the result to [0, 1]."""
    position = (score - min_performance) / (max_performance - min_performance) * more_adjusted_upper_bound + adjusted_lower_bound
    return min(max(position, 0), 1)


mean_correct_response['more_fine_tuned_normalized_performance'] = mean_correct_response['correct_response_percentage'].apply(_palette_position)

# Look up each model's color in the gist_rainbow colormap
mean_correct_response['more_fine_tuned_color'] = mean_correct_response['more_fine_tuned_normalized_performance'].apply(
    lambda position: plt.cm.gist_rainbow(position))

# Horizontal bar chart, one bar per model
plt.figure(figsize=(9,6))
sns.barplot(data=mean_correct_response,
            x='correct_response_percentage',
            y='model',
            palette=mean_correct_response['more_fine_tuned_color'])
plt.title('Summary of performance on some simple inferences', fontsize=16)
plt.xlabel('Average correct answer frequency', fontsize=16)
plt.ylabel('', fontsize=14)
plt.xticks([0, 25, 50, 75, 100], ['0%', '25%', '50%', '75%', '100%'], fontsize=16)
plt.yticks(fontsize=16)
plt.xlim(0, 100)
plt.tight_layout()

# Make sure the output directory is there before saving
if not os.path.exists('graphs'):
    os.makedirs('graphs')

# Save the plot as a PDF file
pdf_file_path = 'graphs/summary_graph_multicolor.pdf'
plt.savefig(pdf_file_path, format='pdf')

#plt.close()
plt.show()


## Top of Fig. 2: Queries and Errors per Model

In [None]:
# Collect the per-model counters into one table: one row per model, one column per counter
heatmap_data = pd.DataFrame({
    'Queries': queries_per_model,
    'Both Responses Errors': error_both_response,
    'Neither Response Errors': error_no_response
})

plt.figure(figsize=(8, 4))

# Mask out the first column (the query counts) so it is not colored; its numbers are written by hand below
mask = np.zeros(heatmap_data.shape, dtype=bool)
mask[:, 0] = True

# The font size applies only inside this context
with plt.rc_context({'font.size': 12}):
    sns.heatmap(heatmap_data,
                mask=mask,
                cmap='Reds',
                annot=True,
                fmt="d",
                cbar=False,
                cbar_kws={'label': 'Error Count'})
    plt.title("Queries and Errors per Model",fontsize=16)
    plt.ylabel("Model")
    plt.xticks(rotation=0, ha='center')
    plt.yticks(rotation=0)

    # Write each query count into the centre of its (masked) cell in the first column
    for i, value in enumerate(heatmap_data['Queries']):
        plt.text(0.5, i + 0.5, value, color='black', horizontalalignment='center', verticalalignment='center')

    # No tick marks on either axis
    plt.tick_params(axis='both', which='both', bottom=False, top=False, left=False, right=False)

    # Save as PDF, cropped tightly around the figure
    plt.savefig('results/queries_and_errors.pdf', bbox_inches='tight')

#plt.close()
plt.show()

## Bottom of Fig. 2: Correlations

In [None]:
# Ten base prompt classes followed by their 'x' counterparts, in the same order
_correlation_base_classes = ['AC', 'DA', 'DS', 'DSmi', 'DSmu', 'MP', 'MT', 'MTmi', 'MTmu', 'MiN']
prompt_classes_for_correlations = _correlation_base_classes + [base + 'x' for base in _correlation_base_classes]

In [None]:
# Function to calculate average 'yes_percent' for 'x' and non-'x' versions across models and temperatures
def calculate_averages_by_temp(df):
    """Average 'yes_percent' for each base prompt class and its 'x' counterpart, per model and temperature.

    Parameters:
        df: dataframe with the columns 'prompt_class', 'model', 'temperature' and 'yes_percent'.

    Returns:
        dict mapping (base_prompt_class, model, temperature) to
        {'avg_non_x': mean over rows of the base class, 'avg_x': mean over rows of the base class + 'x'}.
        A mean over no rows is NaN.
    """
    averages_dict = {}

    # Base classes are obtained by stripping trailing 'x' characters.
    # Sorted, because the iteration order of a set of strings can differ from run to run,
    # which would make the order of the results (and of everything printed from them) unstable.
    base_classes = sorted(set(pc.rstrip('x') for pc in df['prompt_class']))

    for prompt_class in base_classes:
        prompt_class_x = prompt_class + 'x'

        # Select the rows of the two classes once, instead of once per (model, temperature) pair
        non_x_rows = df[df['prompt_class'] == prompt_class]
        x_rows = df[df['prompt_class'] == prompt_class_x]

        for model in df['model'].unique():
            for temp in df['temperature'].unique():
                avg_non_x = non_x_rows[(non_x_rows['model'] == model) & (non_x_rows['temperature'] == temp)]['yes_percent'].mean()
                avg_x = x_rows[(x_rows['model'] == model) & (x_rows['temperature'] == temp)]['yes_percent'].mean()
                averages_dict[(prompt_class, model, temp)] = {'avg_non_x': avg_non_x, 'avg_x': avg_x}
    return averages_dict

# Function to calculate correlation across models by temperature for each prompt class
def calculate_correlation_across_models_by_temp(averages_dict):
    """Correlate the non-'x' averages with the 'x' averages for each (prompt_class, temperature) pair.

    `averages_dict` maps (prompt_class, model, temperature) to a dict with 'avg_non_x' and 'avg_x'.
    The result maps (prompt_class, temperature) to the correlation between the two lists of averages
    gathered for that pair.
    """
    # Gather, per (prompt_class, temperature), the paired averages of all entries
    paired_averages = {}
    for (prompt_class, _model, temp), averages in averages_dict.items():
        pair = paired_averages.setdefault((prompt_class, temp), {'avg_non_x': [], 'avg_x': []})
        pair['avg_non_x'].append(averages['avg_non_x'])
        pair['avg_x'].append(averages['avg_x'])

    # Replace each pair of lists by its correlation
    return {
        key: pd.Series(pair['avg_non_x']).corr(pd.Series(pair['avg_x']))
        for key, pair in paired_averages.items()
    }

# Function to calculate general correlation across all models and temperatures for each prompt class
def calculate_general_correlation_across_all_models_and_temps(averages_dict):
    """Correlate the non-'x' averages with the 'x' averages for each prompt class.

    `averages_dict` maps keys whose first element is the prompt class to a dict with 'avg_non_x' and 'avg_x'.
    The result maps each prompt class to the correlation between the two lists of averages
    gathered over all of that class's entries.
    """
    # Gather, per prompt class, the paired averages of all entries
    paired_averages = {}
    for key, averages in averages_dict.items():
        pair = paired_averages.setdefault(key[0], {'avg_non_x': [], 'avg_x': []})
        pair['avg_non_x'].append(averages['avg_non_x'])
        pair['avg_x'].append(averages['avg_x'])

    # Replace each pair of lists by its correlation
    return {
        prompt_class: pd.Series(pair['avg_non_x']).corr(pd.Series(pair['avg_x']))
        for prompt_class, pair in paired_averages.items()
    }

# Function to calculate general correlation across all prompt classes, models, and temperatures
def calculate_general_correlation_across_prompt_classes_models_and_temps(averages_dict):
    """Return the single correlation between all 'avg_non_x' values and all 'avg_x' values in `averages_dict`."""
    all_non_x_values = [averages['avg_non_x'] for averages in averages_dict.values()]
    all_x_values = [averages['avg_x'] for averages in averages_dict.values()]
    return pd.Series(all_non_x_values).corr(pd.Series(all_x_values))


# Perform the analysis

# Restrict the data to the prompt classes chosen for the correlation study
filtered_df = df[df['prompt_class'].isin(prompt_classes_for_correlations)]

# Averages first, then the three correlation summaries derived from them
averages_dict_by_temp = calculate_averages_by_temp(filtered_df)
correlation_across_models_by_temp = calculate_correlation_across_models_by_temp(averages_dict_by_temp)
general_correlation_across_all_models_and_temps = calculate_general_correlation_across_all_models_and_temps(averages_dict_by_temp)
general_correlation_across_prompt_classes_models_and_temps = calculate_general_correlation_across_prompt_classes_models_and_temps(averages_dict_by_temp)

# Print the results: the two per-key tables, then the single overall number
for _heading, _correlations in [
    ("Correlation Across Models by Temperature:", correlation_across_models_by_temp),
    ("\nGeneral Correlation Across All Models and Temperatures:", general_correlation_across_all_models_and_temps),
]:
    print(_heading)
    for key, value in _correlations.items():
        print(f"{key}: {value}")

print("\nGeneral Correlation Across All Prompt Classes, Models, and Temperatures:")
print(general_correlation_across_prompt_classes_models_and_temps)


In [None]:
# The keys of correlation_across_models_by_temp are (prompt_class, temperature) pairs;
# collect the distinct temperatures and prompt classes, each in sorted order
unique_temperatures = sorted({temp for _, temp in correlation_across_models_by_temp})
unique_prompt_classes = sorted({prompt_class for prompt_class, _ in correlation_across_models_by_temp})

# Nested dict {temperature: {prompt_class: correlation}}; a missing pair becomes None
data_for_heatmap = {}
for temp in unique_temperatures:
    data_for_heatmap[temp] = {
        prompt_class: correlation_across_models_by_temp.get((prompt_class, temp), None)
        for prompt_class in unique_prompt_classes
    }

# One column per temperature, one row per prompt class; rows without any correlation are dropped
correlation_df = pd.DataFrame(data_for_heatmap).dropna(how='all')

plt.figure(figsize=(8, 4))

# The font size applies only inside this context
with plt.rc_context({'font.size': 12}):
    sns.heatmap(correlation_df, cmap='RdYlGn', center=0, annot=True, fmt=".2f", cbar=False)
    plt.title("Correlations between frequencies of affirmative response \nto prompts with sensical vs. nonsensical predicates")
    plt.xlabel("Temperature")
    plt.ylabel("Prompt Class")

    # Save as PDF, cropped tightly around the figure
    plt.savefig('results/correlations.pdf', bbox_inches='tight')

#plt.close()
plt.show()

## Figures with green bars: average frequency of correct answer

In [None]:
# --- Plot configuration that does not depend on the temperature or prompt class ---

# Top-to-bottom order of the bars; entries must match the names used in df['model'].
model_order = ["GPT-4 (0613)", "GPT-3.5 Turbo (0613)", "Llama 2 Chat 70B", "Mixtral 8x7B",
               "Yi Chat 34B", "Code Llama 34B", "Code Llama 13B",
               "Llama 2 Chat 13B", "Mistral 7B", "Code Llama 7B",
               "Llama 2 Chat 7B", "Phi-2"]

# Labels shown on the y-axis: identical to the model name except for the two GPT models.
model_shortened_names = {name: name for name in model_order}
model_shortened_names["GPT-4 (0613)"] = "GPT-4"
model_shortened_names["GPT-3.5 Turbo (0613)"] = "GPT-3.5 Turbo"

# Colors from the seaborn "bright" palette
bright_palette = sns.color_palette("bright")
bright_color = bright_palette[2]  # Color used for the correct-answer bars

# Make sure the output directory exists before the first figure is saved.
os.makedirs('graphs', exist_ok=True)

for temp in temperatures:

    # Filter the dataframe for selected temperature
    df_filtered = df[df['temperature'] == temp]

    unique_prompt_classes = df_filtered['prompt_class'].unique()

    # Dictionary to store the file path of each prompt class's graph at this temperature
    percent_graph_paths = {}

    for prompt_class in tqdm(unique_prompt_classes):
        # Filtering for the current prompt class (copy, so the shared dataframe is untouched)
        prompt_class_df = df_filtered[df_filtered['prompt_class'] == prompt_class].copy()

        # Ordered categorical fixes the bar order; rename_categories swaps in the short
        # labels while keeping that order. (Series.replace on a categorical column is
        # deprecated since pandas 2.2.) Models not listed in model_order become NaN.
        prompt_class_df['model'] = pd.Categorical(
            prompt_class_df['model'], categories=model_order, ordered=True
        ).rename_categories(model_shortened_names)

        # Select the percentage column that corresponds to the correct answer
        correct_answer = correct_answers[prompt_class]
        correct_percent_column = 'yes_percent' if correct_answer == 'yes' else 'no_percent'
        # Absolute value, scaled to 0-100.
        # NOTE(review): abs() suggests no_percent is stored with a negative sign - confirm.
        prompt_class_df[correct_percent_column] = prompt_class_df[correct_percent_column].abs() * 100

        # Creating a multi-line title: greedily pack words into lines of at most 55 characters
        prompt = prompt_class + " example: " + prompt_class_df['prompt'].iloc[0]
        words = prompt.split()
        title_lines = []
        line = ""
        for word in words:
            if len(line) + len(word) <= 55:
                line += word + " "
            else:
                title_lines.append(line.strip())
                line = word + " "
        title_lines.append(line.strip())  # Append the last line
        title = "\n".join(title_lines)

        # Creating the plot
        fig, ax = plt.subplots(figsize=(9, 6))
        ax.set_title(title, fontsize=16)

        # Plotting bars
        sns.barplot(x=correct_percent_column, y="model", data=prompt_class_df,
                    color=bright_color, edgecolor='black', ax=ax)

        ax.set_xlabel(f"Correct answer ('{correct_answer}') frequency", fontsize=16)
        ax.set_ylabel('')
        ax.set_xlim(0, 105)  # Extending x-axis slightly past 100% for visual clarity

        # Adjusting the x-axis ticks and labels
        ax.set_xticks([0, 25, 50, 75, 100])
        ax.set_xticklabels(['0%', '25%', '50%', '75%', '100%'])
        ax.tick_params(axis='both', labelsize=16)

        # Empty legend call removes any legend from the axes
        ax.legend([], [], frameon=False)
        fig.tight_layout()

        # Save the plot
        output_file = f'graphs/correct_freq_for_{prompt_class}_temp_{temp}.pdf'
        fig.savefig(output_file, format='pdf')
        plt.close(fig)  # Release the figure; many are created in this loop

        # Updating the dictionary with the new file path
        percent_graph_paths[prompt_class] = output_file

    # Output the paths of the graphs saved for this temperature
    print(percent_graph_paths)

## Figures with orange bars: average frequency of 'no' and average frequency of 'yes'

In [None]:
# --- Plot configuration that does not depend on the temperature or prompt class ---

# Top-to-bottom order of the bars; entries must match the names used in df['model'].
model_order = ["GPT-4 (0613)", "GPT-3.5 Turbo (0613)", "Llama 2 Chat 70B", "Mixtral 8x7B",
               "Yi Chat 34B", "Code Llama 34B", "Code Llama 13B",
               "Llama 2 Chat 13B", "Mistral 7B", "Code Llama 7B",
               "Llama 2 Chat 7B", "Phi-2"]

# Labels shown on the y-axis: identical to the model name except for the two GPT models.
model_shortened_names = {name: name for name in model_order}
model_shortened_names["GPT-4 (0613)"] = "GPT-4"
model_shortened_names["GPT-3.5 Turbo (0613)"] = "GPT-3.5 Turbo"

# Colors from the seaborn "bright" palette
bright_palette = sns.color_palette("bright")
bright_orange = bright_palette[1]

# Make sure the output directory exists before the first figure is saved.
os.makedirs('graphs', exist_ok=True)

for temp in temperatures:

    # Filter the dataframe for the selected temperature
    df_filtered = df[df['temperature'] == temp]

    unique_prompt_classes = df_filtered['prompt_class'].unique()

    # Dictionary to store the file path of each prompt class's graph at this temperature
    percent_graph_paths = {}

    for prompt_class in tqdm(unique_prompt_classes):
        # Filtering for the current prompt class (copy, so the shared dataframe is untouched)
        prompt_class_df = df_filtered[df_filtered['prompt_class'] == prompt_class].copy()

        # Ordered categorical fixes the bar order; rename_categories swaps in the short
        # labels while keeping that order. (Series.replace on a categorical column is
        # deprecated since pandas 2.2.) Models not listed in model_order become NaN.
        prompt_class_df['model'] = pd.Categorical(
            prompt_class_df['model'], categories=model_order, ordered=True
        ).rename_categories(model_shortened_names)

        # Converting frequencies to percentages
        # NOTE(review): the 'no' bars are drawn on the negative half of the x-axis, so
        # no_percent is presumably stored with a negative sign - confirm where df is built.
        prompt_class_df['yes_percent'] *= 100
        prompt_class_df['no_percent'] *= 100

        # Creating a multi-line title: greedily pack words into lines of at most 60 characters
        prompt = prompt_class + " example: " + prompt_class_df['prompt'].iloc[0]
        words = prompt.split()
        title_lines = []
        line = ""
        for word in words:
            if len(line) + len(word) <= 60:
                line += word + " "
            else:
                title_lines.append(line.strip())
                line = word + " "
        title_lines.append(line.strip())  # Append the last line
        title = "\n".join(title_lines)

        # Creating the plot
        fig, ax = plt.subplots(figsize=(8.5, 6))
        ax.set_title(title, fontsize=16)

        # Plotting bars: both series share the same axes and color
        sns.barplot(x="yes_percent", y="model", data=prompt_class_df,
                    color=bright_orange, edgecolor='black', ax=ax)
        sns.barplot(x="no_percent", y="model", data=prompt_class_df,
                    color=bright_orange, edgecolor='black', ax=ax)

        ax.set_xlabel('')
        ax.set_ylabel('')
        ax.set_xlim(-105, 105)
        ax.axvline(x=0, color='black', linewidth=1)  # Divider between the 'no' and 'yes' halves

        # Adding the two half-axis captions; x is in data coordinates, y in axes
        # coordinates (negative y places the text below the axis).
        ax.text(50, -0.07, "'Yes' Frequency", ha="center", va="top",
                transform=ax.get_xaxis_transform(), fontsize=16)
        ax.text(-50, -0.07, "'No' Frequency", ha="center", va="top",
                transform=ax.get_xaxis_transform(), fontsize=16)

        # Tick labels show magnitudes only, so both directions read as positive percentages
        ax.set_xticks([-100, -50, 0, 50, 100])
        ax.set_xticklabels(['100%', '50%', '0', '50%', '100%'])

        # Empty legend call removes any legend from the axes
        ax.legend([], [], frameon=False)
        fig.tight_layout()

        # Adjust subplots if necessary (modify these values as needed)
        fig.subplots_adjust(top=0.9, bottom=0.1, left=0.1, right=0.9, hspace=0.2, wspace=0.2)

        ax.tick_params(axis='both', labelsize=16)

        # Save the plot
        output_file = f'graphs/percent_graph_for_{prompt_class}_temp_{temp}.pdf'
        fig.savefig(output_file, format='pdf', bbox_inches='tight')
        plt.close(fig)  # Release the figure; many are created in this loop

        # Updating the dictionary with the new file path
        percent_graph_paths[prompt_class] = output_file

    # Output the paths of the graphs saved for this temperature. A bare expression
    # inside a loop displays nothing, so the dictionary is printed explicitly.
    print(percent_graph_paths)

# 5. Further analysis mentioned in text

In [None]:
def yes_correlation(prompt_class1, prompt_class2, data=None):
    """Correlate the mean 'yes' frequencies of two prompt classes.

    For each prompt class, the mean of ``yes_percent`` is computed per
    (model, temperature) combination. The two resulting series are aligned on
    (model, temperature) and their Pearson correlation is returned.
    Combinations for which either prompt class has no (non-NaN) mean do not
    contribute to the correlation.

    Parameters
    ----------
    prompt_class1, prompt_class2 :
        Values of the ``prompt_class`` column to compare.
    data : pandas.DataFrame, optional
        Frame with the columns ``prompt_class``, ``model``, ``temperature``
        and ``yes_percent``. Defaults to the notebook-level ``df``.

    Returns
    -------
    float
        The Pearson correlation, or NaN when it is undefined (for example when
        the two prompt classes share no (model, temperature) combination).
    """
    frame = df if data is None else data

    def _mean_yes_by_model_and_temp(prompt_class):
        # One pass over the rows of this prompt class instead of one boolean
        # scan of the whole frame per (model, temperature) combination.
        rows = frame.loc[frame['prompt_class'] == prompt_class]
        return rows.groupby(['model', 'temperature'], observed=True)['yes_percent'].mean()

    prompt_class1_freq = _mean_yes_by_model_and_temp(prompt_class1)
    prompt_class2_freq = _mean_yes_by_model_and_temp(prompt_class2)

    # Series.corr aligns the two series on their shared (model, temperature) index
    # and skips NaN pairs, so incomplete combinations are left out.
    return prompt_class1_freq.corr(prompt_class2_freq)

In [None]:
# The text reports a very high correlation between responses to 'AC' and responses to
# its variants, in which 'infer' is replaced by 'conclude', 'deduce', 'logically infer',
# 'logically conclude', etc. Print that correlation for each variant.

ac_variants = ('cAC', 'dAC', 'gAC', 'gnAC', 'LAC', 'LcAC', 'LdAC', 'sAC', 'snAC')

for variant in ac_variants:
    variant_correlation = yes_correlation('AC', variant)
    print(f"Correlation between AC and {variant}: {variant_correlation}")

In [None]:
# The text reports a somewhat lower correlation between the regular versions of prompts
# and their 'v' versions. For each base prompt class, print its correlation with the
# 'x'-suffixed and the 'v'-prefixed variant.

base_prompts = ['MP', 'MT', 'AC', 'DA', 'DS', 'MTmi', 'MTmu', 'DSmi', 'DSmu']

for base in base_prompts:
    x_variant = base + 'x'
    v_variant = 'v' + base
    print(f"Correlation between {base} and {x_variant}: {yes_correlation(base, x_variant)}")
    print(f"Correlation between {base} and {v_variant}: {yes_correlation(base, v_variant)}")
    print()