<a href="https://colab.research.google.com/github/thaopham03/evaluating_fan_effects_in_large_language_models/blob/main/regenerate_anderson_prompt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import random
import os

In [None]:

# Mount Google Drive
# Colab-only step: `google.colab` is importable only inside a Colab runtime.
# The CSV inputs and the output directory used later in this notebook are all
# addressed under /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:



#inst_preamble = "Following is a list called List x. List x starts with ### and ends with $$$.After the list, a person will be judged as being present or absent in a place according to list x if list x says that person A is in place B,it should be said that in place B, person A is present...###{list x} $$$"


#fact_sep = '. '
#fact_list = ['A {} is in the {}']
#fact_sep = '. '
#fact_format = 'A {} is in the {}'
#fact_order =
#query_format = 'According to list x, in the {}, a {} is'
#query_order = 'According to list x, in the {place}, a {person} is'
#querty_order = query_format.format(*[obj[i] for i in query_order])
# Note: I'm assuming obj is one of our objects, i.e. a (person, place) pair. query_format is the pattern used to write the query, and query_order says how to fill it in; the result is then joined across multiple objects.
# Question: fact_list is a set of fact strings joined together. Does that mean many "A _ is in the _" sentences joined together, and what role does the formatting play here?
# So query_format is the sentence with blanks (how it is formatted), and query_order gives the same format with the data filled in, with all versions joined together.
# So fact_format is applied to every person and place and the results are joined together, i.e. fact_format.format(*[obj[i] for i in fact_order]).
# TODO (for later): understand what the list instruction does, so we can build on it.
# The two candidate continuations scored for every prompt.
# NOTE(review): generate_and_save_data defines its own local list with the same
# values, so this module-level copy is not read by the functions visible in
# this notebook — confirm whether anything else relies on it.
target_stimuli = ['present', 'absent']

In [None]:
# Load the CSV files
# Each sheet is reduced to a single column (a pandas Series of names).
people = pd.read_csv('/content/drive/MyDrive/llm_research/Experiments/shared/People - Sheet1.csv')['People']
# NOTE(review): this variable is called `place` (singular); the driver cell at
# the bottom of the notebook reloads the same file into `places`.
place = pd.read_csv('/content/drive/MyDrive/llm_research/Experiments/shared/Place - Sheet1 (1).csv')['Place']
# NOTE(review): outputs go under MyDrive/experiments/shared while the inputs are
# read from MyDrive/llm_research/Experiments/shared — confirm this is intended.
output_dir = '/content/drive/MyDrive/experiments/shared'
# NOTE(review): these two paths are not used by any code visible in this notebook.
probabilities_path = os.path.join(output_dir, 'probabilities.csv')
prompts_path = os.path.join(output_dir, 'prompts.csv')
# Ensure the output directory exists
os.makedirs(output_dir, exist_ok=True)



# Function to compute softmax values
def softmax(values):
    """Return the softmax of ``values``.

    The maximum entry is subtracted before exponentiating so that large
    inputs do not overflow; the shift cancels out in the normalisation.

    Args:
        values: NumPy array of scores.

    Returns:
        NumPy array of the same shape whose entries sum to 1.
    """
    shifted = values - np.max(values)
    weights = np.exp(shifted)
    return weights / weights.sum()

# Function to generate Gaussian and softmax values for people and places
def generate_probabilities(people, places):
    """Assign a random sampling probability to every person and every place.

    One standard-normal score is drawn per distinct name (people first, then
    places) and each group of scores is turned into probabilities with
    ``softmax``.

    Args:
        people: Iterable of person names.
        places: Iterable of place names.

    Returns:
        Tuple ``(people_probs, place_probs)`` of dicts mapping each distinct
        name to its probability; each dict's values sum to 1.
    """
    # De-duplicate while keeping first-seen order. Previously a repeated name
    # collapsed in the intermediate dict, so there were fewer scores than
    # names and zip() paired names with the wrong scores / dropped names.
    unique_people = list(dict.fromkeys(people))
    unique_places = list(dict.fromkeys(places))

    people_scores = [random.gauss(0, 1) for _ in unique_people]
    place_scores = [random.gauss(0, 1) for _ in unique_places]

    people_probs = dict(zip(unique_people, softmax(np.array(people_scores))))
    place_probs = dict(zip(unique_places, softmax(np.array(place_scores))))
    return people_probs, place_probs

In [None]:
# Function to generate objects set
def generate_objects(people, places, people_probs, place_probs, max_sentences, randomized):
    """Build the set of (person, place) pairs that make up one fact list.

    Args:
        people: Indexable collection of person names (used when not randomized).
        places: Indexable collection of place names (used when not randomized).
        people_probs: Dict name -> sampling weight (used when randomized).
        place_probs: Dict name -> sampling weight (used when randomized).
        max_sentences: Number of distinct pairs to draw when randomized.
        randomized: If True, sample pairs at random according to the weights;
            otherwise build the fixed, predefined pairing below from the first
            16 people and the first 16 places.

    Returns:
        Set of ``(person, place)`` tuples.

    Raises:
        ValueError: If ``randomized`` and ``max_sentences`` is larger than the
            number of distinct pairs that can be drawn (the sampling loop
            could otherwise never finish).
    """
    if randomized:
        # Hoisted out of the loop: names and weights do not change between draws.
        person_names = list(people_probs.keys())
        person_weights = list(people_probs.values())
        place_names = list(place_probs.keys())
        place_weights = list(place_probs.values())

        # Only names with a positive weight can ever be drawn.
        reachable_pairs = (sum(1 for w in person_weights if w > 0)
                           * sum(1 for w in place_weights if w > 0))
        if max_sentences > reachable_pairs:
            raise ValueError(
                "max_sentences={} exceeds the {} distinct (person, place) pairs "
                "that can be sampled".format(max_sentences, reachable_pairs)
            )

        objects = set()
        # Rejection sampling: duplicates are absorbed by the set, so keep
        # drawing until enough distinct pairs have been collected.
        while len(objects) < max_sentences:
            person = random.choices(person_names, weights=person_weights)[0]
            place = random.choices(place_names, weights=place_weights)[0]
            objects.add((person, place))
        return objects

    # Fixed design: lowercase letters label people, uppercase letters label
    # places; the i-th letter maps to the i-th entry of the input.
    lowercase_letters = 'abcdefghijklmnor'
    uppercase_letters = 'ABCDEFGHIJKLMNOR'
    people_dict = {lowercase_letters[i]: people[i] for i in range(len(lowercase_letters))}
    places_dict = {uppercase_letters[i]: places[i] for i in range(len(uppercase_letters))}
    predefined_pairs = {
        ('a', 'A'), ('b', 'B'), ('c', 'C'), ('d', 'D'), ('e', 'E'), ('f', 'F'), ('g', 'G'), ('h', 'H'),
        ('i', 'I'), ('j', 'J'), ('k', 'K'), ('l', 'L'), ('e', 'K'), ('r', 'R'), ('g', 'J'), ('h', 'R'),
        ('i', 'L'), ('m', 'M'), ('n', 'N'), ('o', 'O'), ('d', 'M'), ('r', 'N'), ('f', 'O'), ('g', 'M'),
        ('h', 'N'), ('i', 'O')
    }
    return {(people_dict[p], places_dict[l]) for p, l in predefined_pairs}



# Function to generate prompt components
def generate_prompt_components(objects, inst_preamble, fact_format, fact_order, fact_sep, query_format, query_order, query_sep):
    """Assemble the prompt text: preamble, then all facts, then all queries.

    Each object is rendered once with ``fact_format`` and once with
    ``query_format``; ``fact_order`` / ``query_order`` list which positions of
    the object fill the template's placeholders, in order. Rendered facts are
    joined with ``fact_sep`` and rendered queries with ``query_sep``.

    Note that the three parts are concatenated directly: no separator is
    inserted between the preamble, the fact list and the queries.

    Returns:
        The assembled prompt string.
    """
    def render_all(template, positions, separator):
        # Render every object with one template and join the results.
        rendered = []
        for item in objects:
            fields = [item[pos] for pos in positions]
            rendered.append(template.format(*fields))
        return separator.join(rendered)

    facts_text = render_all(fact_format, fact_order, fact_sep)
    queries_text = render_all(query_format, query_order, query_sep)
    return "{}{}{}".format(inst_preamble, facts_text, queries_text)

# Function to print objects and prompts
def print_objects_and_prompts(objects, prompt):
    """Print the object collection and the prompt, each on its own labelled line."""
    labelled_values = (("Objects:", objects), ("Prompts:", prompt))
    for label, value in labelled_values:
        print(label, value)

In [None]:
# Function to calculate fan value
def fan(x, objects, index=None):
    """Return the fraction of ``objects`` that involve ``x``.

    Args:
        x: The name to look for.
        objects: Sized collection of tuples (e.g. ``(person, place)`` pairs).
        index: If given, only that position of each tuple is compared with
            ``x``; if None, ``x`` counts when it appears anywhere in the tuple.

    Returns:
        Matching objects divided by the total number of objects, or 0.0 when
        ``objects`` is empty (previously this raised ZeroDivisionError).
    """
    total = len(objects)
    if total == 0:
        # Nothing can match in an empty collection.
        return 0.0
    if index is not None:
        matches = sum(1 for obj in objects if obj[index] == x)
    else:
        matches = sum(1 for obj in objects if x in obj)
    return matches / total


# Function to print fan values
def print_fan_values(objects, fact_format):
    """Print one line per (person, place) pair with its rendered fact and fan values.

    Both fan values are computed with ``fan`` called without an index, i.e. a
    name counts whenever it appears in either position of a pair.

    Args:
        objects: Collection of ``(person, place)`` tuples.
        fact_format: Template with two placeholders, filled as (person, place).
    """
    line_template = 'The sentence "{}" has fan values: person: {}, place: {}'
    for person, plc in objects:
        person_fan = fan(person, objects)
        place_fan = fan(plc, objects)
        sentence = fact_format.format(person, plc)
        print(line_template.format(sentence, person_fan, place_fan))


In [None]:
# Function to generate final data and save to CSV
def generate_and_save_data(people, places, objects, prompt, path, randomized, fact_sep, fact_format, include_all_cases):
    """Build one row per (stimulus, person, place) combination and write them to CSV.

    Every row holds the full text shown to the model (``prompt`` followed by a
    question about one person/place sentence), the fan values of the person
    and the place, the stimulus word being scored and the true category.

    Args:
        people: Iterable of person names.
        places: Iterable of place names.
        objects: Collection of ``(person, place)`` tuples present in the prompt.
        prompt: Prompt text that precedes the per-row question.
        path: Destination CSV path.
        randomized: If False, fan values are computed per position (person at
            index 0, place at index 1); if True, ``fan`` is called without an
            index.
        fact_sep: Separator placed between ``prompt`` and the question.
            Note the order: ``fact_sep`` comes BEFORE ``fact_format`` in this
            signature, so pass both by keyword to avoid swapping them.
        fact_format: Template with two placeholders, filled as (person, place).
        include_all_cases: If True, emit both stimuli for every pair. If False,
            emit only the rows whose stimulus equals the true category.
    """
    # Only this tail is run through str.format, so braces that happen to occur
    # in `prompt` (which is built from data) cannot break the substitution.
    question_template = f'{fact_sep}Sentence 1 is "{fact_format}". In the previous list, Sentence 1 is '
    #instead {istruction}{fact_list}{query}. fact_list: person and place grouped together, query: senetnce that does if this is that

    # Fan values depend only on the name, not on the stimulus or the partner,
    # so compute them once per name instead of inside the innermost loop.
    person_index = None if randomized else 0
    place_index = None if randomized else 1
    person_fans = [(person, fan(person, objects, person_index)) for person in people]
    place_fans = [(place, fan(place, objects, place_index)) for place in places]

    data = []
    target_stimuli = ['present', 'absent']
    for stim in target_stimuli:
        for person, fan_person in person_fans:
            for place, fan_place in place_fans:
                true_category = "present" if (person, place) in objects else "absent"
                # The original test used `or` in its second clause, which made
                # it always true and silently disabled this filter.
                if not (include_all_cases or stim == true_category):
                    continue
                # Combine prompt and preamble
                final_sentence = f'{prompt}{question_template.format(person, place)}'
                data.append([final_sentence, person, fan_person, place, fan_place, stim, true_category])

    new_prompts_df = pd.DataFrame(data, columns=['Preamble', 'Person', 'Fan_Person', 'Place', 'Fan_Place', 'Dependent_Variable', 'True_Category'])
    new_prompts_df.to_csv(path, index=False)
    print("New Prompts Data:")
    print(new_prompts_df)

In [None]:
# Regenerating Prompts
def run_experiment(people_path, places_path, output_dir, max_sentences=50, randomized=True, inst_preamble="Following is a list called List x. List x starts with ### and ends with $$$. ",
                   fact_format="A {} is in the {}",
                   fact_order=[0, 1],
                   fact_sep=". ",
                   query_format="According to list x, in the {}, a {} is",
                   query_order=[1, 0],
                   query_sep=". ", include_all_cases = True, seed=None):
    """Generate one prompt set and save it as a CSV in ``output_dir``.

    Args:
        people_path: Path to a CSV with a 'People' column, or an already
            loaded collection of person names (the notebook passes a Series).
        places_path: Path to a CSV with a 'Place' column, or an already loaded
            collection of place names.
        output_dir: Directory the CSV is written to; created if missing.
        max_sentences: Number of distinct pairs to sample when randomized.
        randomized: True samples pairs at random and writes 'random.csv';
            False uses the predefined pairing and writes 'anderson_true.csv'.
        inst_preamble, fact_format, fact_order, fact_sep, query_format,
        query_order, query_sep: Prompt-building components forwarded to
            ``generate_prompt_components``.
        include_all_cases: Forwarded to ``generate_and_save_data``.
        seed: Optional seed for the ``random`` module, applied before any
            sampling so the same pairs are drawn again on a re-run. None
            (default) leaves the random state untouched.
    """
    # Previously these three arguments were ignored and hard-coded paths were
    # used instead. Accept either a path or pre-loaded data, because existing
    # callers pass the loaded Series.
    if isinstance(people_path, (str, os.PathLike)):
        people = pd.read_csv(people_path)['People']
    else:
        people = people_path
    if isinstance(places_path, (str, os.PathLike)):
        places = pd.read_csv(places_path)['Place']
    else:
        places = places_path

    os.makedirs(output_dir, exist_ok=True)

    if seed is not None:
        random.seed(seed)

    if randomized:
        people_probs, place_probs = generate_probabilities(people, places)
    else:
        people_probs, place_probs = None, None
    objects = generate_objects(people, places, people_probs, place_probs, max_sentences, randomized)

    prompt = generate_prompt_components(objects, inst_preamble, fact_format, fact_order, fact_sep, query_format, query_order, query_sep)

    print_objects_and_prompts(objects, prompt)
    print_fan_values(objects, fact_format)  # Print fan values
    new_prompts_path = os.path.join(output_dir, 'random.csv' if randomized else 'anderson_true.csv')
    # Keywords on purpose: generate_and_save_data takes fact_sep BEFORE
    # fact_format, and the old positional call passed them swapped.
    generate_and_save_data(people, places, objects, prompt, new_prompts_path, randomized,
                           fact_sep=fact_sep, fact_format=fact_format,
                           include_all_cases=include_all_cases)

In [None]:
# Parameters
os.makedirs(output_dir, exist_ok=True)

people = pd.read_csv('/content/drive/MyDrive/llm_research/Experiments/shared/People - Sheet1.csv')['People']
places = pd.read_csv('/content/drive/MyDrive/llm_research/Experiments/shared/Place - Sheet1 (1).csv')['Place']

# Run both experiments with the same user-defined components
user_defined_components = {
    'inst_preamble': "Following is a list called List x. List x starts with ### and ends with $$$. ",
    'fact_format': "A {} is in the {}",
    'fact_order': [0, 1],
    'fact_sep': ". ",
    'query_format': "According to list x, in the {}, a {} is",
    'query_order': [1, 0],
    'query_sep': ". "
}

# One randomized run (written to random.csv) and one predefined run (written
# to anderson_true.csv). Each run overwrites its own file, so the second,
# copy-pasted pair of identical calls that used to follow only repeated the
# work and has been dropped.
run_experiment(people, places, output_dir, randomized=True, **user_defined_components, include_all_cases=True)
run_experiment(people, places, output_dir, randomized=False, **user_defined_components, include_all_cases=False)

Objects: {('Marketing Manager', 'Courthouse'), ('Police Officer', 'Clinic'), ('Police Officer', 'Church'), ('Police Officer', 'Office Building'), ('Teacher', 'University'), ('Lawyer', 'Church'), ('Marketing Manager', 'Office Building'), ('Engineer', 'Studio'), ('Marketing Manager', 'Church'), ('Engineer', 'Hospital'), ('Clerk', 'Church'), ('Marketing Manager', 'Restaurant'), ('Doctor', 'University'), ('Financial Analyst', 'Park'), ('Financial Analyst', 'Police Station'), ('Pastor', 'Clinic'), ('Nurse', 'Studio'), ('Accountant', 'Construction Site'), ('Police Officer', 'Studio'), ('Engineer', 'University'), ('Electrician', 'University'), ('Pastor', 'Church'), ('Police Officer', 'Construction Site'), ('Marketing Manager', 'Studio'), ('Marketing Manager', 'Construction Site'), ('Clerk', 'Studio'), ('Marketing Manager', 'Hospital'), ('Police Officer', 'Park'), ('Clerk', 'Construction Site'), ('Journalist', 'Church'), ('Clerk', 'Hospital'), ('Police Officer', 'Police Station'), ('Psychologi