<a href="https://colab.research.google.com/github/thaopham03/evaluating_fan_effects_in_large_language_models/blob/main/Anderson_Experimental_Design.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
import math
import os

In [2]:
# Mount Google Drive at /content/drive so the /content/drive/MyDrive/... paths used
# below resolve. `google.colab` is imported here, so this cell needs a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Load the CSV files, keeping only the 'People' / 'Place' column of each sheet.
# NOTE(review): in the cells visible here run_experiment() reads both CSVs again by
# itself, so `people`, `place`, `probabilities_path` and `prompts_path` look unused
# at module level — confirm against the rest of the notebook before removing them.
people = pd.read_csv('/content/drive/MyDrive/llm_research/Experiments/shared/People - Sheet1.csv')['People']
place = pd.read_csv('/content/drive/MyDrive/llm_research/Experiments/shared/Place - Sheet1 (1).csv')['Place']
# Output folder; note it is a different Drive folder from the one the inputs are read from.
# NOTE(review): confirm the differing folder ('experiments/shared' vs 'llm_research/Experiments/shared') is intended.
output_dir = '/content/drive/MyDrive/experiments/shared'
probabilities_path = os.path.join(output_dir, 'probabilities.csv')
prompts_path = os.path.join(output_dir, 'prompts.csv')
# Ensure the output directory exists (exist_ok=True makes re-running the cell safe)
os.makedirs(output_dir, exist_ok=True)



# Function to compute softmax values
def softmax(values):
    """Return the softmax of ``values``.

    The maximum is subtracted before exponentiating so that the largest
    exponent is ``exp(0) == 1``; this leaves the result unchanged while
    avoiding overflow for large inputs.

    Args:
        values: Array-like of scores.

    Returns:
        Array of the same shape whose entries are positive and sum to 1.
    """
    shifted = values - np.max(values)
    weights = np.exp(shifted)
    total = weights.sum()
    return weights / total

# Function to generate Gaussian and softmax values for people and places
def generate_probabilities(people, places):
    """Assign a random sampling probability to every person and every place.

    Each name receives a standard-normal score (``random.gauss(0, 1)``); the
    scores of each group are then passed through ``softmax`` so that they form
    a probability distribution over that group.

    Args:
        people: Iterable of person names.
        places: Iterable of place names.

    Returns:
        Tuple ``(people_probs, place_probs)`` of dicts mapping each distinct
        name to its probability. Each dict's values sum to 1.
    """
    people_gauss = {person: random.gauss(0, 1) for person in people}
    place_gauss = {place: random.gauss(0, 1) for place in places}
    people_softmax = softmax(np.array(list(people_gauss.values())))
    place_softmax = softmax(np.array(list(place_gauss.values())))
    # Pair the probabilities with the keys of the score dicts rather than with
    # the raw inputs: the dicts have already collapsed duplicate names, so
    # zipping against the raw sequence would shift or drop entries whenever a
    # name is repeated.
    people_probs = dict(zip(people_gauss, people_softmax))
    place_probs = dict(zip(place_gauss, place_softmax))
    return people_probs, place_probs



In [4]:
# Function to generate objects set
def generate_objects(people, places, people_probs, place_probs, max_sentences, randomized):
    """Build the set of (person, place) pairs that make up the sentence list.

    Args:
        people: Sequence of person names (only used when ``randomized`` is false).
        places: Sequence of place names (only used when ``randomized`` is false).
        people_probs: Dict name -> sampling weight (only used when ``randomized``).
        place_probs: Dict name -> sampling weight (only used when ``randomized``).
        max_sentences: Number of distinct pairs to draw when ``randomized``;
            ignored for the fixed design.
        randomized: If true, sample pairs at random according to the weights;
            otherwise use the fixed, hard-coded design of 26 pairs.

    Returns:
        A set of ``(person, place)`` tuples.

    Raises:
        ValueError: If ``max_sentences`` distinct pairs cannot be drawn from the
            given weights, or if the fixed design is requested with fewer than
            16 people or 16 places.
    """
    if randomized:
        # Build the candidate and weight lists once instead of on every draw.
        person_names = list(people_probs.keys())
        person_weights = list(people_probs.values())
        place_names = list(place_probs.keys())
        place_weights = list(place_probs.values())

        # Only names with a positive weight can ever be drawn. Asking for more
        # distinct pairs than can exist would make the loop below spin forever.
        reachable_people = sum(1 for w in person_weights if w > 0)
        reachable_places = sum(1 for w in place_weights if w > 0)
        capacity = reachable_people * reachable_places
        if max_sentences > capacity:
            raise ValueError(
                "max_sentences={} exceeds the {} distinct (person, place) pairs "
                "that can be sampled".format(max_sentences, capacity)
            )

        objects = set()
        while len(objects) < max_sentences:
            person = random.choices(person_names, weights=person_weights)[0]
            place = random.choices(place_names, weights=place_weights)[0]
            objects.add((person, place))
        return objects

    lowercase_letters = 'abcdefghijklmnor'
    uppercase_letters = 'ABCDEFGHIJKLMNOR'

    # Index by position: a pandas Series would otherwise be indexed by label,
    # which only coincides with position for a default 0..n-1 index.
    people_list = list(people)
    places_list = list(places)
    if len(people_list) < len(lowercase_letters) or len(places_list) < len(uppercase_letters):
        raise ValueError(
            "the fixed design needs at least {} people and {} places, got {} and {}".format(
                len(lowercase_letters), len(uppercase_letters), len(people_list), len(places_list)
            )
        )

    # Letter i stands for the i-th person (lowercase) / i-th place (uppercase).
    people_dict = {letter: people_list[i] for i, letter in enumerate(lowercase_letters)}
    places_dict = {letter: places_list[i] for i, letter in enumerate(uppercase_letters)}
    predefined_pairs = {
        ('a', 'A'), ('b', 'B'), ('c', 'C'), ('d', 'D'), ('e', 'E'), ('f', 'F'), ('g', 'G'), ('h', 'H'),
        ('i', 'I'), ('j', 'J'), ('k', 'K'), ('l', 'L'), ('e', 'K'), ('r', 'R'), ('g', 'J'), ('h', 'R'),
        ('i', 'L'), ('m', 'M'), ('n', 'N'), ('o', 'O'), ('d', 'M'), ('r', 'N'), ('f', 'O'), ('g', 'M'),
        ('h', 'N'), ('i', 'O')
    }
    return {(people_dict[p], places_dict[l]) for p, l in predefined_pairs}



# Function to generate prompts
def generate_prompts(objects):
    """Render the sentence list shown to the model.

    The result is the header line followed by one line per pair, each of the
    form ``a <person> is in the <place>``. Lines are joined with newlines, so
    the header sits on its own line instead of being fused with the first
    sentence.

    Args:
        objects: Iterable of ``(person, place)`` pairs. Sentences appear in the
            iteration order of ``objects``.

    Returns:
        The prompt as a single string (just the header when ``objects`` is empty).
    """
    prefix = "Following is a list of sentences"
    sentences = ["a {} is in the {}".format(pair[0], pair[1]) for pair in objects]
    return '\n'.join([prefix] + sentences)


# Function to print objects and prompts
def print_objects_and_prompts(objects, prompt):
    """Print the pair collection and the rendered prompt, one labelled line each.

    Args:
        objects: The (person, place) pairs; printed using their default repr.
        prompt: The prompt text built from those pairs.
    """
    labelled = (("Objects:", objects), ("Prompts:", prompt))
    for label, value in labelled:
        print(label, value)





In [5]:
# Function to calculate fan value
def fan(x, objects, index=None):
    """Return the fraction of ``objects`` that mention ``x``.

    Args:
        x: The name to look for.
        objects: Sized collection of tuples (here ``(person, place)`` pairs).
        index: If given, only that tuple position is compared with ``x``.
            If ``None``, a tuple counts when ``x`` appears in any position.

    Returns:
        Number of matching tuples divided by ``len(objects)``, as a float.
        ``0.0`` when ``objects`` is empty (nothing can match, and this avoids
        a division by zero).
    """
    total = len(objects)
    if total == 0:
        return 0.0
    if index is not None:
        matches = sum(1 for obj in objects if obj[index] == x)
    else:
        matches = sum(1 for obj in objects if x in obj)
    return matches / total


# Function to print fan values
def print_fan_values(objects):
    """Print one line per pair reporting the fan of its person and of its place.

    Both values come from ``fan`` called without an ``index``, i.e. a name is
    counted wherever it appears in a pair.

    Args:
        objects: Collection of ``(person, place)`` pairs.
    """
    for person, plc in objects:
        fan_person, fan_place = fan(person, objects), fan(plc, objects)
        print(f'The sentence "a {person} is in the {plc}" has fan values: person: {fan_person}, place: {fan_place}')

In [6]:
# Function to generate final data and save to CSV
def generate_and_save_data(people, places, objects, prompt, path, randomized):
    """Build one query row per (person, place) combination and write them to CSV.

    Every combination of a person and a place becomes a row, whether or not the
    pair is in ``objects``. Columns written, in order: ``Preamble`` (the prompt
    followed by the query about this sentence), ``Person``, ``Fan_Person``,
    ``Place``, ``Fan_Place``, ``Dependent_Variable`` (always ``"present"``) and
    ``True_Category`` (``"present"`` if the pair is in ``objects``, else
    ``"absent"``). The file is written with ``|`` as separator and no index
    column, and the resulting frame is printed.

    Args:
        people: Iterable of person names.
        places: Iterable of place names.
        objects: Collection of ``(person, place)`` pairs in the sentence list.
        prompt: The rendered sentence list that every preamble starts with.
        path: Destination CSV path.
        randomized: Selects how fan values are matched (see comment below).
    """
    # Fixed design: a person is matched only in slot 0 and a place only in slot 1.
    # Randomized design: index=None, so fan() counts the name in either slot.
    # NOTE(review): the two modes only differ if a name occurs both as a person
    # and as a place — confirm whether the any-slot matching is intended.
    person_index = None if randomized else 0
    place_index = None if randomized else 1

    people_list = list(people)
    place_list = list(places)

    # A name's fan does not depend on what it is paired with, so compute it once
    # per name instead of once per (person, place) cell. Nothing is computed when
    # either list is empty, since no rows are produced in that case.
    if people_list and place_list:
        person_fans = [fan(person, objects, person_index) for person in people_list]
        place_fans = [fan(plc, objects, place_index) for plc in place_list]
    else:
        person_fans, place_fans = [], []

    dependent_variable = "present"
    data = []
    for person, fan_person in zip(people_list, person_fans):
        for plc, fan_place in zip(place_list, place_fans):
            true_category = "present" if (person, plc) in objects else "absent"
            # Combine prompt and preamble
            prompt_key = f"a {person} is in the {plc}"
            query = f'Sentence 1 is "{prompt_key}". In the previous list, Sentence 1 is '
            final_sentence = f"{prompt}\n{query}"
            data.append([final_sentence, person, fan_person, plc, fan_place, dependent_variable, true_category])

    new_prompts_df = pd.DataFrame(data, columns=['Preamble', 'Person', 'Fan_Person', 'Place', 'Fan_Place', 'Dependent_Variable', 'True_Category'])
    new_prompts_df.to_csv(path, index=False, sep='|')
    print("New Prompts Data:")
    print(new_prompts_df)



In [7]:
# Main function to run experiments
def run_experiment(people_path, places_path, output_dir, max_sentences=50, randomized=True):
    """Run one experiment end to end: load names, build sentences, save queries.

    Steps: read the ``People`` and ``Place`` columns from the two CSVs, build
    the set of (person, place) pairs (random or fixed design), render the
    prompt, print the pairs, prompt and fan values, and write the query table
    to ``output_dir``.

    Args:
        people_path: CSV file with a ``People`` column.
        places_path: CSV file with a ``Place`` column.
        output_dir: Folder that receives the result CSV; created if missing.
        max_sentences: Number of distinct pairs to sample when ``randomized``.
        randomized: If true, sample pairs using random softmax weights and
            write ``new_file_randomized.csv``; otherwise use the fixed design
            and write ``new_file_not_random.csv``.
    """
    # Honour the caller's paths; previously they were overwritten by hard-coded
    # values, so the arguments had no effect.
    people = pd.read_csv(people_path)['People']
    places = pd.read_csv(places_path)['Place']
    os.makedirs(output_dir, exist_ok=True)

    if randomized:
        people_probs, place_probs = generate_probabilities(people, places)
    else:
        # The fixed design does not sample, so it needs no weights.
        people_probs, place_probs = None, None

    objects = generate_objects(people, places, people_probs, place_probs, max_sentences, randomized)
    prompt = generate_prompts(objects)
    print_objects_and_prompts(objects, prompt)
    print_fan_values(objects)  # Print fan values

    new_prompts_path = os.path.join(output_dir, 'new_file_randomized.csv' if randomized else 'new_file_not_random.csv')
    generate_and_save_data(people, places, objects, prompt, new_prompts_path, randomized)

# Parameters: input CSVs and output folder on the mounted Google Drive
people_path = '/content/drive/MyDrive/llm_research/Experiments/shared/People - Sheet1.csv'
places_path = '/content/drive/MyDrive/llm_research/Experiments/shared/Place - Sheet1 (1).csv'
output_dir = '/content/drive/MyDrive/experiments/shared'
# exist_ok=True so re-running the cell does not fail when the folder is already there
os.makedirs(output_dir, exist_ok=True)

# Run both experiments: the randomized design writes new_file_randomized.csv,
# the fixed design writes new_file_not_random.csv (both with the default max_sentences=50)
run_experiment(people_path, places_path, output_dir, randomized=True)
run_experiment(people_path, places_path, output_dir, randomized=False)

Objects: {('Librarian', 'Bank'), ('Electrician', 'Bank'), ('Accountant', 'Clinic'), ('Software Developer', 'Bank'), ('Architect', 'School'), ('Journalist', 'Agency'), ('Software Developer', 'Clinic'), ('Financial Analyst', 'Studio'), ('Teacher', 'Library'), ('Software Developer', 'University'), ('Marketing Manager', 'Bank'), ('Chef', 'Agency'), ('Graphic Designer', 'Library'), ('Librarian', 'Agency'), ('Electrician', 'Agency'), ('Accountant', 'Agency'), ('Librarian', 'Studio'), ('Electrician', 'Studio'), ('Clerk', 'Construction Site'), ('Software Developer', 'Agency'), ('Software Developer', 'Studio'), ('Teacher', 'Pharmacy'), ('Architect', 'Bank'), ('Marketing Manager', 'Agency'), ('Teacher', 'Restaurant'), ('Journalist', 'Library'), ('Chef', 'Library'), ('Financial Analyst', 'Clinic'), ('Electrician', 'Factory'), ('Teacher', 'Construction Site'), ('Librarian', 'Library'), ('Architect', 'Agency'), ('Clerk', 'Agency'), ('Graphic Designer', 'Construction Site'), ('Teacher', 'Bank'), ('S