### Information Statements

In [27]:
import pandas as pd
import csv

# Languages and background-disclosure types covered by the setup sheets.
languages = ['english', 'dutch']
types = ['explicit', 'implicit']

# Demographic attribute values that are crossed with each other below.
# NOTE(review): '[MASK]' is presumably the "attribute withheld" placeholder --
# confirm against the prompt templates.
backgrounds = ['[MASK]', 'Dutch', 'Moroccan', 'Turkish', 'European-American', 'African-American', 'Mexican']
genders = ['[MASK]', 'Male', 'Female']
ages = ['[MASK]', '25', '40', '65']

# Prompt-mitigation identifiers per task and language.
# The misspelled name `eng_decision_mitgation` is kept because other cells
# reference it; a correctly spelled alias is provided right below.
eng_decision_mitgation = ['eng_decision_default', 'eng_decision_level_1', 'eng_decision_level_1B', 'eng_decision_level_2',
                          'eng_decision_level_3', 'eng_decision_level_4', 'eng_decision_level_5']
eng_decision_mitigation = eng_decision_mitgation
# NOTE(review): the third entry used to read 'eng_decision_level_1B', which put
# an English prompt id in the Dutch list. It now follows the 'nl_' pattern of
# its neighbours; confirm that a prompt with this id exists.
nl_decision_mitigation = ['nl_decision_default', 'nl_decision_level_1', 'nl_decision_level_1B', 'nl_decision_level_2',
                          'nl_decision_level_3', 'nl_decision_level_4', 'nl_decision_level_5']
eng_summary_mitigation = ['eng_summarizing_default', 'eng_summarizing_extractive', 'eng_summarizing_level_1', 'eng_summarizing_level_2',
                          'eng_summarizing_level_3', 'eng_summarizing_level_4', 'eng_summarizing_level_5']
nl_summary_mitigation = ['nl_summarizing_default', 'nl_summarizing_extractive', 'nl_summarizing_level_1', 'nl_summarizing_level_2',
                         'nl_summarizing_level_3', 'nl_summarizing_level_4', 'nl_summarizing_level_5']

# Number of distinct (gender, background, age) combinations.
combinations = len(genders) * len(backgrounds) * len(ages)

# Load the unfilled explicit English prompts (JSON Lines, one record per line)
# and keep each distinct decision question id once.
df = pd.read_json(path_or_buf='../data_input/unfilled_explicit_english.jsonl', lines=True)
question_ids = pd.unique(df['decision_question_id'])

print(question_ids)

[ 0  1  3  4  5  7  8  9 10 11 12 13 14 15 16 18 19 21 23 24 26 27 28 29
 30 31 33 34 36 37 40 41 42 43 44 47 48 52 54 55 56 59 60 61 62 64 65 66
 67 70 71 72 73 74 75 76 77 78 80 81 82 83 84 87 89 90 92 93 94 95]


### Decision Task - Explicit 

In [21]:
# Write the setup sheet for the decision task with explicit demographics:
# one placeholder row per (question, mitigation prompt, background, gender, age).
language = 'dutch'
question_limit = 120  # only question ids strictly below this value are written
type_background = 'explicit'
task = 'decision'

# Resolve the prompt list up front and fail loudly on an unsupported language.
# An if/elif without else would leave `decision_prompts` unset, or silently
# reuse the value left behind by a previously executed cell.
prompts_by_language = {'english': eng_decision_mitgation, 'dutch': nl_decision_mitigation}
if language not in prompts_by_language:
    raise ValueError(f'Unsupported language {language!r}; expected one of {sorted(prompts_by_language)}')
decision_prompts = prompts_by_language[language]

# Explicit encoding so the output does not depend on the platform's locale.
with open(f'{language}_{type_background}_setup.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['run', 'llm', 'task', 'language', 'type_background', 'group_id','prompt_mitigation', 'question_id','background', 'gender','age',
                    'yes_prob', 'no_prob', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Columns that are written as fixed placeholder values in every row.
    placeholders = {'run': '-',
                    'llm': 'NaN',
                    'yes_prob': 'NaN',
                    'no_prob': 'NaN',
                    'top_1': 'NaN',
                    'top_2': 'NaN',
                    'top_3': 'NaN',
                    'top_4': 'NaN',
                    'top_5': 'NaN'}

    for question_id in question_ids:
        # Skip (do not stop at) question ids at or above the limit.
        if question_id >= question_limit:
            continue
        for mitigation in decision_prompts:
            # group_id restarts for every (question, mitigation) pair, so a given
            # number always denotes the same (background, gender, age) combination.
            group_id = 0
            for background in backgrounds:
                for gender in genders:
                    for age in ages:
                        group_id += 1

                        writer.writerow({**placeholders,
                                         'task': task,
                                         'language': language,
                                         'type_background': type_background,
                                         'group_id': group_id,
                                         'prompt_mitigation': mitigation,
                                         'question_id': question_id,
                                         'background': background,
                                         'gender': gender,
                                         'age': age})

### Decision Task - Implicit

In [24]:
# Write the setup sheet for the decision task with implicit demographics:
# one placeholder row per (question, mitigation prompt, background, gender, age,
# first name / surname pair), with the names taken from names.json.
language = 'dutch'
question_limit = 120  # only question ids strictly below this value are written
type_background = 'implicit'
task = 'decision'

# Resolve the prompt list up front and fail loudly on an unsupported language.
# An if/elif without else would leave `decision_prompts` unset, or silently
# reuse the value left behind by a previously executed cell.
prompts_by_language = {'english': eng_decision_mitgation, 'dutch': nl_decision_mitigation}
if language not in prompts_by_language:
    raise ValueError(f'Unsupported language {language!r}; expected one of {sorted(prompts_by_language)}')
decision_prompts = prompts_by_language[language]

df = pd.read_json('../data_input/names.json')

# The name pairs depend only on (background, gender), so look them up once
# instead of re-filtering the frame for every question, mitigation and age.
# NOTE(review): a (background, gender) pair with no entry in names.json --
# possibly the '[MASK]' values, confirm -- yields no pairs and therefore no rows.
name_pairs = {}
for background in backgrounds:
    for gender in genders:
        selected = (df['background'] == background) & (df['gender'] == gender)
        first_names = df.loc[selected, 'first_names'].explode().tolist()
        surnames = df.loc[selected, 'surnames'].explode().tolist()
        # zip pairs the names positionally and stops at the shorter list.
        name_pairs[(background, gender)] = list(zip(first_names, surnames))

# Names may contain non-ASCII characters; write UTF-8 regardless of the locale.
with open(f'{language}_{type_background}_setup.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['run', 'llm', 'task', 'language', 'group_id', 'type_background', 'first_name', 'surname', 'prompt_mitigation','question_id','background', 'gender','age',
                    'yes_prob', 'no_prob', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Columns that are written as fixed placeholder values in every row.
    placeholders = {'run': '-',
                    'llm': 'NaN',
                    'yes_prob': 'NaN',
                    'no_prob': 'NaN',
                    'top_1': 'NaN',
                    'top_2': 'NaN',
                    'top_3': 'NaN',
                    'top_4': 'NaN',
                    'top_5': 'NaN'}

    for question_id in question_ids:
        # Skip (do not stop at) question ids at or above the limit.
        if question_id >= question_limit:
            continue
        for mitigation in decision_prompts:
            # group_id restarts for every (question, mitigation) pair and advances
            # once per (background, gender, age), even when no names are found.
            group_id = 0
            for background in backgrounds:
                for gender in genders:
                    for age in ages:
                        group_id += 1

                        for first_name, surname in name_pairs[(background, gender)]:
                            writer.writerow({**placeholders,
                                             'task': task,
                                             'language': language,
                                             'group_id': group_id,
                                             'type_background': type_background,
                                             'first_name': first_name,
                                             'surname': surname,
                                             'prompt_mitigation': mitigation,
                                             'question_id': question_id,
                                             'background': background,
                                             'gender': gender,
                                             'age': age})

### Summary Task - Explicit

In [29]:
# Write the setup sheet for the summary task with explicit demographics:
# one placeholder row per (question, mitigation prompt, background, gender, age,
# summary iteration).
language = 'english'
question_limit = 120  # only question ids strictly below this value are written
summary_iterations = 2  # number of rows written per demographic combination

type_background = 'explicit'
task = 'summary'

# Resolve the prompt list up front and fail loudly on an unsupported language.
# An if/elif without else would leave `summary_prompts` unset, or silently
# reuse the value left behind by a previously executed cell.
prompts_by_language = {'english': eng_summary_mitigation, 'dutch': nl_summary_mitigation}
if language not in prompts_by_language:
    raise ValueError(f'Unsupported language {language!r}; expected one of {sorted(prompts_by_language)}')
summary_prompts = prompts_by_language[language]

# Explicit encoding so the output does not depend on the platform's locale.
with open(f'summary_{language}_{type_background}_setup.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['run', 'run_summary','llm', 'task', 'language', 'type_background', 'group_id','prompt_mitigation', 'question_id','background', 'gender','age',
                    'yes_prob', 'no_prob', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5', 'summary_iteration', 'summary']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Columns that are written as fixed placeholder values in every row.
    placeholders = {'run': '-',
                    'run_summary': '-',
                    'llm': 'NaN',
                    'yes_prob': 'NaN',
                    'no_prob': 'NaN',
                    'top_1': 'NaN',
                    'top_2': 'NaN',
                    'top_3': 'NaN',
                    'top_4': 'NaN',
                    'top_5': 'NaN',
                    'summary': 'NaN'}

    for question_id in question_ids:
        # Skip (do not stop at) question ids at or above the limit.
        if question_id >= question_limit:
            continue
        for mitigation in summary_prompts:
            # group_id restarts for every (question, mitigation) pair, so a given
            # number always denotes the same (background, gender, age) combination.
            group_id = 0
            for background in backgrounds:
                for gender in genders:
                    for age in ages:
                        group_id += 1
                        for summary_iteration in range(summary_iterations):
                            writer.writerow({**placeholders,
                                             'task': task,
                                             'language': language,
                                             'type_background': type_background,
                                             'group_id': group_id,
                                             'prompt_mitigation': mitigation,
                                             'question_id': question_id,
                                             'background': background,
                                             'gender': gender,
                                             'age': age,
                                             # Record the actual iteration index (this was the
                                             # literal 'NaN', which made the repeated rows
                                             # indistinguishable); this matches the implicit
                                             # summary sheet.
                                             'summary_iteration': summary_iteration})

### Summary Task - Implicit

In [25]:
# Write the setup sheet for the summary task with implicit demographics:
# one placeholder row per (question, mitigation prompt, background, gender, age,
# summary iteration, first name / surname pair), with names from names.json.
language = 'dutch'
question_limit = 120  # only question ids strictly below this value are written
summary_iterations = 2  # number of row sets written per demographic combination

type_background = 'implicit'
task = 'summary'

# Resolve the prompt list up front and fail loudly on an unsupported language.
# An if/elif without else would leave `summary_prompts` unset, or silently
# reuse the value left behind by a previously executed cell.
prompts_by_language = {'english': eng_summary_mitigation, 'dutch': nl_summary_mitigation}
if language not in prompts_by_language:
    raise ValueError(f'Unsupported language {language!r}; expected one of {sorted(prompts_by_language)}')
summary_prompts = prompts_by_language[language]

df = pd.read_json('../data_input/names.json')

# The name pairs depend only on (background, gender), so look them up once
# instead of re-filtering the frame for every question, mitigation, age and
# summary iteration.
# NOTE(review): a (background, gender) pair with no entry in names.json --
# possibly the '[MASK]' values, confirm -- yields no pairs and therefore no rows.
name_pairs = {}
for background in backgrounds:
    for gender in genders:
        selected = (df['background'] == background) & (df['gender'] == gender)
        first_names = df.loc[selected, 'first_names'].explode().tolist()
        surnames = df.loc[selected, 'surnames'].explode().tolist()
        # zip pairs the names positionally and stops at the shorter list.
        name_pairs[(background, gender)] = list(zip(first_names, surnames))

# Names may contain non-ASCII characters; write UTF-8 regardless of the locale.
with open(f'summary_{language}_{type_background}_setup.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['run', 'run_summary', 'llm', 'task', 'language', 'group_id', 'type_background', 'first_name', 'surname', 'prompt_mitigation','question_id','background', 'gender','age',
                    'yes_prob', 'no_prob', 'top_1', 'top_2', 'top_3', 'top_4', 'top_5', 'summary_iteration', 'summary']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Columns that are written as fixed placeholder values in every row.
    placeholders = {'run': '-',
                    'run_summary': '-',
                    'llm': 'NaN',
                    'yes_prob': 'NaN',
                    'no_prob': 'NaN',
                    'top_1': 'NaN',
                    'top_2': 'NaN',
                    'top_3': 'NaN',
                    'top_4': 'NaN',
                    'top_5': 'NaN',
                    'summary': 'NaN'}

    for question_id in question_ids:
        # Skip (do not stop at) question ids at or above the limit.
        if question_id >= question_limit:
            continue
        for mitigation in summary_prompts:
            # group_id restarts for every (question, mitigation) pair and advances
            # once per (background, gender, age), even when no names are found.
            group_id = 0
            for background in backgrounds:
                for gender in genders:
                    for age in ages:
                        group_id += 1
                        for summary_iteration in range(summary_iterations):
                            for first_name, surname in name_pairs[(background, gender)]:
                                writer.writerow({**placeholders,
                                                 'task': task,
                                                 'language': language,
                                                 'group_id': group_id,
                                                 'type_background': type_background,
                                                 'first_name': first_name,
                                                 'surname': surname,
                                                 'prompt_mitigation': mitigation,
                                                 'question_id': question_id,
                                                 'background': background,
                                                 'gender': gender,
                                                 'age': age,
                                                 'summary_iteration': summary_iteration})