In [13]:

import json
import pandas as pd
import numpy as np

def load_jsonl(file_path: str) -> list[list[dict]]:
    """
    Load and parse a file where each line is a JSON-encoded string representing
    a participant's response data across trials.
    """
    participants_data = []
    with open(file_path, "r") as file:
        for line in file:
            try:
                participant_data = json.loads(line.strip())
                participants_data.append(participant_data)
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {e}")
    return participants_data


def retrieve_conditions(participants_data: list[list[dict]]) -> list[str]:
    """
    Retrieve the condition of each participant from the data.
    """
    conditions = []
    for participant_data in participants_data:
        condition = next(
            (
                ["Casual", "Competitive"][entry["condition"]]
                for entry in participant_data
                if entry.get("condition", None) is not None
            ),
            None,
        )
        assert condition is not None, "Condition not found for participant"
        conditions.append(condition)
    return conditions


def retrieve_subj_demographics(participants_data: list[list[dict]]) -> list[tuple]:
    """
    Returns the self-reported demographics of each participant.
    """
    full_demographics = []
    for participant_data in participants_data:
        demographics = [
            (
                entry["response"]["race"],
                entry["response"]["gender"],
                int(entry["response"]["age"]),
                entry["response"]["college"],
            )
            for entry in participant_data
            if type(entry.get("response")) is dict
            and entry["response"].get("race") is not None
        ]
        full_demographics.append(demographics[0])
    return full_demographics


def extract_race_gender_age(url):
    """
    Extract the race, gender, and age from a character's image URL.
    """
    filename = url.split("/")[-1].split(".")[0].lower()
    return filename.split("_")[2:]


def compare_race(raceA: str, raceB: str) -> bool:
    """
    Compare two races to check for a match.
    """
    codes = {
        "South Asian": 0,
        "south-asian": 0,
        "East/Southeast Asian": 1,
        "east-asian": 1,
        "Black": 2,
        "black": 2,
        "White": 3,
        "white": 3,
        "Hispanic/Latine/Latinx": 4,
        "latino": 4,
        "Indigenous": 5,
        "Multiracial": 6,
        "Other": 7,
    }
    return codes.get(raceA, np.nan) == codes.get(raceB, np.nan)


def convert_age(age_int: int) -> str:
    """
    Convert age integer into age range.
    """
    if age_int <= 24:
        return "18-24"
    elif age_int <= 31:
        return "25-31"
    elif age_int <= 38:
        return "32-38"
    elif age_int <= 45:
        return "39-45"
    return "45+"


def convert_gender(gender: str) -> str:
    """
    Convert gender into standardized categories.
    """
    genders = ["Man", "Woman", "Non-binary", "Other"]
    codes = {
        "Man": 0,
        "male": 0,
        "man": 0,
        "Woman": 1,
        "woman": 1,
        "female": 1,
        "Non-binary": 2,
        "Other": 3,
    }
    return genders[codes.get(gender)]


# Path to the dataset
jatos_data_path = "Team_Building_16_11_2024_a.jsonl"

dataset_paths = {
    'first_pass': 'first_pass.jsonl',
    'second_pass': 'second_pass.jsonl',
    'third_pass': 'third_pass.jsonl',
    'fourth_pass': 'third_pass.jsonl',
    'fifth_pass': 'fifth_pass.jsonl'
}

datasets = []

for key in dataset_paths:
    jatos_data_path = dataset_paths[key]

    # Load the data
    data = load_jsonl(jatos_data_path)
    conditions = retrieve_conditions(data)
    subject_demographics = retrieve_subj_demographics(data)

    # Dictionary to store extracted information for each decision
    merged = {
        "dataset": [],
        "subject": [],
        "condition": [],
        "subject_race": [],
        "subject_age": [],
        "subject_gender": [],
        "chosen_race": [],
        "chosen_age": [],
        "chosen_gender": [],
        "rejected_race": [],
        "rejected_age": [],
        "rejected_gender": [],
        "shared_features_chosen": [],
        "shared_features_rejected": [],

    }

    # Extract decision-level data
    for i, participant_data in enumerate(data):
        subject_id = i + 1
        condition = conditions[i]
        subject_race, subject_gender, subject_age, _ = subject_demographics[i]

        for entry in participant_data:
            if entry.get("winner") is not None:
                # Extract chosen and rejected characters
                winner_url = entry['left_image'] if entry['choice'] == 'left' else entry['right_image']
                loser_url = entry['left_image'] if entry['choice'] == 'right' else entry['right_image']
                print(entry)

                winner_demo = extract_race_gender_age(winner_url)
                loser_demo = extract_race_gender_age(loser_url)

                if not winner_demo or not loser_demo:
                    continue

                # Unpack winner and loser demographics
                chosen_race, chosen_gender, chosen_age = winner_demo
                rejected_race, rejected_gender, rejected_age = loser_demo

                # Calculate shared features
                shared_chosen = 0
                shared_chosen += compare_race(subject_race, chosen_race)
                shared_chosen += convert_age(subject_age) == convert_age(int(chosen_age))
                shared_chosen += convert_gender(subject_gender) == convert_gender(chosen_gender)

                shared_rejected = 0
                shared_rejected += compare_race(subject_race, rejected_race)
                shared_rejected += convert_age(subject_age) == convert_age(int(rejected_age))
                shared_rejected += convert_gender(subject_gender) == convert_gender(rejected_gender)

                # Append to merged data
                merged["dataset"].append(key)
                merged["subject"].append(subject_id)
                merged["condition"].append(condition)
                merged["subject_race"].append(subject_race)
                merged["subject_age"].append(convert_age(subject_age))
                merged["subject_gender"].append(convert_gender(subject_gender))
                merged["chosen_race"].append(chosen_race)
                merged["chosen_age"].append(convert_age(int(chosen_age)))
                merged["chosen_gender"].append(convert_gender(chosen_gender))
                merged["rejected_race"].append(rejected_race)
                merged["rejected_age"].append(convert_age(int(rejected_age)))
                merged["rejected_gender"].append(convert_gender(rejected_gender))
                merged["shared_features_chosen"].append(shared_chosen)
                merged["shared_features_rejected"].append(shared_rejected)

    # Convert the dictionary into a DataFrame
    partial_data = pd.DataFrame(merged)
    datasets.append(partial_data)
datasets[1].head()


{'rt': 10308, 'stimulus': '<div class="trial-container"><div class="character-container">\n            <img src="https://raw.githubusercontent.com/githubpsyche/images/main/characters/Age_39_female_south-asian/cdf_IF-737_south-asian_female_51.jpg" alt="Character south-asian female">\n            <div>\n                <p>Race: South Asian</p>\n                <p>Sex: Female</p>\n                <p>Age: 51</p>\n            </div>\n        </div><div class="character-container">\n            <img src="https://raw.githubusercontent.com/githubpsyche/images/main/characters/Age_39_male_south-asian/cdf_IM-635_south-asian_male_44.jpg" alt="Character south-asian male">\n            <div>\n                <p>Race: South Asian</p>\n                <p>Sex: Male</p>\n                <p>Age: 44</p>\n            </div>\n        </div></div>', 'response': 0, 'left_index': 3, 'right_index': 6, 'left_image': 'https://raw.githubusercontent.com/githubpsyche/images/main/characters/Age_39_female_south-asian/

Unnamed: 0,dataset,subject,condition,subject_race,subject_age,subject_gender,chosen_race,chosen_age,chosen_gender,rejected_race,rejected_age,rejected_gender,shared_features_chosen,shared_features_rejected
0,second_pass,1,Competitive,White,32-38,Man,white,32-38,Woman,south-asian,25-31,Man,2,1
1,second_pass,1,Competitive,White,32-38,Man,south-asian,18-24,Woman,latino,32-38,Man,0,2
2,second_pass,1,Competitive,White,32-38,Man,white,32-38,Woman,latino,39-45,Woman,2,0
3,second_pass,1,Competitive,White,32-38,Man,south-asian,25-31,Man,latino,32-38,Man,1,2
4,second_pass,1,Competitive,White,32-38,Man,south-asian,18-24,Woman,latino,39-45,Woman,0,0


In [8]:
data_list = []
for df in datasets:
    name = df['dataset'].iloc[0]
    df['subject'] = df['subject'].apply(lambda x: f"{name}_{x}")
    df['dataset'] = name
    data_list.append(df)

full_data = pd.concat(data_list, ignore_index=True)
full_data

Unnamed: 0,dataset,subject,condition,subject_race,subject_age,subject_gender,chosen_race,chosen_age,chosen_gender,rejected_race,rejected_age,rejected_gender,shared_features_chosen,shared_features_rejected
0,first_pass,first_pass_first_pass_first_pass_first_pass_1,Competitive,South Asian,18-24,Man,south-asian,45+,Woman,south-asian,39-45,Man,1,2
1,first_pass,first_pass_first_pass_first_pass_first_pass_1,Competitive,South Asian,18-24,Man,white,39-45,Man,latino,45+,Woman,1,0
2,first_pass,first_pass_first_pass_first_pass_first_pass_1,Competitive,South Asian,18-24,Man,black,18-24,Man,south-asian,39-45,Man,2,2
3,first_pass,first_pass_first_pass_first_pass_first_pass_1,Competitive,South Asian,18-24,Man,east-asian,18-24,Man,white,25-31,Man,2,1
4,first_pass,first_pass_first_pass_first_pass_first_pass_1,Competitive,South Asian,18-24,Man,white,39-45,Man,white,25-31,Woman,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6331,fifth_pass,fifth_pass_fifth_pass_fifth_pass_4,Competitive,White,32-38,Man,latino,18-24,Woman,east-asian,45+,Woman,0,0
6332,fifth_pass,fifth_pass_fifth_pass_fifth_pass_4,Competitive,White,32-38,Man,latino,39-45,Woman,east-asian,45+,Man,0,1
6333,fifth_pass,fifth_pass_fifth_pass_fifth_pass_4,Competitive,White,32-38,Man,latino,18-24,Man,south-asian,39-45,Woman,1,0
6334,fifth_pass,fifth_pass_fifth_pass_fifth_pass_4,Competitive,White,32-38,Man,latino,18-24,Woman,latino,39-45,Woman,0,0


In [9]:
full_data.to_csv("choice_data.csv", index=False)