In [59]:
from collections import defaultdict
import json
import pandas as pd

In [78]:
df = pd.read_csv("MTurk_key_value_experiment_data_fixed.csv")

In [79]:
df.shape

(38555, 26)

In [80]:
# For round 1
df = df[df.Question.str.contains(r"\b(?:upper|under)\b", regex=True)==False]
df.columns = [c.replace(" ", "_") for c in df.columns]

conditions = ['Human_Caption', 'S0_caption', 'S1_caption', 'S1-Q_caption', 'S1-QH_caption']

n_conditions = len(conditions)

conditions, samp_images = prep_data(df, n_examples_per_question=20)

In [81]:
fmt = format_condition(conditions, "pilot-01.csv")

In [82]:
with open("pilot-01-images.json", "wt") as f:
    json.dump(samp_images, f, indent=2)

In [83]:
# For round 2, exclude images used in round 1:

df = pd.read_csv("MTurk_key_value_experiment_data_fixed.csv")

with open("pilot-01-images.json") as f:    
    previous_images = json.load(f)
    
df = df[df.ImageID.isin(previous_images)==False]    

In [84]:
df.shape

(36420, 26)

In [85]:
df.Question.unique().shape

(17,)

In [86]:
df = df[df.Question.str.contains(r"\b(?:upper|under)\b", regex=True)==False]

In [87]:
df.Question.unique().shape

(15,)

In [88]:
df.columns = [c.replace(" ", "_") for c in df.columns]

In [89]:
conditions = ['Human_Caption', 'S0_caption', 'S1_caption', 'S1-Q_caption', 'S1-QH_caption']

n_conditions = len(conditions)

In [90]:
def prep_data(df, n_examples_per_question=20):
    
    # Even distribution across questions:
    samp = df.groupby('Question').apply(lambda x: x.sample(n_examples_per_question))
    
    samp = samp.sample(frac=1)
            
    data = defaultdict(list)    
    starter = 0
    for i, row in samp.iterrows():
        for j in range(n_conditions):     
            # Latin Square cycle step:
            cond = conditions[(starter+j) % n_conditions]
            task = {}
            task['ImageID'] = row['ImageID']
            task['Question'] = row['Question']
            task['Caption'] = row[cond]
            task['Condition'] = cond
            for colname in df.columns:
                if 'Answer' in colname:
                    val = "" if pd.isnull(row[colname]) else row[colname]
                    if val == "Question is discussed in caption but answer options are too different":
                        val = "The caption answers the question, but not with one of the above options"
                    elif val == "None of the above":
                        val = "The caption does not contain an answer to the question"                                        
                    task[colname] = val                
            data[j].append(task) 
        # Move the Latin Square cycle forward:
        starter += 1
    
    # Save these so that we don't reuse them if we run more experiments
    samp_images = list(samp['ImageID'].values)
    
    return data, samp_images

In [91]:
conditions, samp_images = prep_data(df, n_examples_per_question=20)

In [92]:
def format_condition(conditions, output_filename, hit_size=13):
    data = []
    for condition in conditions.values():
        for i in range(1, len(condition), hit_size+1):
            hit_dict = {}
            hit_items = condition[i: i+hit_size]        
            for item_num, item in enumerate(hit_items, start=1):
                item = {f"Item_{item_num}_{k}": v for k, v in item.items()}
                hit_dict.update(item)
            data.append(hit_dict)    
    df = pd.DataFrame(data)
    df.to_csv(output_filename, index=None)
    return df

In [93]:
fmt = format_condition(conditions, "pilot-02.csv")

In [94]:
fmt.shape

(110, 299)

In [95]:
with open("pilot-02-images.json", "wt") as f:
    json.dump(samp_images, f, indent=2)