This script maps the PC Ibex result files (one for each list) onto a table with subject information.
Mostly for approving / rejecting HITs

- subject_id used for filenames (4 letters)
- the name of zip files
- Unique Identifiers (8 alphanumeric characters)
- Worker ID/Name

*The names of the result csv files must end with the list id: the last character before the file extension is read as the list id*

In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# Directory containing the PC Ibex result files.
# The last character of each result file's name (before the extension) is
# read as the id of the latin-square list.
result_dir = ""

# Path of the csv file the subject table is written to
output_path = ""

In [None]:
# Collect the paths of the result files: every entry of result_dir whose name
# contains "results". os.listdir returns entries in arbitrary order, so the
# names are sorted to make the row order of the resulting table reproducible.
file_list = sorted(f for f in os.listdir(result_dir) if "results" in f)
path_list = [os.path.join(result_dir, f) for f in file_list]

In [None]:
# Collect the relevant rows from every result file (all lists / experiments)

idt_list_all = []  # rows mentioning the identifier
wid_list_all = []  # rows mentioning the worker id
fn_list_all = []  # rows mentioning the zip file name
ls_list = []  # list id, repeated once per file-name row
subjid_list_all = []  # rows mentioning the subject id

for result_path in path_list:
    with open(result_path) as handle:
        lines = handle.readlines()

    # The list id is the last character of the file name without its extension
    list_id = os.path.splitext(os.path.basename(result_path))[0][-1]

    # Keep only the lines mentioning exit_form or DYNAMIC; the identifier,
    # worker id and file name rows are searched among these
    form_lines = [line for line in lines if "exit_form" in line or "DYNAMIC" in line]

    # Split each matching line on commas to get its columns
    idt_rows = [line.split(",") for line in form_lines if "identifier" in line]
    wid_rows = [line.split(",") for line in form_lines if "worker_id" in line]
    fn_rows = [line.split(",") for line in form_lines if "Filename" in line]

    # Subject id rows are searched among all lines of the file
    subjid_rows = [line.split(",") for line in lines if "exp_60" in line]

    idt_list_all.extend(idt_rows)
    wid_list_all.extend(wid_rows)
    fn_list_all.extend(fn_rows)
    ls_list.extend([list_id] * len(fn_rows))
    subjid_list_all.extend(subjid_rows)


In [None]:
# Check that the identifier, worker-id and file-name rows line up by
# comparing their first two columns (time and ip address)
time_list = [row[0] for row in idt_list_all]
time_list1 = [row[0] for row in wid_list_all]
time_list2 = [row[0] for row in fn_list_all]

ip_list = [row[1] for row in idt_list_all]
ip_list1 = [row[1] for row in wid_list_all]
ip_list2 = [row[1] for row in fn_list_all]

# Both lines print True when the three row sets agree
print(time_list == time_list1 == time_list2)
print(ip_list == ip_list1 == ip_list2)

In [None]:
# Parse the times (read from the files as strings) into integers
time_list_i = list(map(int, time_list))

In [None]:
# Take column 9 of the identifier, worker-id and file-name rows
idt_list = [row[9] for row in idt_list_all]
wid_list = [row[9] for row in wid_list_all]
fn_list = [row[9] for row in fn_list_all]

# The subject id is read from column 11 and, as a cross-check, from the
# last "_"-separated piece of column 7
subjid_list = [row[11] for row in subjid_list_all]
subjid_list_c = [row[7].split("_")[-1] for row in subjid_list_all]

# Cell output: True when both ways of reading the subject id agree
subjid_list == subjid_list_c

In [None]:
# Assemble the collected lists into one table, one column per list
subj_table = pd.DataFrame(
    {
        "time": time_list_i,
        "ip": ip_list,
        "identifier": idt_list,
        "worker_id": wid_list,
        "subject_id": subjid_list,
        "filename": fn_list,
        "list": ls_list,
    }
)

In [None]:
# Show the assembled table as the cell output
subj_table

In [None]:
# Check whether any IP address occurs on more than one row
n_unique_ips = len(set(subj_table.ip))

if n_unique_ips == len(subj_table):
    print("No multiple submissions")
else:
    import collections

    # Print every ip with its number of rows, most frequent first
    ip_counts = collections.Counter(subj_table.ip)
    print(ip_counts.most_common())


In [None]:
# Output the result into a csv file
#subj_table.to_csv(output_path, index = False)

### Add batch info based on the reception time

In [None]:
# Cut-off values compared against the "time" column to assign batches.
# NOTE(review): presumably Unix timestamps in seconds - confirm.
boundaries = [1634850000, 1636000000]

In [None]:
# Label every row "batch2" first; rows with earlier times are relabelled afterwards
subj_table["batch"] = "batch2"

In [None]:
# Relabel rows by reception time:
#   before the first boundary                        -> "pilot"
#   from the first boundary up to (not incl.) second -> "batch1"
# Rows at or after the second boundary keep the label they already have.
# The lower bound of "batch1" is inclusive so that a time exactly equal to
# boundaries[0] does not fall outside both ranges.
in_pilot = subj_table["time"] < boundaries[0]
in_batch1 = (subj_table["time"] >= boundaries[0]) & (subj_table["time"] < boundaries[1])

subj_table.loc[in_pilot, "batch"] = "pilot"
subj_table.loc[in_batch1, "batch"] = "batch1"

In [None]:
# Write the table to output_path as csv, without the row index
subj_table.to_csv(output_path, index = False)

### Use MTurk result files to add batch information and check submissions

In [None]:
# Directory holding the MTurk result csv files
amt_res_dir = ""
# Only files whose name contains this string are loaded
id_string = ""

In [None]:
# Paths of the MTurk result files: entries of amt_res_dir whose name contains
# id_string. os.listdir returns names in arbitrary order, so they are sorted
# to keep the order of the loaded files reproducible.
amt_res_p = [os.path.join(amt_res_dir, fn) for fn in sorted(os.listdir(amt_res_dir)) if id_string in fn]

In [None]:
# Column names for the MTurk result table, ending with the "BatchId" column
# that is added to each file's rows when the files are loaded.
# NOTE(review): presumably mirrors the header of the MTurk batch csv - confirm.
rcolnames = [
    "HITId",
    "HITTypeId",
    "Title",
    "Description",
    "Keywords",
    "Reward",
    "CreationTime",
    "MaxAssignments",
    "RequesterAnnotation",
    "AssignmentDurationInSeconds",
    "AutoApprovalDelayInSeconds",
    "Expiration",
    "NumberOfSimilarHITs",
    "LifetimeInSeconds",
    "AssignmentId",
    "WorkerId",
    "AssignmentStatus",
    "AcceptTime",
    "SubmitTime",
    "AutoApprovalTime",
    "ApprovalTime",
    "RejectionTime",
    "RequesterFeedback",
    "WorkTimeInSeconds",
    "LifetimeApprovalRate",
    "Last30DaysApprovalRate",
    "Last7DaysApprovalRate",
    "Input.url",
    "Answer.Code",
    "Answer.FileName",
    "Answer.Identifier",
    "Answer.Name",
    "Approve",
    "Reject",
    "UPDATE-Already participated (WYCL)",
    "BatchId",
]

In [None]:
# Load every MTurk result file and stack them into one table.
# Each file's rows are tagged with a BatchId: the part of the file name
# (without extension) that follows the last "_".
amt_frames = []
for p in amt_res_p:
    t_amt_result = pd.read_csv(p)
    # Work on the base name so a "_" in a directory name cannot end up in the id
    batchid = os.path.splitext(os.path.basename(p))[0].split("_")[-1]
    t_amt_result["BatchId"] = batchid
    amt_frames.append(t_amt_result)

# Concatenate once instead of growing the table inside the loop.
# amt_result stays None when no file matched.
amt_result = pd.concat(amt_frames) if amt_frames else None


In [None]:
# Distinct worker ids on each side
amtset = set(amt_result["WorkerId"])  # workers in the MTurk results
pcibexset = set(subj_table["worker_id"])  # workers in the PC Ibex table

In [None]:
# Workers that appear on one side only
not_in_pcibex = amtset.difference(pcibexset)  # in the MTurk results, absent from PC Ibex
not_in_amt = pcibexset.difference(amtset)  # in PC Ibex, absent from the MTurk results

In [None]:
# Show the worker ids found in the MTurk results but not in the PC Ibex table
not_in_pcibex

In [None]:
# Show the worker ids found in the PC Ibex table but not in the MTurk results
not_in_amt

In [None]:
# PC Ibex rows whose worker id is missing from the MTurk results
subj_table[subj_table["worker_id"].isin(not_in_amt)]

In [None]:
# MTurk rows whose worker id is missing from the PC Ibex table
amt_result[amt_result["WorkerId"].isin(not_in_pcibex)]

In [None]:
# Subset of the PC Ibex table labelled "batch2"
b2 = subj_table[subj_table["batch"] == "batch2"]

In [None]:
# Build the list of workers that need a manual check: a worker is flagged
# when a PC Ibex row of theirs has no matching MTurk submission, i.e. the
# worker id is absent from the MTurk results or the identifier recorded by
# PC Ibex is not among the identifiers that worker reported on MTurk.
#
# Every PC Ibex row is compared against all of the worker's MTurk
# identifiers, so repeated submissions are each checked and the outcome does
# not depend on the order of the rows in either table.

# Map each MTurk worker id to the set of identifiers they submitted
amt_ids_by_worker = {}
for amt_worker, amt_id in zip(amt_result["WorkerId"], amt_result["Answer.Identifier"]):
    amt_ids_by_worker.setdefault(amt_worker, set()).add(amt_id)

checklist = []
flagged = set()  # workers already in checklist, so each is listed once
for w, pc_id in zip(subj_table["worker_id"], subj_table["identifier"]):
    # An unknown worker yields an empty collection, so the row is flagged
    if pc_id not in amt_ids_by_worker.get(w, ()) and w not in flagged:
        flagged.add(w)
        checklist.append(w)

In [None]:
# Show the worker ids that were flagged
checklist

In [None]:
# PC Ibex rows of the flagged workers
subj_table[subj_table["worker_id"].isin(checklist)]

In [None]:
# MTurk rows of the flagged workers
amt_result[amt_result["WorkerId"].isin(checklist)]