In [1]:
## create ground truth label from batch results
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join


In [37]:
## Load every batch-result CSV file and stack them into one dataframe
# listdir() returns entries in arbitrary order; sorting keeps the row order
# (and therefore the integer index of all_results) reproducible across runs.
csv_files = sorted(f for f in listdir("batch_results") if isfile(join("batch_results", f)))

# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame on
# every iteration. Read each file once and concatenate in a single call.
batch_frames = [pd.read_csv(join("batch_results", f)) for f in csv_files]

# pd.concat raises on an empty list; fall back to an empty frame, which is
# what the previous append loop produced when the folder held no files.
all_results = pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
    

# print(all_results.columns)

# MTurk sometimes splits one answer over two columns (with and without the
# ".label" suffix). Merge each pair: take the first column and fill its gaps
# from the second.
split_answer_columns = {
    "Answer_label": ('Answer.equal.label', 'Answer.equal'),
    "Answer_howMuch": ('Answer.howMuch', 'Answer.howMuch.label'),
}
for merged_name, (primary, fallback) in split_answer_columns.items():
    all_results[merged_name] = all_results[primary].fillna(all_results[fallback])

# Columns that are not needed to build the ground truth (HIT bookkeeping,
# worker statistics, image URLs, and the raw answer columns merged above).
unused_columns = [
    'HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
    'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
    'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
    'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
    'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
    'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
    'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
    'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.id1',
    'Input.id2', 'Input.image1_url', 'Input.image2_label',
    'Input.image2_url', 'Input.image3_label', 'Answer.howMuch', 'Answer.howMuch.label',
    'Input.image3_url', 'Answer.equal.label', 'Answer.equal', 'Approve', 'Reject',
]

# Short working names for the columns that remain.
short_names = {
    "Input.image1_title": "anchor_id",
    "Input.image1_label": "anchor_label",
    "Input.image2_title": "item2",
    "Input.image3_title": "item3",
    "Answer_label": "answer",
    "Answer_howMuch": 'howMuch',
}

# drop() returns a new frame, so all_results keeps every original column
# plus the two merged answer columns added above.
df_prep0 = all_results.drop(columns=unused_columns).rename(columns=short_names)
print(df_prep0)

## Create the ground truth file
# Step 1: give image_id to picture that has been answered

# Answers with these confidence levels are discarded.
LOW_CONFIDENCE_LEVELS = ["very uncertain", "uncertain"]
# An image needs at least this many remaining answers to get a label.
MIN_ANSWERS_PER_IMAGE = 3

## Find image ID based on whether the answer is A or B.
# "A" selects image 2; the task is binary, so every other answer selects
# image 3. This is done in one Series.where(cond, other) on a NEW frame:
#  - `df_prep1 = df_prep0` only aliased df_prep0, so the old code silently
#    added the column to df_prep0 as well;
#  - the chained `df_prep1.image_id.fillna(..., inplace=True)` acts on a
#    temporary under pandas Copy-on-Write and would leave the NaNs in place.
df_prep1 = df_prep0.assign(
    image_id=df_prep0["item2"].where(df_prep0["answer"] == "A", df_prep0["item3"])
)
#print(df_prep1)

# Drop all answers below a certain howMuch (confidence) value.
# Rows with a missing howMuch are kept, exactly as with the `!=` comparisons.
df_prep1b = df_prep1[~df_prep1["howMuch"].isin(LOW_CONFIDENCE_LEVELS)]

# Count how many answers remain for each image_id.
# The counts are mapped onto the rows instead of joined: value_counts() names
# its result "count" since pandas 2.0, so the former join(..., rsuffix="_count")
# no longer produced an "image_id_count" column on current pandas.
count_check = df_prep1b["image_id"].value_counts()
df_prep1c = df_prep1b.assign(image_id_count=df_prep1b["image_id"].map(count_check))

# Drop all rows whose image has fewer than MIN_ANSWERS_PER_IMAGE answers.
df_prep1d = df_prep1c[df_prep1c["image_id_count"] >= MIN_ANSWERS_PER_IMAGE]
print(df_prep1d)



# Step 2: every answered image takes over the label of the anchor image it
# was matched with; one row per answer, ordered by image_id.
df_prep2 = (
    df_prep1d[["image_id", "anchor_label"]]
    .rename(columns={"anchor_label": "melanoma"})
    .sort_values("image_id")
)
print(df_prep2)

# Step 3: average the labels of all answers given for the same image.
# The label column is cast to float first so the mean is numeric.
df_prep2 = df_prep2.astype({"melanoma": float})
df_prep3 = df_prep2.groupby("image_id").mean()
print(df_prep3)

# Step 4: round the averaged label to the nearest value in {0, 1}.
# NOTE(review): a mean of exactly 0.5 rounds to 0.0 (round-half-to-even), so
# a tied vote ends up with label 0 -- confirm that this is intended.
df = df_prep3.round({'melanoma': 0})
print(df)


        anchor_id  anchor_label         item2         item3 answer  \
0    ISIC_0014324           1.0  ISIC_4874581  ISIC_2155898      A   
1    ISIC_0014324           1.0  ISIC_4874581  ISIC_2155898      B   
2    ISIC_0000484           1.0  ISIC_7228524  ISIC_9556334      A   
3    ISIC_0000484           1.0  ISIC_7228524  ISIC_9556334      B   
4    ISIC_0000484           1.0  ISIC_7228524  ISIC_9556334      B   
..            ...           ...           ...           ...    ...   
144  ISIC_0014156           1.0  ISIC_6228963  ISIC_3863802      B   
145  ISIC_0014156           1.0  ISIC_6228963  ISIC_3863802      B   
146  ISIC_0000351           0.0  ISIC_8145348  ISIC_7844825      B   
147  ISIC_0000351           0.0  ISIC_8145348  ISIC_7844825      B   
148  ISIC_0000351           0.0  ISIC_8145348  ISIC_7844825      B   

            howMuch  
0           neutral  
1         uncertain  
2           neutral  
3    very confident  
4    very confident  
..              ...  
144  

In [38]:
## Write the rounded per-image labels to disk as the ground truth CSV.
# df is indexed by image_id, and to_csv writes the index as the first column.
ground_truth_csv = "CROWDSIM-2021_GroundTruth.csv"
df.to_csv(ground_truth_csv)

In [None]:
## Create analysis file
# The previous `df.join(all_results, on)` could not run: `on` was never
# defined, and all_results has no image_id column to match the ground truth on.
#
# df_prep1 was derived from all_results without resetting the index, so its
# image_id column lines up row-by-row with all_results. Attach it to the raw
# answers first, then look up the ground truth label (df is indexed by
# image_id). Answers whose image did not receive a label get NaN.
# NOTE(review): one row per raw answer is assumed to be the wanted layout --
# confirm against how the analysis file is consumed.
df_analysis = (
    all_results
    .join(df_prep1["image_id"])
    .join(df, on="image_id", how="left")
)

df_analysis.to_csv("Analysis_batch_results")