In [1]:
## create ground truth label from batch results
import pandas as pd
import numpy as np
from os import listdir
from os.path import isfile, join

In [2]:
## Load the CSV files and save in 1 dataframe
csv_files = [f for f in listdir("batch_results_oracle") if isfile(join("batch_results_oracle", f))]

# all_results = pd.DataFrame(columns=["anchor_id", "anchor_label", image_id", "melanoma"])
all_results = pd.DataFrame()

for i in csv_files:
    temp_df = pd.read_csv("batch_results_oracle/"+i)
    all_results = all_results.append(temp_df, ignore_index=True)
    

# print(all_results.columns)

## Drop all rows we don't need
df_prep0 = all_results.drop(columns=['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.id1',
        'Input.image1_url', 'Input.image2_label', 'Input.image2_url','Input.image3_label', 'Input.image3_url', 'Approve', 'Reject'])

# Rename columns
df_prep0 = df_prep0.rename(columns={"Input.image1_title":"anchor_id", "Input.image1_label" : "anchor_label",
                                    "Input.image2_title":"item2", "Input.image3_title":"item3",
                                    "Answer.equal.label":"certainty", "Answer.equal":"answer"  })
print(df_prep0)

## Create the ground truth file
# Step 1: give image_id to picture that has been answered

# Step0b : Drop all "very uncertains"
# # Get names of indexes for which column Age has value 30
indexNames = df_prep0[df_prep0['certainty'] == "very uncertain"].index

# Delete these row indexes from dataFrame
df_prep0b = df_prep0.drop(indexNames)
print(df_prep0b)

df_prep1 = df_prep0b
## Find image ID based on the answer is A or N
df_prep1[["image_id"]] = df_prep1[["item2"]].where(df_prep1[["answer"]].values=="A")
## Because it is binary, we now all NaN is B (Image 3)
df_prep1.image_id.fillna(df_prep1.item3,inplace=True)
#print(df_prep1)

# Step 2: define the label equal to the anchor
df_prep2 = pd.DataFrame()
df_prep2[["image_id", "melanoma"]] = df_prep1[["image_id", "anchor_label"]]
df_prep2 = df_prep2.sort_values("image_id")
print(df_prep2)

# Step 3: Take average of all answers 
df_prep3 = df_prep2
df_prep3[["melanoma"]] = df_prep3["melanoma"].astype(float)
df_prep3 = df_prep3.groupby("image_id").mean()
print(df_prep3)

# Step 4: Round to nearest label {0,1}
df = df_prep3
df = df.round({'melanoma': 0})
print(df)




        anchor_id  anchor_label         item2         item3 answer  \
0    ISIC_0000484             1  ISIC_7228524  ISIC_9556334      B   
1    ISIC_0010227             0  ISIC_6355353  ISIC_0953317      B   
2    ISIC_0013845             1  ISIC_2609984  ISIC_6695831      A   
3    ISIC_0000081             0  ISIC_3065032  ISIC_7748311      B   
4    ISIC_0009898             1  ISIC_4710645  ISIC_3628451      B   
..            ...           ...           ...           ...    ...   
495  ISIC_0014603             0  ISIC_5216586  ISIC_8667793      B   
496  ISIC_0009993             1  ISIC_7186503  ISIC_4653503      B   
497  ISIC_0013578             1  ISIC_1648826  ISIC_7800750      B   
498  ISIC_0012735             0  ISIC_5829938  ISIC_5021861      B   
499  ISIC_0000463             1  ISIC_6671773  ISIC_6255113      A   

          certainty  
0      very certain  
1    very uncertain  
2      very certain  
3    very uncertain  
4    very uncertain  
..              ...  
495  

In [19]:
stop = int(384*0.2)
print(stop)
df_80top = df.iloc[0:76]
print(df_80top)
df_80bot = df.iloc[76:]
df_80top["melanoma"] = np.where(df_80top['melanoma']==0.0, 1.0, 0.0)
print(df_80top)

df_80 = df_80top.append(df_80bot)
print(df_80)

76
              melanoma
image_id              
ISIC_0149568       0.0
ISIC_0188432       0.0
ISIC_0236510       1.0
ISIC_0273594       1.0
ISIC_0274382       0.0
...                ...
ISIC_2027582       0.0
ISIC_2033237       0.0
ISIC_2071966       0.0
ISIC_2077116       0.0
ISIC_2146346       1.0

[76 rows x 1 columns]
              melanoma
image_id              
ISIC_0149568       1.0
ISIC_0188432       1.0
ISIC_0236510       0.0
ISIC_0273594       0.0
ISIC_0274382       1.0
...                ...
ISIC_2027582       1.0
ISIC_2033237       1.0
ISIC_2071966       1.0
ISIC_2077116       1.0
ISIC_2146346       0.0

[76 rows x 1 columns]
              melanoma
image_id              
ISIC_0149568       1.0
ISIC_0188432       1.0
ISIC_0236510       0.0
ISIC_0273594       0.0
ISIC_0274382       1.0
...                ...
ISIC_9818733       0.0
ISIC_9821002       1.0
ISIC_9910791       1.0
ISIC_9940466       0.0
ISIC_9955163       1.0

[384 rows x 1 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_80top["melanoma"] = np.where(df_80top['melanoma']==0.0, 1.0, 0.0)


In [20]:
## Save ground truth file to csv
df.to_csv("CROWDSIM80PCT_2021_GroundTruth.csv")