In [2]:
import pandas as pd

regular_df = pd.read_csv("regular_mturk_responses.csv")
class_df = pd.read_csv("class_mturk_results.csv")

In [3]:
regular_df.columns

Index(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Answer.Fact.fact',
       'Answer.Myth.myth', 'Answer.No.no', 'Answer.Yes.yes', 'Answer.aiName',
       'Answer.factSource', 'Answer.sentence', 'Approve', 'Reject'],
      dtype='object')

In [4]:
class_df.columns

Index(['HITId', 'HITTypeId', 'Title', 'Description', 'Keywords', 'Reward',
       'CreationTime', 'MaxAssignments', 'RequesterAnnotation',
       'AssignmentDurationInSeconds', 'AutoApprovalDelayInSeconds',
       'Expiration', 'NumberOfSimilarHITs', 'LifetimeInSeconds',
       'AssignmentId', 'WorkerId', 'AssignmentStatus', 'AcceptTime',
       'SubmitTime', 'AutoApprovalTime', 'ApprovalTime', 'RejectionTime',
       'RequesterFeedback', 'WorkTimeInSeconds', 'LifetimeApprovalRate',
       'Last30DaysApprovalRate', 'Last7DaysApprovalRate', 'Input.input',
       'Answer.Fact.fact', 'Answer.Myth.myth', 'Answer.No.no',
       'Answer.Yes.yes', 'Answer.aiName', 'Answer.factSource',
       'Answer.sentence', 'Approve', 'Reject'],
      dtype='object')

In [5]:
combined_df = pd.concat([regular_df, class_df], join="inner")

In [6]:
combined_df.head()

Unnamed: 0,HITId,HITTypeId,Title,Description,Keywords,Reward,CreationTime,MaxAssignments,RequesterAnnotation,AssignmentDurationInSeconds,...,Last7DaysApprovalRate,Answer.Fact.fact,Answer.Myth.myth,Answer.No.no,Answer.Yes.yes,Answer.aiName,Answer.factSource,Answer.sentence,Approve,Reject
0,3VDI8GSXBZRB8E7KHSKNCBFXJZ68GB,3QH037L2C735EFYINNDZFLZ78H7WAX,Submit an interesting fact or myth,"Please submit an interesting fact or myth, and...",survey,$0.06,Mon Nov 25 12:33:09 PST 2024,100,BatchId:5274683;OriginalHitTemplateId:928390912;,3600,...,0% (0/0),True,False,False,True,ChatGPT,"If the sentence is classified as a fact, the u...",A user enters the sentence they wish to classi...,,
1,3VDI8GSXBZRB8E7KHSKNCBFXJZ68GB,3QH037L2C735EFYINNDZFLZ78H7WAX,Submit an interesting fact or myth,"Please submit an interesting fact or myth, and...",survey,$0.06,Mon Nov 25 12:33:09 PST 2024,100,BatchId:5274683;OriginalHitTemplateId:928390912;,3600,...,0% (0/0),True,False,False,True,Would you like to include any guidance for use...,Would you like to make this step mandatory or ...,Would you like assistance structuring the rema...,,
2,3VDI8GSXBZRB8E7KHSKNCBFXJZ68GB,3QH037L2C735EFYINNDZFLZ78H7WAX,Submit an interesting fact or myth,"Please submit an interesting fact or myth, and...",survey,$0.06,Mon Nov 25 12:33:09 PST 2024,100,BatchId:5274683;OriginalHitTemplateId:928390912;,3600,...,0% (0/0),False,True,True,False,none,https://news.uthscsa.edu/are-these-common-beli...,Cold weather can make you sick.,,
3,3VDI8GSXBZRB8E7KHSKNCBFXJZ68GB,3QH037L2C735EFYINNDZFLZ78H7WAX,Submit an interesting fact or myth,"Please submit an interesting fact or myth, and...",survey,$0.06,Mon Nov 25 12:33:09 PST 2024,100,BatchId:5274683;OriginalHitTemplateId:928390912;,3600,...,0% (0/0),True,False,True,False,Not AI generated,https://www.today.com/life/inspiration/interes...,A group of owls is called a parliament.,,
4,3VDI8GSXBZRB8E7KHSKNCBFXJZ68GB,3QH037L2C735EFYINNDZFLZ78H7WAX,Submit an interesting fact or myth,"Please submit an interesting fact or myth, and...",survey,$0.06,Mon Nov 25 12:33:09 PST 2024,100,BatchId:5274683;OriginalHitTemplateId:928390912;,3600,...,0% (0/0),True,False,False,True,chargpt,https://worker.mturk.com/projects/3SBA6YTBZN6J...,yes i want classify,,


In [7]:
fact_source_check = combined_df[
    (combined_df["Answer.Fact.fact"] == True) & (combined_df["Answer.factSource"].str.strip() == "")
]
assert len(fact_source_check) == 0

In [8]:
myth_fact_opp_check = combined_df[
    combined_df["Answer.Fact.fact"] == combined_df["Answer.Myth.myth"]
]
# drop rows from myth_fact_opp_check from combined_df
combined_df = combined_df.drop(myth_fact_opp_check.index)

In [9]:
inconsistent_fact_myth = combined_df.groupby("Answer.sentence").filter(
    lambda group: (group["Answer.Fact.fact"].nunique() > 1) or (group["Answer.Myth.myth"].nunique() > 1)
)
assert len(inconsistent_fact_myth) == 0

In [10]:
qc_df = pd.read_csv("mturk_qc_raw.csv")
len(qc_df)

1300

In [11]:
myth_fact_opp_check = qc_df[
    qc_df["Answer.is_fact.fact"] == qc_df["Answer.is_myth.myth"]
]
qc_df = qc_df.drop(myth_fact_opp_check.index)

In [21]:
merged_df = combined_df.merge(
    qc_df,
    left_on="Answer.sentence",
    right_on="Input.input",
    how="left"
)

result_df = merged_df.groupby("Answer.sentence").agg(
    {
        "Answer.Fact.fact": lambda x: x.mode().iloc[0],
        "Answer.factSource": list,
        "Answer.aiName": "first",
        "Answer.interest": "mean",
        "Answer.is_fact.fact": list,
        "Answer.is_myth.myth": list,    
    }
).reset_index()
result_df.head(5)

Unnamed: 0,Answer.sentence,Answer.Fact.fact,Answer.factSource,Answer.aiName,Answer.interest,Answer.is_fact.fact,Answer.is_myth.myth
0,"""Artificial intelligence is the simulation of ...",True,[Chat GPT],ChapGPT,,[nan],[nan]
1,"""Drinking eight glasses of water a day is esse...",False,[Source: Mayo Clinic (or any other credible so...,chatgpt,,[nan],[nan]
2,"""Drinking eight glasses of water a day is nece...",False,"[""National Academies of Sciences, Engineering,...",CHATGPT,,[nan],[nan]
3,"""The Phoenix is ​​said to rise from its own as...",False,[https://copilot.microsoft.com/chats/SFPA7VjL6...,ChatGPT,,[nan],[nan]
4,99% of gamblers quit right before they win it ...,False,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",,50.85,"[False, False, False, False, True, True, True,...","[True, True, True, True, False, False, False, ..."


In [22]:
# replace all [nan] with []
import numpy as np
result_df = result_df.map(lambda x: [] if isinstance(x, list) and all(isinstance(i, (int, float)) and np.isnan(i) for i in x) else x)

In [23]:
result_df["Answer.sentence"] = result_df["Answer.sentence"].str.strip('"')
result_df["Answer.sentence"] = result_df["Answer.sentence"].str.strip('')
result_df["Answer.sentence"] = result_df["Answer.sentence"].str.strip('"')
result_df["Answer.factSource"] = result_df["Answer.factSource"].map(lambda x: [i.strip('"') for i in x])
result_df["Answer.aiName"] = result_df["Answer.aiName"].str.strip('"')

In [24]:
len(result_df)

172

In [25]:
# show types of result_df
result_df.dtypes

Answer.sentence         object
Answer.Fact.fact          bool
Answer.factSource       object
Answer.aiName           object
Answer.interest        float64
Answer.is_fact.fact     object
Answer.is_myth.myth     object
dtype: object

In [34]:
result_df.head(5)
# show all result_df where Answer.factSource is not a list
result_df[result_df["Answer.is_myth.myth"].map(lambda x: not isinstance(x, list))]

Unnamed: 0,Answer.sentence,Answer.Fact.fact,Answer.factSource,Answer.aiName,Answer.interest,Answer.is_fact.fact,Answer.is_myth.myth


In [35]:
# dump result_df to a csv file
result_df.to_csv("mturk_aggregated_results.csv", index=False, sep='|')