In [38]:
# Preprocessing of the dataset crowd_data.tsv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the tab-separated crowd answers into a DataFrame and preview the first ten rows
data = pd.read_csv(
    '../dataset/crowd_data/crowd_data.tsv',
    sep='\t',
)
data.head(n=10)

Unnamed: 0,HITId,HITTypeId,Title,Reward,AssignmentId,WorkerId,AssignmentStatus,WorkTimeInSeconds,LifetimeApprovalRate,Input1ID,Input2ID,Input3ID,AnswerID,AnswerLabel,FixPosition,FixValue
0,1,7QT,Is this triple correct or incorrect?,$0.50,1,2133ICYWE97,Submitted,60,99%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
1,1,7QT,Is this triple correct or incorrect?,$0.50,2,2133U7HKDLO,Submitted,40,40%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,yes,yes
2,1,7QT,Is this triple correct or incorrect?,$0.50,3,928UJANWZ12,Submitted,50,98%,wd:Q11621,wdt:P2142,792910554,2.0,INCORRECT,,
3,1,7QT,Is this triple correct or incorrect?,$0.50,4,1726JMZQW,Submitted,80,70%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
4,1,7QT,Is this triple correct or incorrect?,$0.50,5,2134U7HKDMM,Submitted,2,70%,wd:Q11621,wdt:P2142,792910554,1.0,CORRECT,,
5,2,7QT,Is this triple correct or incorrect?,$0.50,6,2134U7HKDMM,Submitted,2,70%,wd:Q603545,wdt:P2142,4300000,2.0,INCORRECT,,
6,2,7QT,Is this triple correct or incorrect?,$0.50,7,2133ICYWE97,Submitted,120,99%,wd:Q603545,wdt:P2142,4300000,1.0,CORRECT,,
7,2,7QT,Is this triple correct or incorrect?,$0.50,8,928UJANWZ12,Submitted,60,98%,wd:Q603545,wdt:P2142,4300000,1.0,CORRECT,,
8,2,7QT,Is this triple correct or incorrect?,$0.50,9,1726JMZQW,Submitted,90,70%,wd:Q603545,wdt:P2142,4300000,1.0,CORRECT,,
9,2,7QT,Is this triple correct or incorrect?,$0.50,10,2133U7HKDLO,Submitted,40,40%,wd:Q603545,wdt:P2142,4300000,1.0,CORRECT,yes,yes


In [39]:
# Filter by LifetimeApprovalRate > 70% (strictly greater: workers at exactly 70% are dropped).
# NOTE(review): this comment previously said ">= 70%" while the comparison below is `> 70`;
# the comparison is kept unchanged -- confirm which threshold is actually intended.
print('Initial length of the dataset: ', len(data))
print('Number of mini tasks: ', len(data['HITId'].unique()))
# Strip the literal '%' sign so the rate can be parsed as a number.
# regex=False makes the literal match explicit instead of relying on the pandas-version default.
data['LifetimeApprovalRate'] = data['LifetimeApprovalRate'].str.replace('%', '', regex=False)
# .copy() gives an independent frame, so the column assignments below do not
# operate on a slice of the unfiltered frame (avoids SettingWithCopyWarning).
data = data[pd.to_numeric(data['LifetimeApprovalRate']) > 70].copy()
print('\nFinal length of the dataset: ', len(data))
print('Number of mini tasks: ', len(data['HITId'].unique()))

# remove "wd:" from the Inputs Ids
data['Input1ID'] = data['Input1ID'].str.replace('wd:', '', regex=False)
data['Input2ID'] = data['Input2ID'].str.replace('wdt:', '', regex=False) # Input2IDs are relations, therefore we delete the "wdt:"
data['Input3ID'] = data['Input3ID'].str.replace('wd:', '', regex=False)

Initial length of the dataset:  305
Number of mini tasks:  61

Final length of the dataset:  162
Number of mini tasks:  61


In [40]:
# Step 1: Aggregate answers by majority voting and count AnswerID = 1 and 2
def majority_vote(answers):
    """Return the most frequent answer in ``answers``, or 0 when there is no winner.

    Parameters
    ----------
    answers : pandas.Series
        The AnswerID values given for one HIT. Missing values are ignored
        because ``value_counts`` drops them.

    Returns
    -------
    The most frequent value (plurality winner), or ``0`` when the two most
    frequent values are tied or when there are no non-missing answers at all.
    """
    counts = answers.value_counts()  # sorted by frequency, most frequent first
    if counts.empty:
        # No usable answers: idxmax() would raise on an empty Series, so report "no majority".
        return 0
    if len(counts) > 1 and counts.iloc[0] == counts.iloc[1]:
        return 0  # No majority, returning 0
    return counts.idxmax()  # Return the AnswerID with the maximum count

# Collapse the answers of every HIT into one label via majority_vote (0 means a tie)
aggregated_data = (
    data.groupby('HITId')['AnswerID']
    .agg(majority_vote)
    .reset_index()
    .rename(columns={'AnswerID': 'MajorityAnswer'})
)

# Tally, per HITId, how many times each AnswerID was given (missing combinations count as 0)
answer_counts = data.pivot_table(index='HITId', columns='AnswerID', aggfunc='size', fill_value=0)

# Attach the tallies to the majority labels and give the count columns readable names
aggregated_data = (
    aggregated_data
    .merge(answer_counts, left_on='HITId', right_index=True, how='left')
    .rename(columns={1: 'CountAnswerID1', 2: 'CountAnswerID2'})
)

aggregated_data

Unnamed: 0,HITId,MajorityAnswer,CountAnswerID1,CountAnswerID2
0,1,0.0,1,1
1,2,1.0,2,0
2,3,0.0,1,1
3,4,2.0,0,2
4,5,1.0,2,0
...,...,...,...,...
56,57,1.0,2,1
57,58,2.0,0,3
58,59,1.0,2,1
59,60,1.0,2,1


In [41]:
# Report the number of rows in the aggregated frame
print('Length of the aggregated dataset: ', aggregated_data.shape[0])

Length of the aggregated dataset:  61


In [42]:
import statsmodels.api as sm
from statsmodels.stats.inter_rater import fleiss_kappa


# Reformatting data for Fleiss' kappa
# Count, for every (HITTypeId, HITId) pair, how many answers fall into each AnswerID.
# Result: one row per HIT, one column per AnswerID, 0 where an answer was never given.
formatted_for_kappa = data.groupby(['HITTypeId', 'HITId', 'AnswerID']).size().unstack(fill_value=0)

# Compute Fleiss' kappa for each HITTypeId
kappa_results = {}
for hit_type, group in formatted_for_kappa.groupby(level=0):
    # Here, each row is a HITId and the columns are counts of each AnswerID
    print(hit_type, group.shape)

    # Fleiss' kappa is defined for a constant number of ratings per subject;
    # make a violation visible instead of leaving it to an opaque failure.
    ratings_per_hit = group.sum(axis=1)
    if ratings_per_hit.nunique() > 1:
        print(
            f"Warning: HITs of type {hit_type} have between {ratings_per_hit.min()} and "
            f"{ratings_per_hit.max()} ratings; Fleiss' kappa assumes a constant number per HIT."
        )

    try:
        kappa_results[hit_type] = fleiss_kappa(group, method='fleiss')
    except Exception as e:
        # Keep the run going for the other HIT types, but store NaN rather than the
        # error text so the values stay numeric when mapped into a DataFrame column.
        print(f"Could not compute Fleiss' kappa for {hit_type}: {type(e).__name__}: {e}")
        kappa_results[hit_type] = float('nan')

# Display kappa results
kappa_results

7QT (21, 2)
8QT (20, 2)
9QT (20, 2)


{'7QT': 0.1428571428571428,
 '8QT': 0.04000000000000009,
 '9QT': 0.26339285714285726}

In [43]:
# Extension of aggregated_data with Entity, response and Fleiss' kappa
input_columns = ['Input1ID', 'Input2ID', 'Input3ID', 'HITTypeId']
inputs_and_type = data[['HITId'] + input_columns].drop_duplicates()

# Drop columns added by a previous run of this cell first: merging onto an already
# extended frame would create *_x / *_y duplicates and break the 'HITTypeId' lookup below.
aggregated_data = pd.merge(
    aggregated_data.drop(columns=input_columns + ['Kappa'], errors='ignore'),
    inputs_and_type,
    on='HITId',
    how='left',
    # Fail loudly if a HITId maps to more than one input/type combination,
    # which would otherwise silently duplicate aggregated rows.
    validate='one_to_one',
)

# Add Fleiss' kappa to the aggregated data (looked up by the HIT's type)
aggregated_data['Kappa'] = aggregated_data['HITTypeId'].map(kappa_results)

# save this dataframe in the directory crowd_data as crowd_data_aggregated.csv
aggregated_data.to_csv('../dataset/crowd_data/crowd_data_aggregated.csv', index=False)

In [44]:
# Preview the first five rows of the extended aggregated frame
aggregated_data.head(n=5)

Unnamed: 0,HITId,MajorityAnswer,CountAnswerID1,CountAnswerID2,Input1ID,Input2ID,Input3ID,HITTypeId,Kappa
0,1,0.0,1,1,Q11621,P2142,792910554,7QT,0.142857
1,2,1.0,2,0,Q603545,P2142,4300000,7QT,0.142857
2,3,0.0,1,1,Q16911843,P577,2014-01-18,7QT,0.142857
3,4,2.0,0,2,Q132863,P2142,969023261,7QT,0.142857
4,5,1.0,2,0,Q1628022,P577,1951-01-01,7QT,0.142857
