In [1]:
import pandas as pd
import numpy as np

# User Credibility Scores: Creating Task Scores
This notebook will include steps 2 and 3 from the User Monitoring Pipeline, which includes identifying the consensus answer from the IAA and Gold Standard data (step 2) and creating the corresponding task scores for users who completed this task (step 3).

## Hardcoded Evidence Schema
Most of this information will be in some sort of schema file (see file 'Evidence2021_05_19-Schema.csv'), but I'm not sure where the schema file is for this specific set of tasks. Thus, I hard coded it with the schema data from https://github.com/Goodly/PEUserMonitoring/blob/master/task-schema/Evidence.txt. Getting this information with the right schema file should be fairly straightforward.


In [2]:
scored_questions = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}

question_schema = {1:{'type':'select_one_nominal', 'num_choices':3},
           2:{'type':'select_all', 'num_choices':9},
           3:{'type':'select_one_nominal', 'num_choices':1},
           4:{'type':'select_one_ordinal', 'num_choices':6},
           5:{'type':'select_one_nominal', 'num_choices':5},
           6:{'type':'select_one_nominal', 'num_choices':3},
           7:{'type':'select_one_ordinal', 'num_choices':1},
           8:{'type':'select_one_ordinal', 'num_choices':5},
           9:{'type':'select_one_ordinal', 'num_choices':3},
           10:{'type':'select_one_ordinal', 'num_choices':5},
           11:{'type':'select_one_ordinal', 'num_choices':5},
           12:{'type':'select_one_ordinal', 'num_choices':4},
           13:{'type':'select_one_ordinal', 'num_choices':10},
           14:{'type':'select_one_ordinal', 'num_choices':10}}

## Preprocessing of IAA and Gold Standard Data

In [3]:
# read in the data
gold = pd.read_csv('evidence_eric/evidence_eric/Covid_Evidence2020_03_21.adjudicated-edb1510f-1923-4d6f-a678-95f53d752bea-Tags.csv')
iaa = pd.read_csv('evidence_eric/evidence_eric/Covid_Evidencev1.IAA-edb1510f-1923-4d6f-a678-95f53d752bea-Tags.csv')

# getting rid of some rows where the answer was invalid, probably represents some other metadata
iaa = iaa[iaa.answer_uuid.str.len() > 3]

In [4]:
# these are the only relevant columns for scoring for now, notice highlight data is not included here
cols = ['answer_uuid', 'question_Number', 'agreed_Answer']

# getting rid of some rows where the above columns were the same, this may represent different 
# highlights for the same question and answer?
gold = gold[cols].drop_duplicates()
iaa = iaa[cols].drop_duplicates()

Below cells just show the format of the preprocessed IAA and Gold Standard data.

In [5]:
gold.head(3)

Unnamed: 0,answer_uuid,question_Number,agreed_Answer
0,73d7a14a-9ec6-404c-b2b7-a55508af3b76,1,1
1,5a1fb1f4-d8b7-45c0-bce5-7d4c3b91c55f,2,1
3,ba2d1638-2509-4ce8-9130-39ea26d1d424,2,2


In [6]:
iaa.head(3)

Unnamed: 0,answer_uuid,question_Number,agreed_Answer
0,73d7a14a-9ec6-404c-b2b7-a55508af3b76,1,1
3,5a1fb1f4-d8b7-45c0-bce5-7d4c3b91c55f,2,1
4,ba2d1638-2509-4ce8-9130-39ea26d1d424,2,2


## Creating The Answer Key

In [7]:
# consensus answer key
consensus_answers = {}

def get_answer(question, answer_source):
    """
    Take in the question and the answer_source, either IAA or Gold Standard, and adds the
    converged consensus answer to the consensus_answer answer key. This will be an single
    int for select_one questions, or a list of ints for select_all questions.
    """
    question_type = question_schema[question]['type']
    
    if question_type == 'select_one_nominal' or question_type == 'select_one_ordinal':
        assert len(answer_source[answer_source.question_Number == question].agreed_Answer) == 1
        consensus_answers[question] = answer_source[answer_source.question_Number == question].agreed_Answer.iloc[0]
    elif question_type == 'select_all':
        consensus_answers[question] = list(answer_source[answer_source.question_Number == question].agreed_Answer)
    else:
        raise ValueError('Invalid question type')

In [8]:
# create a set of questions that the Gold Standard data determined converged
gold_consensus_questions = set(gold.question_Number)
# create a set of questions that the IAA data determined converged
iaa_consensus_questions = set(iaa.question_Number)

# uses get_answer function to fill in the consensus_answers answer key
for question in scored_questions:
    if question in gold_consensus_questions:
        get_answer(question, gold)
    elif question in iaa_consensus_questions:
        get_answer(question, iaa)
    else:
        consensus_answers[question] = -1

What the consensus key looks like:

In [9]:
consensus_answers

{1: 1,
 2: [1, 2, 3, 5],
 3: -1,
 4: 2,
 5: 5,
 6: '3',
 7: 1,
 8: 4,
 9: 1,
 10: 4,
 11: 4,
 12: 2}

## Scoring Users

In [10]:
def scoring_select_one_nominal(question, answer):
    """
    Takes in a question and the selected answer, returns a score of 0 if the consensus 
    answer is different, and 1 if the consensus answer is the same.
    """
    consensus_answer = consensus_answers[question]
    return int(consensus_answer == answer)

In [11]:
def scoring_select_one_ordinal(question, answer):
    """
    Takes in a question and the selected answer, returns a score between 0 and 1 depending
    on how far off the answer is from the consensus answer.
    """
    consensus_answer = consensus_answers[question]
    num_choices = question_schema[question]['num_choices']
    return 1 - (abs(answer - consensus_answer) / num_choices)

In [12]:
def scoring_select_all(question, answer_list):
    """
    Takes in a question and the selected answer, returns a score between 0 and 1 depending
    on the accuracy ((True Positive + True Negative) / Total) of the answer selections
    compared to the consensus answer selections.
    """
    answer_set = set(answer_list)
    consensus_answer_set = set(consensus_answers[question])
    num_choices = question_schema[question]['num_choices']
    
    total_correct = 0
    for answer in range(1, num_choices+1):
        if (answer in answer_set) and (answer in consensus_answer_set):
            total_correct += 1
        elif (answer not in answer_set) and (answer not in consensus_answer_set):
            total_correct += 1
        else:
            total_correct += 0
        
    return total_correct / num_choices

In [13]:
def scoring(row):
    """
    This is a Pandas apply function, to be applied on axis=1 (on each row).
    Makes a call to one of scoring_select_one_nominal, scoring_select_one_ordinal, and
    scoring_select_all depending on the type of question, returns the outputted score.
    
    An important note is that right now if neither IAA nor Gold Standard have a consensus
    answer for a question, the consensus_answers answer key will contain a -1 for that 
    question. I currently assume this question should not have been answered due to it
    being a child-question from an incorrectly answered parent question, so I score it
    """
    question = int(row['question_label'])
    answer_list = [int(i) for i in row['answer_label']]
    
    if consensus_answers[question] == -1:
        return 0
    
    question_type = question_schema[question]['type']
    if question_type == 'select_one_nominal':
        return scoring_select_one_nominal(question, answer_list[0])
    elif question_type == 'select_one_ordinal':
        return scoring_select_one_ordinal(question, answer_list[0])
    elif question_type == 'select_all':
        return scoring_select_all(question, answer_list)
    else:
        raise ValueError('Invalid question type')

In [14]:
# read in the datahunt
df_full = pd.read_csv('evidence_eric/evidence_eric/Covid_Evidencev1-Task-2224-DataHunt.csv')

In [15]:
# narrow down the datahunt to the relevant columns for scoring, getting rid of some rows
# where the data for the below columns were the same, this may represent different highlights 
# for the same question and answer? not certain.
df = df_full[['contributor_uuid', 'question_label', 'answer_label']].drop_duplicates()

# the question and answer labels in the datahunt are in the form 'T1.QX' and 'T1.QX.AX'
# the below lines strip down to only question number and answer number
df['question_label'] = df['question_label'].str.split('Q').str[1].astype(int)
df['answer_label'] = df['answer_label'].str.split('A').str[1]

In [16]:
# we want to groupby contributor_uuid and question_label to get all the answers a user
# selected for a particular question, to account for select_all questions. Now, the
# granularity of df_grouped will be one row per contributor answering a question.
df_grouped = df.groupby(['contributor_uuid', 'question_label']).agg(list).reset_index()

In [17]:
# we only want to score the rows with scored questions (not survey questions like 13 and 14)
# so we'll filter those out
df_grouped = df_grouped[df_grouped.question_label.isin(scored_questions)]

In [18]:
# using the scoring function defined above, we'll create a new column containing the scores
# for each contributor answering a question.
df_grouped['score'] = df_grouped.apply(scoring, axis=1)

This is the current format of df_grouped.

In [19]:
df_grouped.head(3)

Unnamed: 0,contributor_uuid,question_label,answer_label,score
0,00f548b7-6b63-4b47-828e-8e416b6ca0e2,1,[1],1.0
1,00f548b7-6b63-4b47-828e-8e416b6ca0e2,2,"[3, 5, 8, 4]",0.555556
2,00f548b7-6b63-4b47-828e-8e416b6ca0e2,3,[1],0.0


In [20]:
# lastly, we want to get the average score for all task responses, this will be their
# task score. this is done by a simple groupby on contributor_uuid and mean() aggregate function
task_scores = df_grouped[['contributor_uuid', 'score']].groupby('contributor_uuid').mean()

This is what the final task_scores output will look like:

In [21]:
task_scores

Unnamed: 0_level_0,score
contributor_uuid,Unnamed: 1_level_1
00f548b7-6b63-4b47-828e-8e416b6ca0e2,0.603241
070268de-067c-463b-9ad3-5c88292d881e,0.878889
082a8363-a579-41b4-8918-c166fec3a3a4,0.513333
09df3ada-e5a8-4419-b78a-e0d1e9b37484,0.277778
09f279ac-1c34-4a84-8972-3d92b93605a7,0.361111
0c22ce7c-4641-4bb1-97f4-7a7355f70f25,0.831481
0e51ab2d-1a03-4d18-be33-fd21a829d19b,0.75463
1b69eee8-ab95-49dd-8979-9fff7655964d,0.277778
21ffd986-c219-43a0-b82f-4cc460da628d,0.831481
24640f45-b90b-40dc-a848-9e03fdfbbf91,0.588889


The below value is the number of rows processed in this datahunt, to be used for updating the datahunt tracking table.

In [22]:
len(df_full)

773