In [49]:
import pandas as pd
from collections import Counter
from functools import reduce
import json

## Load MC quiz data

In [2]:
# Quiz responses are pulled straight from the Google Sheet's CSV export.
# Every column is read as a string so answer text is never coerced to numbers.
url = "https://docs.google.com/spreadsheets/d/1JNtLiQyLwuYOi2t14nYbaZAhneuRuGQGfmIJJY2WKlE/export?format=csv&gid=590588681"
df = pd.read_csv(filepath_or_buffer=url, dtype=str)

In [51]:
# Quiz questions, listed in the same order as the question columns of the response sheet.
# Each entry is a dict with these keys:
#   "lexicalized"   - question text; the counting cell asserts it equals the sheet's column header
#   "entity", "class", "prop", "wd_units"
#                   - identifier strings (presumably Wikidata Q-node / property ids; "wd_units"
#                     is None or "" where no unit is given -- TODO confirm)
#   "ans_to_qnode"  - answer option text -> Q-node id(s), '|'-separated when there are several,
#                     or a "low-high" range string for numeric-valued questions
#   "gt_wd"         - set of answer options that are true according to Wikidata
#   "gt_irl"        - set of answer options that are true after fact-checking
#   "single_answer" - boolean flag (presumably True when only one option could be selected)
questions = [
        {
            "lexicalized" : "What language(s) does Vladimir Putin speak, write, or sign with?",
            "entity": "Q7747",
            "class" : "Q5",
            "prop" : "P1412",
            "wd_units" : None,
            "ans_to_qnode" : {"English": "Q1860",
                              "German": "Q188",
                              "Russian": "Q7737",
                              "Swedish": "Q9027",
                              "Spanish": "Q1321"},
            "gt_wd" : {"Russian", "English", "German", "Swedish"},
            "gt_irl" : {"Russian", "English", "German", "Swedish"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What occupation(s) has John Oliver had?",
            "entity": "Q1701254",
            "class" : "Q5",
            "prop" : "P106",
            "wd_units" : None,
            "ans_to_qnode" : {"Film Actor": "Q10800557",
                              "Writer": "Q36180",
                              "Television Actor": "Q10798782",
                              "Television Presenter": "Q947873",
                              "Film Producer": "Q3282637"
                             },
            "gt_wd" : {"Film Actor", "Television Actor", "Writer", "Television Presenter"},
            "gt_irl" : {"Film Actor", "Television Actor", "Writer", "Television Presenter"},
            "single_answer" : False
        },
        {
            "lexicalized" : "Please select the country or countries in which Roger Federer possesses citizenship",
            "entity": "Q1426",
            "class" : "Q5",
            "prop" : "P27",
            "wd_units" : None,
            "ans_to_qnode" : {'South Africa': "Q258",
                              'Switzerland': "Q39",
                              'South Korea': "Q884",
                              'France': "Q142",
                              'United States of America': "Q30"
                             },
            "gt_wd" : {"South Africa", "Switzerland"},
            "gt_irl" : {"South Africa", "Switzerland"},
            "single_answer" : False
        },
        {
            "lexicalized" : "How many children does Arnold Schwarzenegger have?",
            "entity": "Q2685",
            "class" : "Q5",
            "prop" : "P1971",
            "wd_units" : "",
            "ans_to_qnode" : {"2 or fewer": "0.0-2.0",
                              "3":"2.0-3.0",
                              "4":"3.0-4.0",
                              "5 or more": "4.0-210.0"
                             },
            "gt_wd" : {"5 or more"},
            "gt_irl" : {"5 or more"},
            "single_answer" : True
        },
        {
            "lexicalized" : "What is Boris Johnson's place of birth?",
            "entity": "Q180589",
            "class" : "Q5",
            "prop" : "P19",
            "wd_units" : None,
            "ans_to_qnode" : {"New York City": "Q60",
                              "Paris": "Q90",
                              'Rome': "Q220",
                              'Hamburg': "Q1055",
                              'London': "Q84"},
            "gt_wd" : {"New York City"},
            "gt_irl" : {"New York City"},
            "single_answer" : True
        },
        {
            "lexicalized" : "What occupation(s) has Eminem had?",
            "entity": "Q5608",
            "class" : "Q5",
            "prop" : "P106",
            "wd_units" : None,
            "ans_to_qnode" : {'Author': "Q482980",
                              'Film Producer': "Q3282637",
                              'Entrepreneur': "Q131524",
                              'Singer-songwriter': "Q488205",
                              'Painter': "Q1028181"},
            "gt_wd" : {"Author", "Film Producer", "Singer-songwriter", "Entrepreneur"},
            "gt_irl" : {"Author", "Film Producer", "Singer-songwriter", "Entrepreneur"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What occupation(s) has George W. Bush had?",
            "entity": "Q207",
            "class" : "Q5",
            "prop" : "P106",
            "wd_units" : None,
            "ans_to_qnode" : {'Rugby Union Player': "Q14089670",
                              'Politician': "Q82955",
                              'Military Officer': "Q189290",
                              'Painter': "Q1028181",
                              'Singer': "Q177220"
                             },
            "gt_wd" : {"Rugby Union Player", "Politician", "Military Officer", "Painter"},
            "gt_irl" : {"Rugby Union Player", "Politician", "Military Officer", "Painter"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What occupation(s) has Beyoncé had?",
            "entity": "Q36153",
            "class" : "Q5",
            "prop" : "P106",
            "wd_units" : None,
            "ans_to_qnode" : {'Model': "Q4610556",
                              'Voice Actor': "Q2405480",
                              'Entrepreneur': "Q131524",
                              'Singer-songwriter': "Q488205",
                              'Dancer': "Q5716684"},
            "gt_wd" : {"Model", "Voice Actor", "Singer-songwriter", "Entrepreneur", "Dancer"},
            "gt_irl" : {"Model", "Voice Actor", "Singer-songwriter", "Entrepreneur", "Dancer"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What is Donald Trump's mass in pounds circa. 2019?",
            "entity": "Q22686",
            "class" : "Q5",
            "prop" : "P2067",
            "wd_units" : "Q100995",
            "ans_to_qnode" : {"> 1 and <= 181": "1.0-181.0",
                              "> 181 and <= 195": "181.0-195.0",
                              "> 195 and <= 210": "195.0-210.0",
                              "> 210 and <= 240": "210.0-240.0",
                              "> 240": "240.0-555.0",
                             },
            "gt_wd" : {"> 240"},
            "gt_irl" : {"> 240"},
            "single_answer" : True
        },
        {
            "lexicalized" : "What political party/parties has Donald Trump been a member of?",
            "entity": "Q22686",
            "class" : "Q5",
            "prop" : "P102",
            "wd_units" : None,
            "ans_to_qnode" : {'Democratic Party': "Q29552",
                              'Republican Party': "Q29468",
                              'Communist Party of the Soviet Union': "Q79854",
                              'National Socialist German Workers\' Party': "Q7320",
                              'Social Democratic Party of Germany': "Q49768"
                             },
            "gt_wd" : {"Democratic Party", "Republican Party"},
            "gt_irl" : {"Democratic Party", "Republican Party"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What occupation(s) has John Cena had?",
            "entity": "Q44437",
            "class" : "Q5",
            "prop" : "P106",
            "wd_units" : None,
            "ans_to_qnode" : {'Musician': "Q639669",
                              'Voice Actor': "Q2405480",
                              'Writer': "Q36180",
                              'Politician': "Q82955",
                              'Chess Player': "Q10873124"},
            "gt_wd" : {"Musician", "Voice Actor", "Writer"},
            "gt_irl" : {"Musician", "Voice Actor", "Writer"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What sport(s) has John Cena participated in?",
            "entity": "Q44437",
            "class" : "Q5",
            "prop" : "P641",
            "wd_units" : None,
            "ans_to_qnode" : {'American Football': "Q41323",
                              'Baseball': "Q5369",
                              'Amateur Wrestling': "Q838089",
                              'Boxing': "Q32112",
                              'Rugby': "Q5378",
                             },
            "gt_wd" : {"American Football"},
            "gt_irl" : {"American Football"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What occupation(s) has Kanye West had?",
            "entity": "Q15935",
            "class" : "Q5",
            "prop" : "P106",
            "wd_units" : None,
            "ans_to_qnode" : {'Architect': "Q42973",
                              'Film Director': "Q2526255",
                              'Politician': "Q82955",
                              'Singer': "Q177220",
                              'Sport Cyclist': "Q2309784"},
            "gt_wd" : {"Film Director", "Politician", "Singer", "Architect"},
            "gt_irl" : {"Film Director", "Politician", "Singer"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What occupation(s) has LeBron James had?",
            "entity": "Q36159",
            "class" : "Q5",
            "prop" : "P106",
            "wd_units" : None,
            "ans_to_qnode" : {'Television Actor': "Q10798782",
                              'Basketball Player': "Q3665646",
                              'Screenwriter': "Q28389",
                              'Researcher': "Q1650915",
                              'Writer': "Q36180"
                             },
            "gt_wd" : {"Television Actor", "Basketball Player", "Screenwriter"},
            "gt_irl" : {"Television Actor", "Basketball Player", "Screenwriter", "Writer"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What occupation(s) did Leonardo Da Vinci have?",
            "entity": "Q762",
            "class" : "Q5",
            "prop" : "P106",
            "wd_units" : None,
            "ans_to_qnode" : {'Diplomat': "Q193391",
                              'Zoologist': "Q350979",
                              'Chemist': "Q593644",
                              'Painter': "Q1028181",
                              'Engineer': "Q81096"
                             },
            "gt_wd" : {"Zoologist", "Chemist", "Painter", "Engineer", "Diplomat"},
            "gt_irl" : {"Zoologist", "Chemist", "Painter", "Engineer"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What language(s) did Ruth Bader Ginsburg speak, write, or sign?",
            "entity": "Q11116",
            "class" : "Q5",
            "prop" : "P1412",
            "wd_units" : None,
            "ans_to_qnode" : {'Swedish': "Q9027",
                              'English': "Q1860",
                              'Russian': "Q7737",
                              'French': "Q150",
                              'German': "Q188"
                             },
            "gt_wd" : {"Swedish"},
            "gt_irl" : {"Swedish", "English"},
            "single_answer" : False
        },
        {
            "lexicalized" : "What is the life expectancy in years of Australia circa. 2016?",
            "entity": "Q408",
            "class" : "Q3624078",
            "prop" : "P2250",
            "wd_units" : "Q577",
            "ans_to_qnode" : {"76.6 to 85.4": "76.59756-85.41707",
                              "63.3 to 69.9": "63.33-69.862",
                              "70.2 to 73.8": "70.197-73.82683",
                              "51.8 to 63.2": "51.835-63.238",
                              "73.9 to 76.6": "73.861-76.577"
                             },
            "gt_wd" : {"76.6 to 85.4"},
            "gt_irl" : {"76.6 to 85.4"},
            "single_answer" : True
        },
        {
            "lexicalized" : "What is the retirement age in Colombia? The answer for either men or women will be accepted.",
            "entity": "Q739",
            "class" : "Q3624078",
            "prop" : "P3001",
            "wd_units" : "Q24564698",
            "ans_to_qnode" : {"55.0 to 60.0" : "55.0-60.0",
                              "62.5 to 65.0" : "62.5-65.0",
                              "60.0 to 62.5" : "60.0-62.5",
                              "65.25 to 67.0" : "65.25-67.0"
                             },
            "gt_wd" : {"55.0 to 60.0", "60.0 to 62.5"},
            "gt_irl" : {"55.0 to 60.0", "60.0 to 62.5"},
            "single_answer" : True
        },
        {
            "lexicalized" : "What percentage of the territory of Canada inside its coast line and international boundaries is water?",
            "entity": "Q16",
            "class" : "Q3624078",
            "prop" : "P2927",
            "wd_units" : "Q11229",
            "ans_to_qnode" : {"8.4 to 27.9": "8.4-27.9",
                              "1.5 to 2.5": "1.5-2.5",
                              "0.0 to 0.2": "0.0-0.2",
                              "2.6 to 5.7": "2.6-5.7",
                              "0.3 to 1.4": "0.3-1.444"
                             },
            "gt_wd" : {"8.4 to 27.9"},
            "gt_irl" : {"8.4 to 27.9"},
            "single_answer" : True
        },
        {
            "lexicalized" : 'Which entity or entities is/are depicted in the painting "Mona Lisa"?',
            "entity": "Q12418",
            "class" : "Q3305213",
            "prop" : "P180",
            "wd_units" : None,
            "ans_to_qnode" : {'Bridge': "Q12280",
                              'Mountain': "Q8502",
                              'Sky': "Q527",
                              'Landscape': "Q107425",
                              'Virgin Mary': "Q345"
                             },
            "gt_wd" : {"Bridge", "Mountain", "Sky", "Landscape"},
            "gt_irl" : {"Bridge", "Mountain", "Sky", "Landscape"},
            "single_answer" : False
        },
        {
            "lexicalized" : 'Which genre(s) is the painting "Guernica" considered?',
            "entity": "Q175036",
            "class" : "Q3305213",
            "prop" : "P136",
            "wd_units" : None,
            "ans_to_qnode" : {'History Painting': "Q742333",
                              'Portrait': "Q134307",
                              'Cityscape': "Q1935974",
                              'Landscape Art': "Q191163",
                              'Self-Portrait': "Q192110"
                             },
            "gt_wd" : {"History Painting"},
            "gt_irl" : {"History Painting"},
            "single_answer" : False
        },
        {
            "lexicalized" : 'Which materials were used in the painting "The Scream"?',
            "entity": "Q18891156",
            "class" : "Q3305213",
            "prop" : "P186",
            "wd_units" : None,
            "ans_to_qnode" : {'Cardboard': "Q389782",
                              'Tempera': "Q175166",
                              'Oil Paint': "Q296955",
                              'Paper': "Q11472",
                              'Canvas': "Q12321255|Q4259259"
                             },
            "gt_wd" : {"Cardboard", "Tempera"},
            "gt_irl" : {"Cardboard", "Tempera", "Oil Paint"},
            "single_answer" : False
        },
        {
            "lexicalized" : 'Which genre(s) is the film "The Princess Bride" considered?',
            "entity": "Q506418",
            "class" : "Q11424",
            "prop" : "P136",
            "wd_units" : None,
            "ans_to_qnode" : {'Film Based on a Novel': "Q52207399",
                              'Romantic Comedy': "Q860626",
                              'Fantasy Film': "Q157394",
                              'Drama': "Q130232",
                              'Musical Film': "Q842256"
                             },
            "gt_wd" : {"Film Based on a Novel", "Romantic Comedy", "Fantasy Film"},
            "gt_irl" : {"Film Based on a Novel", "Romantic Comedy", "Fantasy Film"},
            "single_answer" : False
        },
        {
            "lexicalized" : 'What was the total revenue in euros of the business "Adidas" circa. 2014?',
            "entity": "Q3895",
            "class" : "Q4830453",
            "prop" : "P2139",
            "wd_units" : "Q4916",
            "ans_to_qnode" : {"1 to 108,589,000" : "1.0-108589000.0",
                              "438,000,000 to 1,590,000,000" : "438000000.0-1590000000.0",
                              "113,000,000 to 427,800,000" : "113000000.0-427800000.0",
                              "6,764,000,000 to 217,267,000,000" : "6764000000.0-217267000000.0",
                              "1,610,000,000 to 6,745,000,000" : "1610000000.0-6745000000.0"
                             },
            "gt_wd" : {"1 to 108,589,000"},
            "gt_irl" : {"6,764,000,000 to 217,267,000,000"},
            "single_answer" : True
        },
]

In [52]:
# Recording predicted answers for each question
# Columns before this index are not quiz questions (presumably timestamp / respondent
# metadata -- TODO confirm against the sheet); question columns follow in `questions` order.
FIRST_QUESTION_COL = 4
num_question_cols = len(df.columns) - FIRST_QUESTION_COL
# Fail early with a readable message: extra sheet columns would otherwise raise an
# IndexError, and missing ones would leave questions without "pred_counts".
assert num_question_cols == len(questions), (
    f"sheet has {num_question_cols} question columns but {len(questions)} questions are defined"
)
for i, q in enumerate(questions, start=FIRST_QUESTION_COL):
    # make sure this is the right question
    assert df.columns[i] == q['lexicalized'], f"df.columns[{i}]: {df.columns[i]}\nq['lexicalized']: {q['lexicalized']}"
    # collect predicted answers: each response cell holds the chosen options joined by ", ".
    # Blank cells are read as NaN even with dtype=str, so drop them (a skipped question
    # contributes no selections) instead of crashing on NaN.split.
    pred_counts = Counter(
        ans
        for preds in df.iloc[:, i].dropna()
        for ans in preds.split(", ")
    )
    # make sure all answers are recognized
    for ans in pred_counts:
        assert ans in q["ans_to_qnode"], f"{ans} not in answer list for \'{q['lexicalized']}\'"
    # fill in 0-counts so every answer option has an explicit entry
    for ans in q["ans_to_qnode"]:
        pred_counts.setdefault(ans, 0)
    q["pred_counts"] = pred_counts
    

## Now clean up format...
    

In [53]:
# Reshape every question record into its final layout: renamed keys plus one
# "answers" list that carries everything known about each answer option.
num_participants = len(df)
for q in questions:
    # numeric questions are recognised by their first answer value not being a Q-node id
    first_value = list(q["ans_to_qnode"].values())[0]
    q["is_numeric_answer"] = first_value[0] != "Q"
    # "prop" -> "property"
    q["property"] = q.pop("prop")
    # "single_answer" -> "is_single_answer"
    q["is_single_answer"] = q.pop("single_answer")
    # one dict per answer option
    q["answers"] = [
        {
            # readable answer
            "lexicalized": option,
            # qnode value(s) -- numeric range strings are stripped from these later
            "qnodes": node_spec.split('|'),
            # share of participants who picked this option
            "gt_surprise": q["pred_counts"][option] / num_participants,
            # truth value according to WD or Google
            "truth_value_wd": option in q["gt_wd"],
            "truth_value_fact_checked": option in q["gt_irl"],
        }
        for option, node_spec in q["ans_to_qnode"].items()
    ]
    # drop the intermediate keys that are now folded into "answers"
    for stale_key in ("ans_to_qnode", "gt_wd", "gt_irl", "pred_counts"):
        del q[stale_key]

Add lower-bound and upper-bound info to the numeric answers

In [54]:
# List the label of every numeric-valued answer option, in question order,
# so the bounds table in the next cell can be written against them.
for q in questions:
    if not q["is_numeric_answer"]:
        continue
    for answer in q["answers"]:
        print(answer["lexicalized"])

2 or fewer
3
4
5 or more
> 1 and <= 181
> 181 and <= 195
> 195 and <= 210
> 210 and <= 240
> 240
76.6 to 85.4
63.3 to 69.9
70.2 to 73.8
51.8 to 63.2
73.9 to 76.6
55.0 to 60.0
62.5 to 65.0
60.0 to 62.5
65.25 to 67.0
8.4 to 27.9
1.5 to 2.5
0.0 to 0.2
2.6 to 5.7
0.3 to 1.4
1 to 108,589,000
438,000,000 to 1,590,000,000
113,000,000 to 427,800,000
6,764,000,000 to 217,267,000,000
1,610,000,000 to 6,745,000,000


In [55]:
# Hand-written (low, high) bounds for every numeric answer label; None marks an open end.
num_answer_to_bounds = {
    "2 or fewer": (None, 2),
    "3": (3, 3),
    "4": (4, 4),
    "5 or more": (5, None),
    "> 1 and <= 181": (2, 181),
    "> 181 and <= 195": (182, 195),
    "> 195 and <= 210": (196, 210),
    "> 210 and <= 240": (211, 240),
    "> 240": (241, None),
    "76.6 to 85.4": (76.6, 85.4),
    "63.3 to 69.9": (63.3, 69.9),
    "70.2 to 73.8": (70.2, 73.8),
    "51.8 to 63.2": (51.8, 63.2),
    "73.9 to 76.6": (73.9, 76.6),
    "55.0 to 60.0": (55.0, 60.0),
    "62.5 to 65.0": (62.5, 65.0),
    "60.0 to 62.5": (60.0, 62.5),
    "65.25 to 67.0": (65.25, 67.0),
    "8.4 to 27.9": (8.4, 27.9),
    "1.5 to 2.5": (1.5, 2.5),
    "0.0 to 0.2": (0.0, 0.2),
    "2.6 to 5.7": (2.6, 5.7),
    "0.3 to 1.4": (0.3, 1.4),
    "1 to 108,589,000": (1, 108589000),
    "438,000,000 to 1,590,000,000": (438000000, 1590000000),
    "113,000,000 to 427,800,000": (113000000, 427800000),
    "6,764,000,000 to 217,267,000,000": (6764000000, 217267000000),
    "1,610,000,000 to 6,745,000,000": (1610000000, 6745000000),
}

# Attach the bounds to each answer of every numeric question; a label missing
# from the table raises KeyError.
for q in questions:
    if not q["is_numeric_answer"]:
        continue
    for answer in q["answers"]:
        label = answer["lexicalized"]
        answer["numeric_bounds"] = num_answer_to_bounds[label]

In [56]:
# Display the processed question records to inspect the final structure
# (the whole list is shown, so the output is long).
questions

[{'lexicalized': 'What language(s) does Vladimir Putin speak, write, or sign with?',
  'entity': 'Q7747',
  'class': 'Q5',
  'wd_units': None,
  'is_numeric_answer': False,
  'property': 'P1412',
  'is_single_answer': False,
  'answers': [{'lexicalized': 'English',
    'qnodes': ['Q1860'],
    'gt_surprise': 0.5,
    'truth_value_wd': True,
    'truth_value_fact_checked': True},
   {'lexicalized': 'German',
    'qnodes': ['Q188'],
    'gt_surprise': 0.2692307692307692,
    'truth_value_wd': True,
    'truth_value_fact_checked': True},
   {'lexicalized': 'Russian',
    'qnodes': ['Q7737'],
    'gt_surprise': 0.9615384615384616,
    'truth_value_wd': True,
    'truth_value_fact_checked': True},
   {'lexicalized': 'Swedish',
    'qnodes': ['Q9027'],
    'gt_surprise': 0.23076923076923078,
    'truth_value_wd': True,
    'truth_value_fact_checked': True},
   {'lexicalized': 'Spanish',
    'qnodes': ['Q1321'],
    'gt_surprise': 0.0,
    'truth_value_wd': False,
    'truth_value_fact_checke

## Save processed data to json

### saving our version where numeric answers have a qnode key that gives the string that is in our profiles

In [57]:
# Write the records while numeric answers still carry their "qnodes" range strings.
# Plain write mode is enough here ('w+' additionally opened the file for reading),
# and the encoding is pinned so the result does not depend on the platform default.
with open('mc_trivia_surprise_data.with_numeric_profile_qnodes.json', 'w', encoding='utf-8') as f:
    json.dump(questions, f)

### Remove the `qnodes` key from numeric answers, as it will not make sense for other consumers of this data

In [58]:
# Strip the "qnodes" entry from every answer of a numeric question.
# pop() with a default keeps the cell safe to re-run: a plain `del` raises
# KeyError the second time because the key is already gone.
for q in questions:
    if q["is_numeric_answer"]:
        for answer in q["answers"]:
            answer.pop("qnodes", None)

In [60]:
# Write the current state of the question records as JSON.
# Plain write mode is enough here ('w+' additionally opened the file for reading),
# and the encoding is pinned so the result does not depend on the platform default.
with open('mc_trivia_surprise_data.json', 'w', encoding='utf-8') as f:
    json.dump(questions, f)