This notebook loads the Wikidata-mc-trivia-71 dataset and evaluates our surprise-ranking methods on it.

## requirements
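* tqdm (`pip install tqdm`)
* gensim (`pip install gensim`)
* kgtk (follow documentation here: https://kgtk.readthedocs.io/en/latest/install/)
* other packages imported below: pandas, numpy, scipy, scikit-learn, h5py, torch, torchbiggraph (PyTorch-BigGraph), adjustText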

In [1249]:
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm
import h5py, torch
from torchbiggraph.model import ComplexDiagonalDynamicOperator, TranslationDynamicOperator, \
                                DotComparator, CosComparator, L2Comparator
import json
from utility import kgtk_to_dataframe
from scipy.spatial import distance
from scipy.stats import spearmanr, kendalltau
from sklearn.metrics.pairwise import cosine_similarity
import random
from adjustText import adjust_text
import glob
import itertools
from collections import Counter, defaultdict
from functools import reduce, lru_cache
from gensim.models import KeyedVectors

In [1145]:
# path to the benchmark data
surprise_data_file = "./benchmark_data/mc_trivia_surprise_data.with_numeric_profile_qnodes.json"

# output path
work_dir = "./output/quiz_task"
# path where kypher db file will be saved
store_dir = f"{work_dir}/temp"
# Wikidata claims.wikibase-item file. We only need claims about humans for this dataset, so using a filtered file.
item_file = "./input_data/wikidata-20210215-dwd.claims.wikibase-item.q5.tsv.gz"
# Wikidata labels.en file.
label_file = "./input_data/labels.en.tsv.gz"
# paths to profile_labels_info_joined.RELs_and_AILs.tsv and entity_profile_labels.RELs_and_AILs.shuffled.tsv
profile_labels_file = "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/explainability/profile_labels_info_joined.RELs_and_AILs.tsv"
ent_to_profiles_file = "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/explainability/entity_profile_labels.RELs_and_AILs.shuffled.tsv"


"""
Paths to embedding folders...

TODO for reproducing results:
We don't store the embeddings on github because they are large.
To include them in evaluation when running this notebook, you need to download them
from google drive (location specified on github), and specify their locations below.
"""
emb_locations = {
    # path to wikidata-20211027-dwd-v3.transe-embeddings folder
    "transe": "/data02/profiling/wikidata-20211027-dwd-v3.transe-embeddings",
    # path to wikidata-20210215-dwd.profile-transe-embeddings folder
    "profile-transe": "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/profile_graph_embeddings/output",
    # path to wikidata-20210215-dwd-v2.complex-embeddings folder
    "complex": "/data02/profiling/wikidata-20210215-dwd-v2.complex-embeddings",
    # path to the profile-complex embeddings folder
    "profile-complex": "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/profile_graph_embeddings/complex_04292022/output",
    # path to "Random walk embeddings" folder.
    # NOTE: this is a Python string, not a shell argument, so the spaces must NOT be
    # backslash-escaped; "\ " would put literal backslashes into the path.
    "random_walk": "/data02/profiling/Random walk embeddings",
    # path to text_emb_subsets folder
    "text_emb_subsets": "/data02/profiling/dwd-v3.class_subsets",
}

### Process params / set up variables

In [4]:
# Ensure paths are absolute
work_dir = os.path.abspath(work_dir)
store_dir = os.path.abspath(store_dir)
label_file = os.path.abspath(label_file)

# Create directories. exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(store_dir, exist_ok=True)

# adding some environment variables we'll be using frequently
# ($STORE is passed as --graph-cache to the `kgtk query` shell calls further down)
os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(store_dir)
os.environ['LABELS'] = label_file

# set up embedding file locations
# info for embedding models that we want to use for link prediction.
# Each tuple is (embedding name, checkpoint version used in the .h5 file names,
# relation operator type); every model here uses 100-dimensional embeddings.
lp_embedding_models_info = {
    emb_name: {
        "base_dir": emb_locations[emb_name],
        "model_v_num": checkpoint_version,
        "operator": operator_kind,
        "dim": 100,
    }
    for emb_name, checkpoint_version, operator_kind in (
        ("transe", "v600", "translation"),
        ("profile-transe", "v100", "translation"),
        ("complex", "v600", "complex_diagonal"),
        ("profile-complex", "v100", "complex_diagonal"),
    )
}

# Random-walk embedding files (loaded later with gensim KeyedVectors), keyed by embedding name.
# The dictionary key must be the string 'random_walk'; a bare `random_walk` is an undefined name.
kv_embedding_files = {"H" : f"{emb_locations['random_walk']}/h_embeddings_5x8,min_count=21.kv",
                   "A" : f"{emb_locations['random_walk']}/a_embeddings_10x10,min_count=0.kv",
                   "S" : f"{emb_locations['random_walk']}/s_embeddings_5x10,min_count=0.kv",
                  }

# kv_embedding_files = {"H" : "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/H_walks_analysis/h_embeddings_5x8,min_count=21.kv",
#                    "A" : "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/A_walks_analysis/a_embeddings_10x10,min_count=0.kv",
#                    "S" : "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/S_walks_analysis/s_embeddings_5x10,min_count=0.kv",
#                   }

### Load various things: profile labels, english labels, embeddings

Load profile labels (reusing profiles created by "Generating Explainable Abstractions for Wikidata Entities")

In [5]:
# Read every column as a string and turn missing values into "" so that
# later cells can test for absent fields with `!= ""`.
profile_labels_df = pd.read_csv(profile_labels_file, sep='\t', dtype=str).fillna("")

add english labels to profiles

In [7]:
# Lookup from the labels file: node1 -> node2 (presumably Qnode -> language-tagged English label).
labels_en_df = pd.read_csv(label_file, sep='\t')
labels_en_dict = dict(zip(labels_en_df.node1, labels_en_df.node2))
def remove_lang_tag(label):
    """Drop the first character and the last four characters of `label`.

    Meant for label strings wrapped in one-character quotes and followed by a
    three-character language suffix (presumably of the form 'text'@en --
    confirm against the labels file). No validation is performed.
    """
    trimmed = label[1:-4]
    return trimmed
# Build a human-readable label "<type>, <property>, <value>" for every profile label row.
# Note this code assumes we are only using AILs and RELs
plab_labels = []
for _, row in tqdm(profile_labels_df.iterrows(), total=len(profile_labels_df)):
    type_label = remove_lang_tag(row["type_label"])
    property_label = remove_lang_tag(row["property_label"])
    if row["node2"] != "":
        # Row has a node2 value: use its English label when known, else the raw node2 string.
        if row["node2"] in labels_en_dict:
            value_label = remove_lang_tag(labels_en_dict[row["node2"]])
        else:
            value_label = row["node2"]
    else:
        # No node2 value: describe the row by its "<lower_bound>-<upper_bound>" range instead.
        lb = row["lower_bound"]
        ub = row["upper_bound"]
        value_label = f"{lb}-{ub}"
        si = row["si_units"]
        wd = row["wd_units"]
        # Append a unit: wd_units takes precedence (English label if known), otherwise si_units.
        if wd != "":
            if wd in labels_en_dict:
                wd = remove_lang_tag(labels_en_dict[wd])
            value_label = value_label + f" {wd}"
        elif si != "":
            value_label = value_label + f" {si}"
    plab_labels.append(f"{type_label}, {property_label}, {value_label}")
profile_labels_df["plab_label"] = plab_labels

In [None]:
# Trim profile_labels_df to just columns we need:
# the profile label id, its readable label, and its support value.
profile_labels_df = profile_labels_df.loc[:,["id", "plab_label", "support"]]

set up dictionaries for profile-label to entities and vice-versa to speed things up.

In [15]:
%%time
# Load the entity/profile-label edge table; the next cells group it by its node1 and node2 columns.
entity_prof_labels_df = pd.read_csv(ent_to_profiles_file, sep='\t')

CPU times: user 2min 2s, sys: 23.6 s, total: 2min 25s
Wall time: 2min 27s


In [17]:
# node1 -> list of node2 values, i.e. each entity mapped to the profile label ids attached to it.
%time ent_to_labels_dict = entity_prof_labels_df.groupby('node1')['node2'].apply(list).to_dict()

CPU times: user 10min 39s, sys: 39.7 s, total: 11min 19s
Wall time: 11min 18s


In [19]:
# node2 -> list of node1 values, i.e. each profile label id mapped to the entities that carry it.
%time label_to_ents_dict = entity_prof_labels_df.groupby('node2')['node1'].apply(list).to_dict()

CPU times: user 1min 25s, sys: 7.19 s, total: 1min 32s
Wall time: 1min 32s


In [20]:
# Drop the raw edge table now that both lookup dicts have been built from it.
del entity_prof_labels_df

#### Load embeddings

In [21]:
# Registry filled by the cells below: embedding name -> {entity id: embedding vector}.
embedding_models = {}

embeddings that we have link prediction files for (complex, transe, profile-complex, profile-transe)

In [1123]:
# Load the entity embeddings of every link-prediction model into memory.
for lp_emb_name, model_info_dict in lp_embedding_models_info.items():
    print(lp_emb_name)
    base_dir = model_info_dict["base_dir"]
    model_v_num = model_info_dict["model_v_num"]
    # Context manager so the names file is closed as soon as it has been parsed.
    with open(f"{base_dir}/entity_names_all_0.json") as names_file:
        entity_names_list = json.load(names_file)

    # Load the embeddings
    with h5py.File(f"{base_dir}/model/embeddings_all_0.{model_v_num}.h5", "r") as hf:
        embeddings = hf["embeddings"][...]

    # Row i of the embedding matrix belongs to entity_names_list[i].
    # Fill a local dict first and register it only once complete, so an
    # interrupted load never leaves a partial model in embedding_models.
    entity_vectors = {}
    for i in tqdm(range(len(entity_names_list))):
        entity_vectors[entity_names_list[i]] = embeddings[i]
    embedding_models[lp_emb_name] = entity_vectors

transe


  0%|          | 0/55471746 [00:00<?, ?it/s]

profile-transe


  0%|          | 0/26894849 [00:00<?, ?it/s]

complex


  0%|          | 0/53002670 [00:00<?, ?it/s]

In [1146]:
# Load the entity embeddings of any link-prediction model that is not loaded yet.
for lp_emb_name, model_info_dict in lp_embedding_models_info.items():
    # Skip models that are already registered.
    if lp_emb_name in embedding_models:
        continue
    print(lp_emb_name)
    base_dir = model_info_dict["base_dir"]
    model_v_num = model_info_dict["model_v_num"]
    # Context manager so the names file is closed as soon as it has been parsed.
    with open(f"{base_dir}/entity_names_all_0.json") as names_file:
        entity_names_list = json.load(names_file)

    # Load the embeddings
    with h5py.File(f"{base_dir}/model/embeddings_all_0.{model_v_num}.h5", "r") as hf:
        embeddings = hf["embeddings"][...]

    # Row i of the embedding matrix belongs to entity_names_list[i].
    # Fill a local dict first and register it only once complete; otherwise an
    # interrupted load would leave a partial model that the guard above skips.
    entity_vectors = {}
    for i in tqdm(range(len(entity_names_list))):
        entity_vectors[entity_names_list[i]] = embeddings[i]
    embedding_models[lp_emb_name] = entity_vectors

profile-complex


  0%|          | 0/26894849 [00:00<?, ?it/s]

PCA text embeddings

In [24]:
pca_text_emb_file = "/data02/profiling/dwd-v3.text-embeddings.PCA100/faiss_index/kgtk_text_embeddings_all.PCA100.tsv"
text_emb_df = pd.read_csv(pca_text_emb_file, sep='\t')
text_emb_dict = {}
# Iterate only the two columns we need (far cheaper than DataFrame.iterrows) and
# give tqdm the row count so the progress bar shows real progress.
for ent, vector_text in tqdm(zip(text_emb_df["node1"], text_emb_df["node2"]), total=len(text_emb_df)):
    # node2 stores the vector as comma-separated numbers
    embed = np.float32(vector_text.split(','))
    text_emb_dict[ent] = embed

0it [00:00, ?it/s]

In [25]:
# Register the PCA-reduced text embeddings under the name "pca100_text".
embedding_models["pca100_text"] = text_emb_dict

Original text embeddings (subset)

In [26]:
# Reuse the folder configured in emb_locations instead of repeating the path here,
# so the location only has to be changed in the configuration cell.
orig_embed_subsets_dir = emb_locations["text_emb_subsets"]

In [27]:
orig_embed_dict = {}
# Every file in the subsets folder is a TSV with the entity in node1 and the
# embedding (comma-separated numbers) in node2.
for filename in glob.glob(f"{orig_embed_subsets_dir}/*"):
    print(f"loading from file {filename}")
    embedding_df = pd.read_csv(filename, sep='\t')
    # Iterate only the two needed columns (cheaper than iterrows) and pass the
    # row count so the progress bar is meaningful.
    for ent, vector_text in tqdm(zip(embedding_df["node1"], embedding_df["node2"]), total=len(embedding_df)):
        embed = np.float32(vector_text.split(','))
        # An entity appearing in several files keeps the vector read last.
        orig_embed_dict[ent] = embed

loading from file /data02/profiling/dwd-v3.class_subsets/Q5.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q3624078.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q532.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q23442.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q783794.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q3305213.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q11424.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q7725634.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q571.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q47461344.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q4830453.tsv.gz


0it [00:00, ?it/s]

In [28]:
# Register the original (non-PCA) text embeddings under the name "text1024".
embedding_models["text1024"] = orig_embed_dict

random walk embeddings

In [1141]:
%%time
for name, file_path in tqdm(kv_embedding_files.items()):
    print("now loading {} embeddings".format(name))
    emb_dict = {}
    kv_model = KeyedVectors.load(file_path)
    for key, index in tqdm(kv_model.key_to_index.items()):
        emb_dict[key] = kv_model.vectors[index]
    embedding_models[name] = emb_dict

  0%|          | 0/3 [00:00<?, ?it/s]

now loading H embeddings


  0%|          | 0/19593942 [00:00<?, ?it/s]

now loading A embeddings


  0%|          | 0/12106870 [00:00<?, ?it/s]

now loading S embeddings


  0%|          | 0/39030788 [00:00<?, ?it/s]

CPU times: user 4min 14s, sys: 1min 31s, total: 5min 46s
Wall time: 6min 30s


## Methods for computing measure of surprise

In [1142]:
def get_entity_profile_labels_set(ent, ent_to_labels_dict=None):
    """Return the set of profile label ids attached to entity `ent`.

    Args:
        ent: entity id to look up.
        ent_to_labels_dict: optional in-memory mapping entity -> iterable of
            profile label ids. Used when given and when it contains `ent`.

    Returns:
        A set of profile label ids.
    """
    # Fast path: answer from the in-memory dictionary when possible.
    if ent_to_labels_dict is not None and ent in ent_to_labels_dict:
        return set(ent_to_labels_dict[ent])
    # Fallback: query the entity/profile-label file with kgtk (IPython shell call),
    # using the graph cache at $STORE. `res` holds the captured command output.
    res = !kgtk query -i {ent_to_profiles_file} --graph-cache $STORE \
          --match 'profile_labels: (ent)-[]->(profile_label_id)' \
          --return 'distinct profile_label_id' \
          --where 'ent = "{ent}"'
    # NOTE(review): assumes the returned column is named "node2" -- confirm against kgtk output.
    return set(kgtk_to_dataframe(res).loc[:,"node2"])
    
def get_entity_profile_labels_df(profile_labels_df, ent, ent_to_labels_dict=None):
    """Return the rows of `profile_labels_df` whose "id" is one of `ent`'s profile labels.

    The label ids come from get_entity_profile_labels_set(ent, ent_to_labels_dict).
    """
    ent_label_ids = get_entity_profile_labels_set(ent, ent_to_labels_dict)
    belongs_to_ent = profile_labels_df.loc[:, "id"].isin(ent_label_ids)
    return profile_labels_df.loc[belongs_to_ent, :]

def get_entities_with_profile_label(label_id, ent_to_omit=None, limit=1000, label_to_ents_dict=None):
    """Return a set of at most `limit` entities that carry profile label `label_id`.

    Args:
        label_id: profile label id to look up.
        ent_to_omit: entity to leave out of the result (optional).
        limit: maximum number of entities returned.
        label_to_ents_dict: optional in-memory mapping label id -> list of
            entities. Used when given and when it contains `label_id`.

    Returns:
        A set of entity ids. On the in-memory path this is a random sample
        drawn without replacement via np.random.choice.
    """
    if label_to_ents_dict is not None and label_id in label_to_ents_dict:
        # Copy the list so removing ent_to_omit does not mutate the shared dictionary.
        ents = list(label_to_ents_dict[label_id])
        if ent_to_omit is not None and ent_to_omit in ents:
            # list.remove drops only the first occurrence.
            ents.remove(ent_to_omit)
        return set(np.random.choice(ents, min(len(ents),limit), replace=False))
    # Fallback: kgtk query (IPython shell call) using the graph cache at $STORE.
    # NOTE(review): with ent_to_omit=None the interpolated filter presumably compares
    # against the string "None" -- confirm this is harmless.
    res = !kgtk query -i {ent_to_profiles_file} --graph-cache $STORE \
        --match 'profile_labels: (ent)-[]->(profile_label_id)' \
        --return 'distinct ent' \
        --where 'ent != "{ent_to_omit}" AND profile_label_id = "{label_id}"' \
        --limit {limit}
    # NOTE(review): assumes the returned column is named "node1" -- confirm against kgtk output.
    return set(kgtk_to_dataframe(res).loc[:,"node1"])

def get_ents_of_type(ent_to_omit, ent_type, limit=1000):
    """Return a set of at most `limit` entities, other than `ent_to_omit`, matched by `ent_type`.

    Always runs a kgtk query (IPython shell call) against ent_to_profiles_file
    with the graph cache at $STORE; there is no in-memory path.
    """
    # The printf("%.<n>s", ...) filter appears to keep the first len(ent_type) characters
    # of the profile label id, i.e. it selects labels whose id starts with ent_type
    # -- TODO confirm against kypher's printf semantics.
    res = !kgtk query -i {ent_to_profiles_file} --graph-cache $STORE \
        --match 'profile_labels: (ent)-[]->(profile_label_id)' \
        --return 'distinct ent' \
        --where 'ent != "{ent_to_omit}" AND printf("%.{len(ent_type)}s", profile_label_id) = "{ent_type}"' \
        --limit {limit}
    # NOTE(review): assumes the returned column is named "node1" -- confirm against kgtk output.
    return set(kgtk_to_dataframe(res).loc[:,"node1"])

def compute_surprise_metrics_for_sample(ent, sample_ents, embedding_dict, pairwise_disp_args,
                                        pairwise_sample=10000):
    """Compute cosine-distance surprise metrics of `ent` relative to a sample of entities.

    Args:
        ent: entity of interest; must be a key of `embedding_dict`.
        sample_ents: iterable of entities; those missing from `embedding_dict` are ignored.
        embedding_dict: mapping entity -> embedding vector.
        pairwise_disp_args: dict with keys "fact_ids", "class" and "emb_name",
            forwarded to compute_avg_pairwise_dist_in_sample.
        pairwise_sample: number of random pairs used for the pairwise dispersion.

    Returns:
        dict mapping metric name -> value (distances, dispersions and their ratios).
    """
    # Embeddings of the sample members we have vectors for, and of the entity itself.
    sample_vecs = np.array([embedding_dict[name] for name in sample_ents if name in embedding_dict])
    ent_vec = embedding_dict[ent]

    # Centroid-based measures: spread of the sample around its mean vector,
    # and how far the entity lies from that mean.
    centroid = np.mean(sample_vecs, axis=0)
    centroid_dispersion = np.mean(np.array([distance.cosine(centroid, vec) for vec in sample_vecs]))
    centroid_distance = distance.cosine(centroid, ent_vec)

    # Average distance from the entity to each sample member.
    pairwise_distance = np.mean(np.array([distance.cosine(ent_vec, vec) for vec in sample_vecs]))

    # Average pairwise distance within a sample; computed (and cached) by the helper,
    # which draws its own sample from the arguments below.
    pairwise_dispersion = compute_avg_pairwise_dist_in_sample(
        pairwise_disp_args["fact_ids"],
        pairwise_disp_args["class"],
        pairwise_disp_args["emb_name"],
        pairwise_sample,
    )

    return {
        "dispersion (centroid)": centroid_dispersion,
        "distance (centroid)": centroid_distance,
        "distance(centroid)/dispersion(centroid)": centroid_distance / centroid_dispersion,
        "distance (avg pairwise)": pairwise_distance,
        "distance(avg pairwise)/dispersion(centroid)": pairwise_distance / centroid_dispersion,
        "dispersion (avg pairwise)": pairwise_dispersion,
        "distance(avg pairwise)/dispersion(avg pairwise)": pairwise_distance / pairwise_dispersion,
    }

@lru_cache(maxsize=None)
def compute_avg_pairwise_dist_in_sample(fact_ids, ent_class, emb_name, pairwise_sample=10000):
    """Average cosine distance between randomly drawn pairs of sample embeddings.

    Exactly one of `fact_ids` / `ent_class` must be given:
        * fact_ids: "|"-separated profile label ids; the sample is the union of
          the entities drawn for each id.
        * ent_class: the sample is drawn with get_ents_of_type.

    Results are memoised on the argument values, so the random sample is drawn
    only once per distinct argument combination.

    Assumptions:
        * label_to_ents_dict exists and is accessible here
        * embedding_models exists and is accessible here
    """
    # Draw the sample of entities.
    if fact_ids is not None:
        assert ent_class is None, "One of fact_ids or ent_class should be None."
        sample_ents = set()
        for fact_id in fact_ids.split("|"):
            sample_ents = sample_ents | get_entities_with_profile_label(fact_id, "", pairwise_sample, label_to_ents_dict)
    elif ent_class is not None:
        sample_ents = get_ents_of_type("", ent_class, pairwise_sample)
    else:
        assert False, "Both fact_ids and ent_class are None"

    # Embeddings of the sample members known to the chosen embedding model.
    vectors_by_ent = embedding_models[emb_name]
    sample_vecs = np.array([vectors_by_ent[name] for name in sample_ents if name in vectors_by_ent])
    n_vecs = sample_vecs.shape[0]

    # Monte-Carlo estimate: `pairwise_sample` random pairs of distinct rows.
    pair_dists = []
    for _ in range(pairwise_sample):
        first_idx, second_idx = np.random.choice(n_vecs, size=2, replace=False)
        pair_dists.append(distance.cosine(sample_vecs[first_idx, :], sample_vecs[second_idx, :]))
    return np.mean(pair_dists)

def compute_avg_dist_from_ent_to_sample(ent, sample_ents, embedding_dict):
    """Mean cosine distance from `ent` to every sample entity that has an embedding.

    Args:
        ent: entity of interest; must be a key of `embedding_dict`.
        sample_ents: iterable of entities; those missing from `embedding_dict` are ignored.
        embedding_dict: mapping entity -> embedding vector.
    """
    target_vec = embedding_dict[ent]
    known_vecs = np.array([embedding_dict[name] for name in sample_ents if name in embedding_dict])
    cosine_dists = np.array([distance.cosine(target_vec, vec) for vec in known_vecs])
    return np.mean(cosine_dists)

def compute_surprise_metrics_for_ent_fact(ent, fact_ids, emb_name, embedding_dict, label_to_ents_dict=None,
                                          sample=10000, pairwise_sample=10000):
    """Surprise metrics for `ent` with respect to one fact (given as one or more profile label ids).

    The comparison sample is the union, over all `fact_ids`, of the entities
    (other than `ent`) drawn for each id.

    Returns:
        The metrics dict produced by compute_surprise_metrics_for_sample.
    """
    peer_ents = set()
    for fact_id in fact_ids:
        peer_ents = peer_ents | get_entities_with_profile_label(fact_id, ent, sample, label_to_ents_dict)

    # These values identify the sample for the pairwise-dispersion computation,
    # whose result is cached on them.
    dispersion_key = {
        "fact_ids": "|".join(fact_ids),
        "class": None,
        "emb_name": emb_name,
    }
    return compute_surprise_metrics_for_sample(ent, peer_ents, embedding_dict, dispersion_key, pairwise_sample)

def compute_surprise_metrics_for_df(ent, facts_df, embedding_models, label_to_ents_dict=None, sample=10000, pairwise_sample=10000):
    """Add one column per (metric, embedding model) to `facts_df` and return it.

    For every profile label id in facts_df["id"], a sample of other entities
    sharing that label is drawn and the surprise metrics of `ent` are computed
    with each model in `embedding_models`. Columns are named
    "<metric> - <model name>". `facts_df` is modified in place.
    """
    column_values = {}
    for label_id in tqdm(facts_df.loc[:, "id"]):
        peer_ents = get_entities_with_profile_label(label_id, ent, sample, label_to_ents_dict)
        for name, embedding_dict in embedding_models.items():
            # Identifies the sample for the cached pairwise-dispersion computation.
            dispersion_key = {"fact_ids": label_id, "class": None, "emb_name": name}
            label_metrics = compute_surprise_metrics_for_sample(ent, peer_ents, embedding_dict, dispersion_key, pairwise_sample)
            for metric, value in label_metrics.items():
                column_values.setdefault(f"{metric} - {name}", []).append(value)

    for column, values in column_values.items():
        facts_df.loc[:, column] = values

    return facts_df

def compute_surprise_metrics_sampling_by_type(ent, ent_type, embedding_models,
                                              sample=10000, pairwise_sample=10000):
    """Surprise metrics for `ent` against a sample of other entities of type `ent_type`.

    Returns:
        dict mapping embedding model name -> metrics dict from
        compute_surprise_metrics_for_sample.
    """
    same_type_ents = get_ents_of_type(ent, ent_type, sample)
    return {
        name: compute_surprise_metrics_for_sample(
            ent,
            same_type_ents,
            embedding_dict,
            # Identifies the sample for the cached pairwise-dispersion computation.
            {"fact_ids": None, "class": ent_type, "emb_name": name},
            pairwise_sample,
        )
        for name, embedding_dict in embedding_models.items()
    }

# Baselines for measuring surprise
def get_surprise_scores_random(fact_ids):
    """Random baseline: one uniform score in [0, 1) per entry of `fact_ids`."""
    n_scores = len(fact_ids)
    return np.random.rand(n_scores)

def get_surprise_scores_freq(fact_ids, profile_labels_df):
    """Frequency baseline: surprise = 1 - (summed support of the answer's facts).

    Args:
        fact_ids: list with one entry per answer; each entry is a list of
            profile label ids (an answer may map to several ids).
        profile_labels_df: DataFrame with an "id" column and a "support"
            column whose values are convertible to float.

    Returns:
        list of floats, one score per entry of `fact_ids`.

    Raises:
        ValueError: if a fact id does not match exactly one row of
            `profile_labels_df`.
    """
    # Hoisted out of the loops: the id column does not change between lookups.
    label_ids = profile_labels_df["id"].values
    scores = []
    for f_ids in fact_ids:
        # handle multiple fact ids (see "canvas" answer in mc quiz)
        freq = 0
        for f_id in f_ids:
            support = profile_labels_df.loc[label_ids == f_id, "support"]
            # float(Series) is deprecated and fails opaquely when the id is
            # missing or duplicated, so check the match count and take the scalar.
            if len(support) != 1:
                raise ValueError(
                    f"expected exactly one profile label row with id {f_id!r}, found {len(support)}"
                )
            freq += float(support.iloc[0])
        scores.append(1-freq)
    return scores

# updating and displaying avg question correlation
def update_corr_measures(questions):
    """Store per-question rank correlations between ground truth and each method's scores.

    For every question and every method found in the first answer's
    "method_surprise_scores", writes Spearman's rho to
    question["method_spearman"][method] and Kendall's tau to
    question["method_kendalltau"][method]. Questions are updated in place.
    """
    for question in questions:
        answers = question["answers"]
        gt_scores = [answer["gt_surprise"] for answer in answers]
        for method_name in answers[0]["method_surprise_scores"]:
            predicted = [answer["method_surprise_scores"][method_name] for answer in answers]
            spearman_rho, _spearman_p = spearmanr(gt_scores, predicted)
            kendall_tau, _kendall_p = kendalltau(gt_scores, predicted)
            question["method_spearman"][method_name] = spearman_rho
            question["method_kendalltau"][method_name] = kendall_tau
            
def display_avg_question_corr(question_subsets, method_names):
    """Display a table of average per-question correlations.

    Args:
        question_subsets: mapping subset name -> list of questions; every
            question carries "method_spearman" and "method_kendalltau" dicts.
        method_names: iterable of method names, one table row each.

    For each subset the table has a "Rho (<name>)" and a "Tau (<name>)" column.
    A cell pair shows "-" when some question in the subset has no score for
    the method; otherwise the mean formatted to three decimals.
    """
    header = [""]
    for subset_name in question_subsets:
        header += [f"Rho ({subset_name})", f"Tau ({subset_name})"]

    rows = []
    for method_name in method_names:
        row = [method_name]
        for subset in question_subsets.values():
            rhos, taus = [], []
            for question in subset:
                if method_name not in question["method_spearman"]:
                    # Method cannot be evaluated on the whole subset: no average.
                    row += ["-", "-"]
                    break
                rhos.append(question["method_spearman"][method_name])
                taus.append(question["method_kendalltau"][method_name])
            else:
                # Loop finished without a break: every question has a score.
                row += [f"{np.mean(rhos):.3f}", f"{np.mean(taus):.3f}"]
        rows.append(row)

    display(pd.DataFrame(rows, columns=header))

## Load MC quiz data

In [975]:
# Load the benchmark: a list of question dicts; the cells below read keys such as
# "entity", "class", "property", "wd_units" and "answers" from each of them.
with open(surprise_data_file, 'r') as f:
    questions = json.load(f)

Add profile label ids for each answer and an empty dict for storing computed method surprise scores

In [976]:
all_profile_labels = set(profile_labels_df.loc[:,"id"])
# reconstructing and validating profile labels for each question / answer
for i, q in enumerate(questions):
    for answer in q["answers"]:
        ans_fact_ids = []
        for qnode in answer["qnodes"]:
            # Profile label id format: "<class>_<property>_<qnode>", with
            # "__<wd_units>" appended when the question specifies units.
            fact_id = "{}_{}_{}".format(q["class"], q["property"], qnode)
            if q["wd_units"] is not None:
                fact_id += "__" + q["wd_units"]
            # Fail fast if the reconstructed id is not among the loaded profile labels.
            assert fact_id in all_profile_labels, f"{fact_id} not found in loaded profile labels\n" +\
                f"Question {i}: \'{q['lexicalized']}\'"
            ans_fact_ids.append(fact_id)
        answer["fact_ids"] = ans_fact_ids
        # also initialize dict for storing computed surprise scores
        answer["method_surprise_scores"] = {}
    # per-question correlation results, keyed by method name
    q["method_spearman"] = {}
    q["method_kendalltau"] = {}

## Correlation of facts within single question

### Baselines

In [998]:
# Number of random rankings averaged per question for the "random" baseline.
num_trials = 500

for q in tqdm(questions):
    
    ans_surprise_gts = [ans["gt_surprise"] for ans in q["answers"]]
    fact_ids = [ans["fact_ids"] for ans in q["answers"]]
    
    # random
    # Collect the correlation of each random trial, then replace the lists by their means.
    q["method_spearman"]["random"] = []
    q["method_kendalltau"]["random"] = []
    for i in range(num_trials):
        random_preds = get_surprise_scores_random(fact_ids)
        r, r_pval = spearmanr(ans_surprise_gts, random_preds)
        t, t_pval = kendalltau(ans_surprise_gts, random_preds)
        q["method_spearman"]["random"].append(r)
        q["method_kendalltau"]["random"].append(t)
    q["method_spearman"]["random"] = np.mean(q["method_spearman"]["random"])
    q["method_kendalltau"]["random"] = np.mean(q["method_kendalltau"]["random"])
    
    # freq
    freq_preds = get_surprise_scores_freq(fact_ids, profile_labels_df)
    # if freqs are all the same, fall back to random.
    # (the random baseline's averaged correlations are reused for this question)
    if all(freq_preds[0] == np.array(freq_preds)):
        q["method_spearman"]["frequency"] = q["method_spearman"]["random"]
        q["method_kendalltau"]["frequency"] = q["method_kendalltau"]["random"]
    else:
        r, r_pval = spearmanr(ans_surprise_gts, freq_preds)
        t, t_pval = kendalltau(ans_surprise_gts, freq_preds)
        q["method_spearman"]["frequency"] = r
        q["method_kendalltau"]["frequency"] = t
# Summary table: mean per-question correlation of each baseline over all questions.
header = ["", "Spearman", "KT"]
rows = []
for method in ["random", "frequency"]:
    r = np.mean([q["method_spearman"][method] for q in questions])
    t = np.mean([q["method_kendalltau"][method] for q in questions])
    rows.append([method,
                 f"{r:.3f}",
                 f"{t:.3f}",
                ])
df = pd.DataFrame(rows, columns=header)
display(df)

  0%|          | 0/24 [00:00<?, ?it/s]

Unnamed: 0,Unnamed: 1,Spearman,KT
0,random,0.003,0.003
1,frequency,0.066,0.074


Now splitting by different question subsets

In [1004]:
# Split the questions two ways, using each question's boolean flags:
# by "is_numeric_answer" (qnode vs numeric) and by "is_single_answer" (single vs multi).
qnode_questions = [q for q in questions if not q["is_numeric_answer"]]
numeric_questions = [q for q in questions if q["is_numeric_answer"]]
single_ans_questions = [q for q in questions if q["is_single_answer"]]
multi_ans_questions = [q for q in questions if not q["is_single_answer"]]

In [1079]:
# Named question subsets; the insertion order here determines the column order of the result tables.
question_subsets = {"qnode": qnode_questions,
                    "num": numeric_questions,
                    "all": questions,
                    "single": single_ans_questions,
                    "multi": multi_ans_questions,
                   }
# Table header: an empty first column (method name) plus a Rho and a Tau column per subset.
header = [""]
for name in question_subsets:
    header.append(f"Rho ({name})")
    header.append(f"Tau ({name})")

In [1077]:
# Mean per-question correlation of each baseline, broken down by question subset.
rows = []
for method in ["random", "frequency"]:
    row = [method]
    for question_subset in question_subsets.values():
        r = np.mean([q["method_spearman"][method] for q in question_subset])
        t = np.mean([q["method_kendalltau"][method] for q in question_subset])
        row.append(f"{r:.3f}")
        row.append(f"{t:.3f}")
    rows.append(row)
df = pd.DataFrame(rows, columns=header)
display(df)

Unnamed: 0,Unnamed: 1,Rho (qnode),Tau (qnode),Rho (num),Tau (num),Rho (single),Tau (single),Rho (multi),Tau (multi),Rho (all),Tau (all)
0,random,-0.003,-0.002,0.024,0.019,0.023,0.019,-0.005,-0.003,0.003,0.003
1,frequency,0.043,0.055,0.134,0.129,0.108,0.095,0.049,0.065,0.066,0.074


### Statistical methods

In [1147]:
# Metrics used as surprise scores, with the short names shown in the result tables
# (distance measure / dispersion measure; "ap" = avg pairwise, "c" = centroid).
surprise_metric_abbrevs = {'distance(avg pairwise)/dispersion(avg pairwise)': "ap/ap",
                           'distance(centroid)/dispersion(centroid)': "c/c",
                           'distance(avg pairwise)/dispersion(centroid)': "ap/c"
                          }

# One iteration per (question, answer) pair; scores are stored on the answer
# under the method name "<embedding name>, <metric abbreviation>".
for q, answer in tqdm([(q, answer) for q in questions for answer in q["answers"]]):
    ent = q["entity"]
    fact_ids = answer["fact_ids"]
    for emb_name, embedding_dict in embedding_models.items():
        # the following if block will avoid recomputing already-computed methods
        if f"{emb_name}, ap/ap" in answer["method_surprise_scores"]:
            continue
        metrics_dict = compute_surprise_metrics_for_ent_fact(ent, fact_ids, emb_name, embedding_dict, label_to_ents_dict)
        for metric_name, abbrev in surprise_metric_abbrevs.items():
            method_name = f"{emb_name}, {abbrev}"
            answer["method_surprise_scores"][method_name] = metrics_dict[metric_name]

  0%|          | 0/118 [00:00<?, ?it/s]

Compute per-question correlation and view avg correlation in question

In [1148]:
# Recompute the per-question correlations for every stored method, then show their averages.
# The method names are taken from the keys of the first question's "method_spearman" dict.
update_corr_measures(questions)
display_avg_question_corr(question_subsets, questions[0]["method_spearman"])      

Unnamed: 0,Unnamed: 1,Rho (qnode),Tau (qnode),Rho (num),Tau (num),Rho (all),Tau (all),Rho (single),Tau (single),Rho (multi),Tau (multi)
0,random,-0.003,-0.002,0.024,0.019,0.003,0.003,0.023,0.019,-0.005,-0.003
1,frequency,0.043,0.055,0.134,0.129,0.066,0.074,0.108,0.095,0.049,0.065
2,"complex, ap/ap",0.54,0.446,0.415,0.364,0.508,0.425,0.473,0.418,0.523,0.428
3,"complex, c/c",0.556,0.468,0.455,0.382,0.531,0.447,0.507,0.433,0.541,0.452
4,"complex, ap/c",0.551,0.457,0.355,0.271,0.502,0.410,0.422,0.338,0.535,0.44
5,"pca100_text, ap/ap",0.476,0.411,0.424,0.340,0.463,0.393,0.429,0.337,0.476,0.416
6,"pca100_text, c/c",0.49,0.411,0.424,0.340,0.473,0.393,0.429,0.337,0.492,0.416
7,"pca100_text, ap/c",0.431,0.361,0.297,0.247,0.397,0.333,0.321,0.257,0.429,0.364
8,"text1024, ap/ap",0.552,0.48,0.449,0.375,0.526,0.454,0.510,0.427,0.533,0.465
9,"text1024, c/c",0.574,0.502,0.490,0.394,0.553,0.475,0.544,0.443,0.557,0.488


## Link prediction method

In [1212]:
def load_lp_embedding_model(base_dir, model_v_num, operator, dim):
    """Load the relation operators and name indexes of a link-prediction embedding model.

    Args:
        base_dir: model folder containing dynamic_rel_names.json,
            entity_names_all_0.json and model/model.<model_v_num>.h5.
        model_v_num: checkpoint version string used in the .h5 file name.
        operator: "complex_diagonal" or "translation".
        dim: embedding dimension passed to the operator constructor.

    Returns:
        A list [operator_lhs, operator_rhs, entity_to_index, rel_index], where
        the two operators have their trained state loaded, entity_to_index maps
        entity name -> position in entity_names_all_0.json, and rel_index maps
        relation name -> position in dynamic_rel_names.json.

    Raises:
        AssertionError: if `operator` is not one of the supported values.
    """
    # Context managers so both JSON files are closed right after parsing.
    with open(f"{base_dir}/dynamic_rel_names.json") as rel_names_file:
        relation_names_list = json.load(rel_names_file)
    with open(f"{base_dir}/entity_names_all_0.json") as entity_names_file:
        entity_names_list = json.load(entity_names_file)

    entity_to_index = {entity: i for i, entity in enumerate(entity_names_list)}
    rel_index = {rel: i for i, rel in enumerate(relation_names_list)}

    prop_count = len(relation_names_list)

    # Operator class and the names of its state tensors inside the checkpoint.
    if operator == "complex_diagonal":
        operator_cls, state_keys = ComplexDiagonalDynamicOperator, ("real", "imag")
    elif operator == "translation":
        operator_cls, state_keys = TranslationDynamicOperator, ("translations",)
    else:
        # Raised explicitly (rather than `assert False`) so the check also runs under `python -O`.
        raise AssertionError(f"unsupported operator: {operator!r}")

    operator_lhs = operator_cls(dim, prop_count)
    operator_rhs = operator_cls(dim, prop_count)

    with h5py.File(f"{base_dir}/model/model.{model_v_num}.h5", "r") as hf:

        def read_operator_state(side):
            # Read the operator's trained parameters for one side ("lhs" or "rhs").
            return {
                key: torch.from_numpy(hf[f"model/relations/0/operator/{side}/{key}"][...])
                for key in state_keys
            }

        operator_state_dict_lhs = read_operator_state("lhs")
        operator_state_dict_rhs = read_operator_state("rhs")

    operator_lhs.load_state_dict(operator_state_dict_lhs)
    operator_rhs.load_state_dict(operator_state_dict_rhs)

    return [operator_lhs, operator_rhs, entity_to_index, rel_index]

In [1232]:
# %%time
def get_lp_scores(src_ent, dest_ents, edge, entity_to_index, rel_index,
                  base_dir, model_v_num, dim, comparator, operator, is_lhs):
    """
    Score each candidate answer for the fact (src_ent, edge, answer) with a link-prediction model.
        src_ent: source entity id; must be a key of entity_to_index
        dest_ents: list of answers, each answer being a list of synonym entity ids
        edge: relation id; must be a key of rel_index
        entity_to_index: dict mapping entity id -> row offset in the embeddings file
        rel_index: dict mapping relation id -> index passed to the operator
        base_dir, model_v_num: locate "{base_dir}/model/embeddings_all_0.{model_v_num}.h5"
        dim: embedding dimension used to reshape tensors
        comparator: object offering prepare() and a 4-argument call
                    (lhs, rhs, lhs negatives, rhs negatives) that returns a 3-tuple
                    whose first element holds the positive scores
        operator: relation operator, called as operator(embeddings, relation_indices)
        is_lhs: if True the operator is applied to the source embedding,
                otherwise it is applied to the destination (synonym) embeddings
        return: list with one score per answer (mean of the scores of its synonyms)
    """
    src_offset = entity_to_index[src_ent]
    dest_offsets = []
    for synonyms in dest_ents:
        dest_offsets.append([entity_to_index[e] for e in synonyms])

    # Read only the needed rows from the embeddings file
    embeddings_path = f"{base_dir}/model/embeddings_all_0.{model_v_num}.h5"
    with h5py.File(embeddings_path, "r") as hf:
        src_embedding = torch.from_numpy(hf["embeddings"][src_offset, :])
        dest_embeddings = []
        for synonym_offsets in dest_offsets:
            synonym_rows = [torch.from_numpy(hf["embeddings"][syn_offset, :])
                            for syn_offset in synonym_offsets]
            dest_embeddings.append(torch.stack(synonym_rows))

    # One score per answer: average over the answer's synonym entities
    scores = []
    for syn_embeddings in dest_embeddings:
        n_syn = len(syn_embeddings)
        if is_lhs:
            # transform the source once, then broadcast it against every synonym
            transformed_src = operator(src_embedding, torch.tensor(rel_index[edge]))
            lhs_pos = comparator.prepare(transformed_src.expand(1, n_syn, dim))
            rhs_pos = comparator.prepare(syn_embeddings.view(1, n_syn, dim))
        else:
            # keep the source as is and transform every synonym instead
            lhs_pos = comparator.prepare(src_embedding.view(1, 1, dim)).expand(1, n_syn, dim)
            rel_idxs = torch.tensor([rel_index[edge]]).expand(n_syn)
            transformed_syns = operator(syn_embeddings, rel_idxs)
            rhs_pos = comparator.prepare(transformed_syns.view(1, n_syn, dim))
        syn_scores, _, _ = comparator(
            lhs_pos,
            rhs_pos,
            torch.empty(1, 0, dim), # Left-hand side negatives, not needed
            torch.empty(1, 0, dim), # Right-hand side negatives, not needed
        )
        scores.append(np.mean(syn_scores.detach().numpy()))
    return scores

# Quick check on the first question: print 1 - score for every answer
q = questions[0]
src_ent = q["entity"]
dest_ents = [ans["qnodes"] for ans in q["answers"]]
edge = q["property"]
scores = get_lp_scores(
    src_ent, dest_ents, edge, entity_to_index, rel_index,
    base_dir, model_v_num, dim, comparator, operator_lhs, is_lhs=True,
)
for answer, score in zip(q["answers"], scores):
    print(f'{answer["lexicalized"]}: {1-score}')
    


English: -2.0973997116088867
German: -3.60689640045166
Russian: -11.97774600982666
Swedish: 0.3592963218688965
Spanish: 7.68614387512207


In [1250]:
comparators = {"dot": DotComparator(), "cos": CosComparator(), "l2": L2Comparator()}

for lp_emb_name, model_info_dict in lp_embedding_models_info.items():
    # un-comment the following guard to avoid recomputing already-computed methods
#     if f"LP-{lp_emb_name}" in questions[0]["answers"][0]["method_surprise_scores"]:
#         continue

    print(lp_emb_name)

    # load lp embedding model
    base_dir = model_info_dict["base_dir"]
    model_v_num = model_info_dict["model_v_num"]
    operator = model_info_dict["operator"]
    dim = model_info_dict["dim"]
    operator_lhs, operator_rhs, entity_to_index, rel_index = load_lp_embedding_model(
        base_dir, model_v_num, operator, dim
    )

    # models whose name contains "profile" use "<property>_profile" relations and fact ids as targets
    is_profile_model = "profile" in lp_emb_name
    edge_suffix = "_profile" if is_profile_model else ""
    value_key = "fact_ids" if is_profile_model else "qnodes"

    for q in questions:
        src_ent = q["entity"]
        edge = q["property"] + edge_suffix
        if edge not in rel_index:
            print(f'\t{edge} not in embeddings. Skipping question: {q["lexicalized"]}')
            continue
        dest_ents = [ans[value_key] for ans in q["answers"]]

        # get LP scores for each comparator and lhs/rhs
        for comparator_name, comparator in comparators.items():
            lhs_scores = get_lp_scores(src_ent, dest_ents, edge, entity_to_index, rel_index,
                                       base_dir, model_v_num, dim, comparator, operator_lhs, is_lhs=True)
            rhs_scores = get_lp_scores(src_ent, dest_ents, edge, entity_to_index, rel_index,
                                       base_dir, model_v_num, dim, comparator, operator_rhs, is_lhs=False)
            # save method scores for each answer (surprise = 1 - LP score)
            lhs_key = f"LP-{lp_emb_name}-lhs-{comparator_name}"
            rhs_key = f"LP-{lp_emb_name}-rhs-{comparator_name}"
            for answer, lhs_score, rhs_score in zip(q["answers"], lhs_scores, rhs_scores):
                answer["method_surprise_scores"][lhs_key] = 1 - lhs_score
                answer["method_surprise_scores"][rhs_key] = 1 - rhs_score

transe
	P1971 not in embeddings. Skipping question: How many children does Arnold Schwarzenegger have?
	P2067 not in embeddings. Skipping question: What is Donald Trump's mass in pounds circa. 2019?
	P2250 not in embeddings. Skipping question: What is the life expectancy in years of Australia circa. 2016?
	P3001 not in embeddings. Skipping question: What is the retirement age in Colombia? The answer for either men or women will be accepted.
	P2927 not in embeddings. Skipping question: What percentage of the territory of Canada inside its coast line and international boundaries is water?
	P2139 not in embeddings. Skipping question: What was the total revenue in euros of the business "Adidas" circa. 2014?
profile-transe
complex
	P1971 not in embeddings. Skipping question: How many children does Arnold Schwarzenegger have?
	P2067 not in embeddings. Skipping question: What is Donald Trump's mass in pounds circa. 2019?
	P2250 not in embeddings. Skipping question: What is the life expectancy

## view avg correlation in question

In [1252]:
# Presumably recomputes the per-question correlation measures so that the scores
# added above are included (helper is defined earlier in the notebook - confirm there),
# then shows the average Rho/Tau table for each question subset.
update_corr_measures(questions)
display_avg_question_corr(question_subsets, questions[0]["method_spearman"])

Unnamed: 0,Unnamed: 1,Rho (qnode),Tau (qnode),Rho (num),Tau (num),Rho (all),Tau (all),Rho (single),Tau (single),Rho (multi),Tau (multi)
0,random,-0.003,-0.002,0.024,0.019,0.003,0.003,0.023,0.019,-0.005,-0.003
1,frequency,0.043,0.055,0.134,0.129,0.066,0.074,0.108,0.095,0.049,0.065
2,"complex, ap/ap",0.54,0.446,0.415,0.364,0.508,0.425,0.473,0.418,0.523,0.428
3,"complex, c/c",0.556,0.468,0.455,0.382,0.531,0.447,0.507,0.433,0.541,0.452
4,"complex, ap/c",0.551,0.457,0.355,0.271,0.502,0.410,0.422,0.338,0.535,0.44
5,"pca100_text, ap/ap",0.476,0.411,0.424,0.340,0.463,0.393,0.429,0.337,0.476,0.416
6,"pca100_text, c/c",0.49,0.411,0.424,0.340,0.473,0.393,0.429,0.337,0.492,0.416
7,"pca100_text, ap/c",0.431,0.361,0.297,0.247,0.397,0.333,0.321,0.257,0.429,0.364
8,"text1024, ap/ap",0.552,0.48,0.449,0.375,0.526,0.454,0.510,0.427,0.533,0.465
9,"text1024, c/c",0.574,0.502,0.490,0.394,0.553,0.475,0.544,0.443,0.557,0.488


## Auto-ML supervised link prediction models
Hayden-todo: update header description here if necessary

In [None]:
# Hayden-todo: fill this out
def get_automl_lp_top1_emb(entity, prop):
    """
    Get the embedding of the predicted object for the given subject-property pair.
        entity: qnode (string)
        prop: pnode (string)
        return: the embedding of the top 1 prediction
    """
    # Fail loudly until this is implemented; silently returning None would only
    # surface later as an unrelated error inside the cosine-distance computation.
    raise NotImplementedError("get_automl_lp_top1_emb has not been filled out yet")
    
# Hayden-todo: fill this out
def get_automl_emb(obj):
    """
    Get the embedding for the target object of a fact.
        obj: qnode (string)
        return: the embedding for the given entity obj
    """
    # Fail loudly until this is implemented; silently returning None would only
    # surface later as an unrelated error inside the cosine-distance computation.
    raise NotImplementedError("get_automl_emb has not been filled out yet")

Hayden-todo: after filling out the above functions, run the below cell to compute scores for your method

In [None]:
# Score every answer of every question with the AutoML link-prediction model:
# surprise = cosine distance between the predicted object embedding and the
# answer's embedding, averaged over the answer's synonym qnodes.
for q in questions:
    src_ent = q["entity"]
    edge = q["property"]
    dest_ents = [ans["qnodes"] for ans in q["answers"]]

    # get the embedding predicted for this question's (entity, property) pair
    automl_pred_emb = get_automl_lp_top1_emb(src_ent, edge)

    # get scores for each answer in the question
    scores = []
    for syn_ents in dest_ents: # each answer can have more than one synonym qnode
        # get embeddings for each target object synonym
        automl_target_embs = [get_automl_emb(obj) for obj in syn_ents]
        syn_scores = [distance.cosine(automl_pred_emb, target) for target in automl_target_embs]
        scores.append(np.mean(syn_scores))

    # save method scores for each answer
    for i, answer in enumerate(q["answers"]):
        answer["method_surprise_scores"]["automl-LP"] = scores[i]

## view avg correlation in question
Hayden-todo: run below cell to view results

In [None]:
# Presumably recomputes the per-question correlation measures so that the scores
# added above are included (helper is defined earlier in the notebook - confirm there),
# then shows the average Rho/Tau table for each question subset.
update_corr_measures(questions)
display_avg_question_corr(question_subsets, questions[0]["method_spearman"])

### Other evaluation methods besides avg correlation within each question...
Pausing on this for now. Just tried some statistical methods.
#### Correlation of facts across all questions (NO normalization)

In [742]:
max_count = 26 # 26 test takers
emb_name = 'text1024'
surprise_metric = 'distance(avg pairwise)/dispersion(avg pairwise)'

# ground-truth surprise (max_count - number of people who chose the answer) and
# predicted surprise, pooled over all facts and split by question type
single_ans_gts, single_ans_preds = [], []
multi_ans_gts, multi_ans_preds = [], []
for q in questions:
    answers = list(q["pred_counts"].keys())
    q_gts = [max_count - count for count in q["pred_counts"].values()]
    q_preds = [q["ans_to_surprise_metrics_dict"][ans][emb_name][surprise_metric] for ans in answers]
    if q["single_answer"]:
        single_ans_gts.extend(q_gts)
        single_ans_preds.extend(q_preds)
    else:
        multi_ans_gts.extend(q_gts)
        multi_ans_preds.extend(q_preds)

print("correlation across all facts (no normalization)")
report_groups = [
    ("\tmulti-answer questions only", multi_ans_gts, multi_ans_preds),
    ("\tsingle-answer questions only", single_ans_gts, single_ans_preds),
    ("\tall questions", multi_ans_gts + single_ans_gts, multi_ans_preds + single_ans_preds),
]
for group_header, group_gts, group_preds in report_groups:
    print(group_header)
    rho, rho_pval = spearmanr(group_gts, group_preds)
    tau, tau_pval = kendalltau(group_gts, group_preds)
    print(f"\t\tSpearman: {rho:.3f}, pval={rho_pval:.3f}")
    print(f"\t\tKT: {tau:.3f}, pval={tau_pval:.3f}")

correlation across all facts (no normalization)
	multi-answer questions only
		Spearman: 0.274, pval=0.011
		KT: 0.182, pval=0.016
	single-answer questions only
		Spearman: 0.294, pval=0.097
		KT: 0.213, pval=0.092
	all questions
		Spearman: 0.142, pval=0.125
		KT: 0.086, pval=0.175


**Hypothesis:** Using the number of people who chose an answer to infer surprise/unexpectedness will not be directly comparable between single- and multi-answer questions. In multi-answer questions, every answer can be chosen by up to the total number of participants, which makes it similar to asking about each answer as a separate true/false question. Meanwhile, in single-answer questions, picking one answer means the participant cannot choose any other answer. This could mean that even if no answer is very surprising, some answers may be chosen by very few people simply because there was a similar answer that seemed like a safer bet.

**observation:** The above result supports this hypothesis.

**What to do about this:** Either evaluate single and multi answers separately or come up with another surprise-score-inference method that is more comparable across the two kinds of questions.

Let's check if the effect we are seeing above is due to the decrease in number of facts being compared...

In [743]:
# Number of pooled facts in each group, to check whether sample size explains the difference
print(f"len(single_ans_gts): {len(single_ans_gts)}")
print(f"len(multi_ans_gts): {len(multi_ans_gts)}")

len(single_ans_gts): 33
len(multi_ans_gts): 85


In [744]:
# Truncate the multi-answer facts to the size of the single-answer set (33 in the run above)
# instead of hard-coding that size, so the check stays valid if the data changes.
spearmanr(multi_ans_gts[:len(single_ans_gts)] + single_ans_gts,
          multi_ans_preds[:len(single_ans_gts)] + single_ans_preds)

SpearmanrResult(correlation=-0.03149116526040072, pvalue=0.8018067539647361)

In [745]:
# Multi-answer facts only, truncated to the size of the single-answer set (33 in the run above)
spearmanr(multi_ans_gts[:len(single_ans_gts)], multi_ans_preds[:len(single_ans_gts)])

SpearmanrResult(correlation=0.17886051056359697, pvalue=0.3192897690462346)

In [746]:
# Single-answer facts only, for comparison with the size-matched multi-answer subset
spearmanr(single_ans_gts,single_ans_preds)

SpearmanrResult(correlation=0.2939105369545495, pvalue=0.09687748220540489)

Safe to say it is not due to decrease in size.

#### Correlation of facts across all questions (with normalization)

Start by gathering samples for each class we are dealing with

In [699]:
# ent_class_to_sample = {}
# for q in tqdm(questions):
#     ent_class = q["class"]
#     if ent_class not in ent_class_to_sample:
#         ent_class_to_sample[ent_class] = get_ents_of_type("", ent_class, limit=10000)

  0%|          | 0/24 [00:00<?, ?it/s]

Now we can compute dispersion of each class once

In [None]:
# class_dispersion_dict = {}
# for ent_class, sample_ents in tqdm(ent_class_to_sample.items()):
#     class_dispersion_dict[ent_class] = compute_avg_pairwise_dist_in_sample(sample_ents, embedding_dict, pariwise_sample=10000)

# compute_avg_dist_from_ent_to_sample(ent, sample_ents, embedding_dict)

Ignore the above: the functions were changed to use an LRU cache.

In [730]:
# Store, per question, the surprise metrics of the question's entity with respect to its class.
# These values are used below as the divisor when normalizing each answer's surprise score.
# NOTE(review): exact semantics of compute_surprise_metrics_sampling_by_type are defined elsewhere - confirm.
for q in tqdm(questions):
    q["class_surprise_metrics_dict"] = compute_surprise_metrics_sampling_by_type(q["entity"], q["class"], embedding_models)

  0%|          | 0/24 [00:00<?, ?it/s]

In [748]:
max_count = 26 # 26 test takers
emb_name = 'text1024'
surprise_metric = 'distance(avg pairwise)/dispersion(avg pairwise)'

# ground-truth surprise (max_count - number of people who chose the answer) and
# normalized predicted surprise, pooled over all facts and split by question type
single_ans_gts = []
single_ans_preds = []
multi_ans_gts = []
multi_ans_preds = []
for q in questions:
    answers = list(q["pred_counts"].keys())
    # NOTE: the original cell also extended `ans_surprise_gts` here, a name that is never
    # defined or used in this cell (it only existed as leftover kernel state), so the cell
    # raised NameError on a fresh kernel. That statement has been removed.
    q_ans_surprise_gts = [max_count - count for count in q["pred_counts"].values()]

    # normalize each answer's surprise by the surprise of the entity within its class
    surprise_of_ent_in_class = q["class_surprise_metrics_dict"][emb_name][surprise_metric]
    q_ans_surprise_preds = []
    for ans in answers:
        unnormalized_surprise = q["ans_to_surprise_metrics_dict"][ans][emb_name][surprise_metric]
        q_ans_surprise_preds.append(unnormalized_surprise / surprise_of_ent_in_class)

    if q["single_answer"]:
        single_ans_gts.extend(q_ans_surprise_gts)
        single_ans_preds.extend(q_ans_surprise_preds)
    else:
        multi_ans_gts.extend(q_ans_surprise_gts)
        multi_ans_preds.extend(q_ans_surprise_preds)

print("correlation across all facts (WITH normalization)")
print("\tmulti-answer questions only")
rho, rho_pval = spearmanr(multi_ans_gts, multi_ans_preds)
tau, tau_pval = kendalltau(multi_ans_gts, multi_ans_preds)
print(f"\t\tSpearman: {rho:.3f}, pval={rho_pval:.3f}")
print(f"\t\tKT: {tau:.3f}, pval={tau_pval:.3f}")
print("\tsingle-answer questions only")
rho, rho_pval = spearmanr(single_ans_gts, single_ans_preds)
tau, tau_pval = kendalltau(single_ans_gts, single_ans_preds)
print(f"\t\tSpearman: {rho:.3f}, pval={rho_pval:.3f}")
print(f"\t\tKT: {tau:.3f}, pval={tau_pval:.3f}")
print("\tall questions")
rho, rho_pval = spearmanr(multi_ans_gts + single_ans_gts, multi_ans_preds + single_ans_preds)
tau, tau_pval = kendalltau(multi_ans_gts + single_ans_gts, multi_ans_preds + single_ans_preds)
print(f"\t\tSpearman: {rho:.3f}, pval={rho_pval:.3f}")
print(f"\t\tKT: {tau:.3f}, pval={tau_pval:.3f}")

correlation across all facts (WITH normalization)
	multi-answer questions only
		Spearman: 0.354, pval=0.001
		KT: 0.246, pval=0.001
	single-answer questions only
		Spearman: 0.191, pval=0.287
		KT: 0.150, pval=0.235
	all questions
		Spearman: 0.186, pval=0.044
		KT: 0.122, pval=0.055


**Observation:** As I expected, normalization helps with comparing facts about different entities when looking at multi-answer questions. However, it appears to hurt on the single answer questions. Why could this be?? We do have a much larger p-value for the single answer correlation here, so maybe it is by chance.

**Note** I do not think we can say that normalization helps in general here since the increase in correlation on "all-questions" could be due to the higher number of multi-answer question facts.

Look at what normalized values look like

In [736]:
# One small table per question (lowest within-question Spearman first), with the
# answers ordered by ground-truth surprise
for q in sorted(questions, key=lambda question: question["spearman"][emb_name][surprise_metric]):
    answers = list(q["pred_counts"].keys())
    ans_surprise_gts = [max_count - count for count in q["pred_counts"].values()]
    ans_surprise_preds = [q["ans_to_surprise_metrics_dict"][ans][emb_name][surprise_metric] for ans in answers]
    surprise_of_ent_in_class = q["class_surprise_metrics_dict"][emb_name][surprise_metric]
    ans_surprise_preds_norm = [pred / surprise_of_ent_in_class for pred in ans_surprise_preds]

    gt_order = np.argsort(ans_surprise_gts)
    header = [""] + [answers[i] for i in gt_order]
    gt_row = ["gt surprise"] + [str(ans_surprise_gts[i]) for i in gt_order]
    pred_row = ["pred surprise"] + [f"{ans_surprise_preds[i]:.2f}" for i in gt_order]
    norm_row = ["normalized pred"] + [f"{ans_surprise_preds_norm[i]:.2f}" for i in gt_order]

    display(pd.DataFrame([gt_row, pred_row, norm_row], columns=header))
    print(f"spearman r: {q['spearman'][emb_name][surprise_metric]}")

Unnamed: 0,Unnamed: 1,Singer-songwriter,Dancer,Entrepreneur,Model,Voice Actor
0,gt surprise,1.0,7.0,9.0,11.0,14.0
1,pred surprise,1.48,1.32,1.6,1.12,1.47
2,normalized pred,1.19,1.06,1.28,0.9,1.18


spearman r: -0.3


Unnamed: 0,Unnamed: 1,Painter,Engineer,Chemist,Zoologist,Diplomat
0,gt surprise,0.0,5.0,14.0,19.0,20.0
1,pred surprise,1.66,1.17,1.32,1.36,1.21
2,normalized pred,1.66,1.17,1.32,1.36,1.22


spearman r: -0.3


Unnamed: 0,Unnamed: 1,> 240,> 210 and <= 240,> 195 and <= 210,> 1 and <= 181,> 181 and <= 195
0,gt surprise,15.0,15.0,22.0,26.0,26.0
1,pred surprise,2.82,1.39,1.38,1.65,1.45
2,normalized pred,2.01,0.99,0.99,1.18,1.03


spearman r: 0.0


Unnamed: 0,Unnamed: 1,Amateur Wrestling,Boxing,Rugby,Baseball,American Football
0,gt surprise,8.0,17.0,22.0,23.0,24.0
1,pred surprise,4.3,2.17,2.57,2.73,3.74
2,normalized pred,3.85,1.94,2.3,2.44,3.35


spearman r: 0.0


Unnamed: 0,Unnamed: 1,Republican Party,Democratic Party,Communist Party of the Soviet Union,National Socialist German Workers' Party,Social Democratic Party of Germany
0,gt surprise,0.0,10.0,26.0,26.0,26.0
1,pred surprise,1.6,1.7,1.76,1.52,1.83
2,normalized pred,1.14,1.21,1.25,1.08,1.31


spearman r: 0.3354101966249684


Unnamed: 0,Unnamed: 1,Basketball Player,Television Actor,Writer,Screenwriter,Researcher
0,gt surprise,0.0,13.0,22.0,23.0,25.0
1,pred surprise,1.91,1.61,1.66,1.69,3.91
2,normalized pred,1.67,1.41,1.45,1.48,3.42


spearman r: 0.39999999999999997


Unnamed: 0,Unnamed: 1,62.5 to 65.0,55.0 to 60.0,60.0 to 62.5,65.25 to 67.0
0,gt surprise,16.0,19.0,21.0,22.0
1,pred surprise,0.86,1.0,1.01,0.87
2,normalized pred,0.98,1.14,1.15,1.0


spearman r: 0.39999999999999997


Unnamed: 0,Unnamed: 1,Romantic Comedy,Fantasy Film,Film Based on a Novel,Drama,Musical Film
0,gt surprise,8.0,10.0,15.0,23.0,23.0
1,pred surprise,1.02,1.09,1.08,1.11,1.06
2,normalized pred,0.93,1.0,0.98,1.01,0.97


spearman r: 0.46169025843831935


Unnamed: 0,Unnamed: 1,8.4 to 27.9,2.6 to 5.7,1.5 to 2.5,0.3 to 1.4,0.0 to 0.2
0,gt surprise,15.0,15.0,24.0,25.0,25.0
1,pred surprise,1.37,1.09,1.13,1.16,1.62
2,normalized pred,1.1,0.88,0.91,0.94,1.31


spearman r: 0.47434164902525683


Unnamed: 0,Unnamed: 1,Voice Actor,Musician,Politician,Writer,Chess Player
0,gt surprise,4.0,16.0,22.0,23.0,25.0
1,pred surprise,1.19,1.81,1.64,1.54,5.48
2,normalized pred,1.07,1.62,1.47,1.38,4.9


spearman r: 0.6


Unnamed: 0,Unnamed: 1,"438,000,000 to 1,590,000,000","1,610,000,000 to 6,745,000,000","113,000,000 to 427,800,000","6,764,000,000 to 217,267,000,000","1 to 108,589,000"
0,gt surprise,15.0,20.0,21.0,22.0,26.0
1,pred surprise,0.9,0.87,0.85,0.95,0.98
2,normalized pred,0.75,0.72,0.7,0.79,0.81


spearman r: 0.6


Unnamed: 0,Unnamed: 1,Singer,Politician,Film Director,Architect,Sport Cyclist
0,gt surprise,0.0,13.0,18.0,26.0,26.0
1,pred surprise,1.83,1.88,1.82,2.0,3.38
2,normalized pred,1.41,1.45,1.4,1.54,2.6


spearman r: 0.6668859288553503


Unnamed: 0,Unnamed: 1,76.6 to 85.4,73.9 to 76.6,70.2 to 73.8,63.3 to 69.9,51.8 to 63.2
0,gt surprise,18.0,18.0,19.0,23.0,26.0
1,pred surprise,1.11,1.12,1.11,1.28,1.44
2,normalized pred,1.12,1.12,1.11,1.28,1.44


spearman r: 0.6668859288553503


Unnamed: 0,Unnamed: 1,Television Presenter,Writer,Television Actor,Film Actor,Film Producer
0,gt surprise,4.0,9.0,14.0,16.0,19.0
1,pred surprise,1.17,1.64,1.23,1.37,1.78
2,normalized pred,0.96,1.35,1.0,1.13,1.46


spearman r: 0.7


Unnamed: 0,Unnamed: 1,History Painting,Cityscape,Landscape Art,Portrait,Self-Portrait
0,gt surprise,6.0,19.0,21.0,23.0,26.0
1,pred surprise,0.82,0.78,0.8,0.86,1.39
2,normalized pred,1.08,1.04,1.06,1.14,1.84


spearman r: 0.7


Unnamed: 0,Unnamed: 1,Russian,English,German,Swedish,Spanish
0,gt surprise,1.0,13.0,19.0,20.0,26.0
1,pred surprise,1.14,1.12,1.16,1.43,1.32
2,normalized pred,0.98,0.96,1.0,1.23,1.14


spearman r: 0.7999999999999999


Unnamed: 0,Unnamed: 1,Politician,Military Officer,Painter,Rugby Union Player,Singer
0,gt surprise,1.0,12.0,16.0,24.0,26.0
1,pred surprise,1.61,1.45,2.25,2.66,2.34
2,normalized pred,1.26,1.14,1.76,2.09,1.84


spearman r: 0.7999999999999999


Unnamed: 0,Unnamed: 1,Landscape,Sky,Mountain,Virgin Mary,Bridge
0,gt surprise,10.0,11.0,15.0,18.0,21.0
1,pred surprise,1.15,1.09,1.22,1.29,1.23
2,normalized pred,1.01,0.95,1.06,1.13,1.07


spearman r: 0.7999999999999999


Unnamed: 0,Unnamed: 1,Oil Paint,Canvas,Cardboard,Paper,Tempera
0,gt surprise,0.0,5.0,24.0,24.0,25.0
1,pred surprise,1.6,1.63,1.7,1.85,1.72
2,normalized pred,1.0,1.02,1.06,1.15,1.07


spearman r: 0.8207826816681234


Unnamed: 0,Unnamed: 1,London,New York City,Hamburg,Paris,Rome
0,gt surprise,8.0,20.0,24.0,26.0,26.0
1,pred surprise,1.2,1.35,1.34,1.37,1.54
2,normalized pred,0.95,1.07,1.06,1.09,1.22


spearman r: 0.8720815992723809


Unnamed: 0,Unnamed: 1,Singer-songwriter,Film Producer,Entrepreneur,Author,Painter
0,gt surprise,1.0,12.0,14.0,19.0,26.0
1,pred surprise,1.54,1.59,1.55,1.94,2.14
2,normalized pred,1.23,1.27,1.24,1.55,1.71


spearman r: 0.8999999999999998


Unnamed: 0,Unnamed: 1,English,French,German,Russian,Swedish
0,gt surprise,2.0,12.0,15.0,24.0,25.0
1,pred surprise,1.11,1.29,1.23,1.44,1.47
2,normalized pred,0.96,1.12,1.07,1.25,1.27


spearman r: 0.8999999999999998


Unnamed: 0,Unnamed: 1,Switzerland,South Africa,United States of America,France,South Korea
0,gt surprise,9.0,18.0,20.0,20.0,26.0
1,pred surprise,1.07,1.1,1.16,1.19,1.31
2,normalized pred,1.03,1.06,1.11,1.15,1.26


spearman r: 0.9746794344808963


Unnamed: 0,Unnamed: 1,4,3,2 or fewer,5 or more
0,gt surprise,16.0,19.0,20.0,23.0
1,pred surprise,0.97,0.99,0.99,0.99
2,normalized pred,0.82,0.84,0.84,0.84


spearman r: 1.0


#### Correlation of true facts (according to WD)

#### Correlation of true facts (according to Google)