This notebook loads the Wikidata-Survey-FunFacts-71 dataset and evaluates our methods of surprise ranking on it

## requirements
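* tqdm (`pip install tqdm`)
* gensim (`pip install gensim`)
* kgtk (follow documentation here: https://kgtk.readthedocs.io/en/latest/install/)
* pandas, numpy, scipy, scikit-learn, h5py, torch and torchbiggraph (all imported in the first code cell below)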

In [3]:
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm
import h5py, torch
from torchbiggraph.model import ComplexDiagonalDynamicOperator, TranslationDynamicOperator, \
                                DotComparator, CosComparator, L2Comparator
import json
from utility import kgtk_to_dataframe
from scipy.spatial import distance
from scipy.stats import spearmanr, kendalltau
from sklearn.metrics.pairwise import cosine_similarity
import random
from collections import Counter, defaultdict
from functools import reduce, lru_cache
from gensim.models import KeyedVectors
from math import comb
from itertools import combinations

## Parameters

You will probably need to update the absolute paths to various resources (mostly embedding files)

In [233]:
# path to the benchmark data (tab-separated file, read with pandas in the "Load task data" section)
surprise_data_file = "./benchmark_data/funfacts_mapped_to_wd.tsv"

# output path (exported to the shell as $WORK further down)
work_dir = "./output/trivia_task"
# path where kypher db file will be saved (the sqlite file behind the kgtk --graph-cache option)
store_dir = f"{work_dir}/temp"
# Wikidata claims.wikibase-item file. We only need claims about humans for this dataset, so using a filtered file.
# (exported to the shell as $ITEM and used as the kgtk query input)
item_file = "./input_data/wikidata-20210215-dwd.claims.wikibase-item.q5.tsv.gz"

"""
Paths to embedding folders...

TODO for reproducing results:
We don't store the embeddings on github because they are large.
To include them in evaluation when running this notebook, you need to download them
from google drive (location specified on github), and specify their locations below.
"""
# Folder of every embedding resource used below, keyed by a short name.
# These are plain Python strings handed to open()/h5py/gensim, not shell
# arguments, so spaces in a path must NOT be backslash-escaped.
emb_locations = {
    # path to wikidata-20211027-dwd-v3.transe-embeddings folder
    "transe": "/data02/profiling/wikidata-20211027-dwd-v3.transe-embeddings",
    # path to wikidata-20210215-dwd.profile-transe-embeddings folder
    "profile-transe": "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/profile_graph_embeddings/output",
    # path to wikidata-20210215-dwd-v2.complex-embeddings folder
    "complex": "/data02/profiling/wikidata-20210215-dwd-v2.complex-embeddings",
    # path to the profile-graph ComplEx embeddings folder (profile_graph_embeddings/complex_04292022/output)
    "profile-complex": "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/profile_graph_embeddings/complex_04292022/output",
    # path to "Random walk embeddings" folder
    "random_walk": "/data02/profiling/Random walk embeddings",
    # path to text_emb_subsets folder
    "text_emb_subsets": "/data02/profiling/dwd-v3.class_subsets",
}

### Process params / set up variables

In [37]:
# Ensure paths are absolute (the shell commands launched later read them from
# environment variables, so they must not depend on the working directory)
work_dir = os.path.abspath(work_dir)
store_dir = os.path.abspath(store_dir)
item_file = os.path.abspath(item_file)

# Create directories; exist_ok=True makes the cell safe to re-run and avoids
# the check-then-create race of a separate os.path.exists() test
for directory in (work_dir, store_dir):
    os.makedirs(directory, exist_ok=True)

# adding some environment variables we'll be using frequently
os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(store_dir)
os.environ['WORK'] = work_dir
os.environ["ITEM"] = item_file

# set up embedding file locations
# Link-prediction embedding models, one entry per model name:
#   base_dir    - folder holding the model files (taken from emb_locations)
#   model_v_num - version suffix that appears in the .h5 file names
#   operator    - relation operator type ("translation" or "complex_diagonal")
#   dim         - embedding dimension (100 for every model listed here)
lp_embedding_models_info = {
    model_name: {
        "base_dir": emb_locations[model_name],
        "model_v_num": version,
        "operator": operator_name,
        "dim": 100,
    }
    for model_name, version, operator_name in (
        ("transe", "v600", "translation"),
        ("profile-transe", "v100", "translation"),
        ("complex", "v600", "complex_diagonal"),
        ("profile-complex", "v100", "complex_diagonal"),
    )
}

# Random-walk embedding files (loaded later with gensim KeyedVectors), keyed by
# walk type. The dictionary key must be a quoted string inside the f-string;
# a bare `random_walk` would be looked up as a variable and raise NameError.
kv_embedding_files = {"H" : f"{emb_locations['random_walk']}/h_embeddings_5x8,min_count=21.kv",
                   "A" : f"{emb_locations['random_walk']}/a_embeddings_10x10,min_count=0.kv",
                   "S" : f"{emb_locations['random_walk']}/s_embeddings_5x10,min_count=0.kv",
                  }

# kv_embedding_files = {"H" : "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/H_walks_analysis/h_embeddings_5x8,min_count=21.kv",
#                    "A" : "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/A_walks_analysis/a_embeddings_10x10,min_count=0.kv",
#                    "S" : "/data02/profiling/kgtk/entity_profiling/output/wikidata-20210215-dwd/S_walks_analysis/s_embeddings_5x10,min_count=0.kv",
#                   }

# Text embeddings restricted to the Q5 subset; the dict is named `emb_locations`
# and its key must be quoted inside the f-string.
text_emb_q5_file = f"{emb_locations['text_emb_subsets']}/Q5.tsv.gz"

#### Load embeddings

In [5]:
# Registry of all loaded embeddings: {embedding name: {entity id: vector}}.
# Filled in by the loading cells below.
embedding_models = {}

Embeddings that we have link prediction files for (complex, transe, profile-complex, profile-transe)

In [6]:
# Load the entity embeddings of every link-prediction model into memory as
# embedding_models[model name][entity name] = embedding vector.
for lp_emb_name, model_info_dict in lp_embedding_models_info.items():
    print(lp_emb_name)
    base_dir = model_info_dict["base_dir"]
    model_v_num = model_info_dict["model_v_num"]
    # `with` closes the JSON file right away instead of leaking the handle
    with open(f"{base_dir}/entity_names_all_0.json") as names_file:
        entity_names_list = json.load(names_file)

    # Load the embeddings
    with h5py.File(f"{base_dir}/model/embeddings_all_0.{model_v_num}.h5", "r") as hf:
        embeddings = hf["embeddings"][...]

    # Row i of the embedding matrix belongs to entity_names_list[i]
    embedding_models[lp_emb_name] = {}
    for entity_name, embedding in tqdm(zip(entity_names_list, embeddings),
                                       total=len(entity_names_list)):
        embedding_models[lp_emb_name][entity_name] = embedding

transe


  0%|          | 0/55471746 [00:00<?, ?it/s]

In [218]:
# Same loading loop, but models that are already in embedding_models are
# skipped, so an interrupted load can be resumed without starting over.
for lp_emb_name, model_info_dict in lp_embedding_models_info.items():
    if lp_emb_name in embedding_models:
        continue
    print(lp_emb_name)
    base_dir = model_info_dict["base_dir"]
    model_v_num = model_info_dict["model_v_num"]
    # `with` closes the JSON file right away instead of leaking the handle
    with open(f"{base_dir}/entity_names_all_0.json") as names_file:
        entity_names_list = json.load(names_file)

    # Load the embeddings
    with h5py.File(f"{base_dir}/model/embeddings_all_0.{model_v_num}.h5", "r") as hf:
        embeddings = hf["embeddings"][...]

    # Row i of the embedding matrix belongs to entity_names_list[i]
    embedding_models[lp_emb_name] = {}
    for entity_name, embedding in tqdm(zip(entity_names_list, embeddings),
                                       total=len(entity_names_list)):
        embedding_models[lp_emb_name][entity_name] = embedding

profile-transe


  0%|          | 0/26894849 [00:00<?, ?it/s]

complex


  0%|          | 0/53002670 [00:00<?, ?it/s]

profile-complex


  0%|          | 0/26894849 [00:00<?, ?it/s]

Text embeddings (human subset)

In [221]:
# Parse the text-embedding table: column "node1" holds the entity id and
# column "node2" the embedding as a comma-separated string of floats.
text_emb_dict = {}
print(f"loading from file {text_emb_q5_file}")
embedding_df = pd.read_csv(text_emb_q5_file, sep='\t')
# Zipping the two columns avoids building a Series per row (iterrows), and
# total= gives tqdm a known length so it can render a real progress bar.
for ent, embed_str in tqdm(zip(embedding_df["node1"], embedding_df["node2"]),
                           total=len(embedding_df)):
    text_emb_dict[ent] = np.float32(embed_str.split(','))

loading from file /data02/profiling/dwd-v3.class_subsets/Q5.tsv.gz


0it [00:00, ?it/s]

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [222]:
# Register the text embeddings next to the other embedding models
embedding_models["text1024"] = text_emb_dict

random walk embeddings

In [234]:
%%time
for name, file_path in tqdm(kv_embedding_files.items()):
    print("now loading {} embeddings".format(name))
    emb_dict = {}
    kv_model = KeyedVectors.load(file_path)
    for key, index in tqdm(kv_model.key_to_index.items()):
        emb_dict[key] = kv_model.vectors[index]
    embedding_models[name] = emb_dict

  0%|          | 0/3 [00:00<?, ?it/s]

now loading H embeddings


  0%|          | 0/19593942 [00:00<?, ?it/s]

now loading A embeddings


  0%|          | 0/12106870 [00:00<?, ?it/s]

now loading S embeddings


  0%|          | 0/39030788 [00:00<?, ?it/s]

CPU times: user 3min 41s, sys: 2min 1s, total: 5min 43s
Wall time: 6min 42s


## Load task data

In [235]:
# Read the benchmark TSV; missing cells become empty strings so the masks
# below can compare against "" uniformly.
trivia_df = pd.read_csv(surprise_data_file, sep="\t")
trivia_df = trivia_df.fillna("")

Not all rows can be evaluated, as they can't all be mapped to WD.

In [236]:
# A row is evaluable when its fact.wd value is filled in and contains no "_"
eval_mask = [not (fact == "" or "_" in fact) for fact in trivia_df["fact.wd"]]

In [237]:
# Keep only the evaluable rows (boolean-list row selection)
eval_trivia_df = trivia_df[eval_mask]

In [238]:
# Rows whose fact.wd column has any content at all
started_mask = (trivia_df["fact.wd"] != "").tolist()

In [239]:
# Rows flagged with 1 in the "can't map to wd" column
cant_map_mask = (trivia_df["can't map to wd"] == 1).tolist()

In [240]:
# Number of rows flagged as unmappable
num_cant_map = cant_map_mask.count(True)

In [241]:
# Summary of how far the mapping of benchmark rows to Wikidata has progressed
num_started = sum(started or cant_map for started, cant_map in zip(started_mask, cant_map_mask))
num_mapped = len(eval_trivia_df)
num_finished = num_mapped + num_cant_map
print(f"# rows in original data: {len(trivia_df)}")
print(f"# rows started processing: {num_started}")
print(f"# rows finished processing: {num_finished}")
print(f"# rows mapped to WD: {num_mapped}")

# rows in original data: 362
# rows started processing: 183
# rows finished processing: 113
# rows mapped to WD: 71


In [314]:
# Number of distinct entities among the evaluable rows
eval_trivia_df["Article.wd"].nunique(dropna=False)

47

In [242]:
# How many evaluable rows each entity has
ent_counts = Counter(eval_trivia_df["Article.wd"].tolist())

In [243]:
# Invert ent_counts: row count -> list of entities with that many rows
ents_by_count = defaultdict(list)
for ent in ent_counts:
    ents_by_count[ent_counts[ent]].append(ent)
print("Dist of number of rows we have for each entity:")
for count in sorted(ents_by_count):
    print(f"\t{count}: {len(ents_by_count[count])}")

Dist of number of rows we have for each entity:
	1: 28
	2: 15
	3: 3
	4: 1


## Compute surprise scores

In [None]:
# Create the {method name: scores} registry only if it does not exist yet, so
# re-running this cell keeps scores computed earlier in the session.
try:
    method_to_scores
except NameError:
    method_to_scores = {}

### Statistical method

In [280]:
@lru_cache(maxsize=None)
def get_ents_with_fact(prop, val):
    """Return the set of entities that have an edge labelled `prop` pointing to `val`.

    The lookup is a `kgtk query` shell command (IPython `!` syntax) over the
    $ITEM file with the $STORE graph cache; `prop` and `val` are handed to the
    shell through the PROP and VAL environment variables. The result is the
    "node1" column of the query output, converted to a set.

    Results are memoized by lru_cache, so repeated calls with the same
    arguments return the SAME set object; callers should not mutate it.
    """
#     print(f"'{prop}'")
#     print(f"'{val}'")
    os.environ["PROP"] = prop
    os.environ["VAL"] = val
    res = !kgtk query -i $ITEM --graph-cache $STORE \
        --match 'item: (ent)-[id {label:prop}]->(val)' \
        --return 'distinct ent' \
        --where 'prop = "'$PROP'" AND val = "'$VAL'"'
#     if "node1" not in kgtk_to_dataframe(res).columns:
#         print(res)
    return set(kgtk_to_dataframe(res).loc[:,"node1"])
    
def handle_cent_facts(facts_to_intersect):
    """Return the given facts without those whose text contains "cent".

    "cent" facts are not supported yet, so for now they are simply dropped.
    The order of the remaining facts is preserved.
    """
    kept_facts = []
    for fact in facts_to_intersect:
        if "cent" in fact:
            continue
        kept_facts.append(fact)
    return kept_facts

@lru_cache(maxsize=None)
def get_population_for_compound_fact(fact_str):
    """Return the set of entities that satisfy a compound fact string.

    `fact_str` is an intersection of unions: comma-separated groups are
    intersected, and within a group the pipe-separated `prop=val` facts are
    unioned. Spaces inside a fact are ignored. Groups containing "cent" are
    dropped by handle_cent_facts.

    Returns an empty set when no entity matches or when no usable fact is left
    after filtering (previously None, which made callers fail on len()).
    The result is memoized by lru_cache, so callers get the same set object on
    every call and must not mutate it.
    """
    # fact str is intersection of unions
    facts_to_intersect = fact_str.split(',')  # intersect on comma
    # replace "centx" notation with props/vals we want to use
    facts_to_intersect = handle_cent_facts(facts_to_intersect)

    population = None
    for facts_to_union in facts_to_intersect:
        # gather union of entities with these facts
        fact_union_pop = set()
        for fact in facts_to_union.split('|'):  # union on pipe
            prop, val = fact.replace(" ", "").split("=")  # ensure no spaces
            fact_union_pop |= get_ents_with_fact(prop, val)
        # update overall population by intersecting with these ents
        population = fact_union_pop if population is None else population & fact_union_pop
        # an empty intersection can never grow again, so stop early
        if not population:
            return population
    # no usable fact at all: report an empty population instead of None
    return set() if population is None else population
        
def sample_by_compound_fact(fact_str, limit=10000):
    """Return at most `limit` entities that satisfy the compound fact `fact_str`.

    When the population has more than `limit` members, `limit` of them are
    drawn uniformly without replacement with np.random.choice.

    The returned set is always a fresh object. The population itself comes
    from a memoized function, so handing it out directly would let a caller's
    in-place edit (e.g. removing the entity of interest) corrupt the cached
    population and every later size or sample computed from it.
    """
    # `or set()` also covers a population of None (no usable facts)
    population = get_population_for_compound_fact(fact_str) or set()
    if len(population) <= limit:
        return set(population)
    # sorted() fixes the candidate order so a seeded RNG gives the same sample
    # on every run (set iteration order of strings varies between processes)
    candidates = sorted(population)
    return set(np.random.choice(candidates, limit, replace=False).tolist())
    
def get_pop_size_for_compound_fact(fact_str):
    """Return how many entities satisfy the compound fact `fact_str`."""
    population = get_population_for_compound_fact(fact_str)
    return len(population)
    
def _cosine_rows(a, b):
    """Cosine distance between corresponding rows of two equally-shaped 2-D arrays."""
    dots = np.einsum("ij,ij->i", a, b)
    norms = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
    return 1.0 - dots / norms


def compute_surprise_metrics_for_sample(ent, sample_ents, embedding_dict, pairwise_sample=10000):
    """Compare an entity's embedding with the embeddings of a sample of entities.

    All distances are cosine distances.

    Args:
        ent: entity of interest; must be a key of `embedding_dict`
            (KeyError otherwise).
        sample_ents: iterable of entity ids; ids without an embedding are ignored.
        embedding_dict: mapping from entity id to embedding vector.
        pairwise_sample: maximum number of sample pairs used for the pairwise
            dispersion. If the sample has more pairs than this, that many
            random pairs (of two distinct sample members) are drawn instead of
            enumerating all pairs.

    Returns:
        dict with the keys
            "dispersion (centroid)"   - mean distance of sample members to the sample centroid
            "distance (centroid)"     - distance of `ent` to the sample centroid
            "distance (avg pairwise)" - mean distance of `ent` to the sample members
            "dispersion (avg pairwise)" - mean distance between pairs of sample members
        and the three "distance(...)/dispersion(...)" ratios of those values.
        When fewer than two sample entities have an embedding, dispersion is
        undefined: the plain distance/dispersion entries are nan and the three
        ratios are inf (the value the scoring cell uses for "too few entities").
    """
    # get embeddings
    ent_embed = np.asarray(embedding_dict[ent], dtype=np.float64)
    sample_embeds = np.array(
        [embedding_dict[s] for s in sample_ents if s in embedding_dict],
        dtype=np.float64,
    )
    n_samples = len(sample_embeds)

    if n_samples < 2:
        return {
            "dispersion (centroid)": np.nan,
            "distance (centroid)": np.nan,
            "distance(centroid)/dispersion(centroid)": np.inf,
            "distance (avg pairwise)": np.nan,
            "distance(avg pairwise)/dispersion(centroid)": np.inf,
            "dispersion (avg pairwise)": np.nan,
            "distance(avg pairwise)/dispersion(avg pairwise)": np.inf,
        }

    ret = {}
    ent_row = ent_embed[np.newaxis, :]

    # centroid-based measures
    centroid_row = sample_embeds.mean(axis=0, keepdims=True)
    avg_sample_to_centroid = distance.cdist(centroid_row, sample_embeds, metric="cosine").mean()
    ret["dispersion (centroid)"] = avg_sample_to_centroid
    ent_to_centroid = distance.cdist(centroid_row, ent_row, metric="cosine")[0, 0]
    ret["distance (centroid)"] = ent_to_centroid
    ret["distance(centroid)/dispersion(centroid)"] = ent_to_centroid / avg_sample_to_centroid

    # avg sample dist to entity of interest
    avg_ent_to_sample = distance.cdist(ent_row, sample_embeds, metric="cosine").mean()
    ret["distance (avg pairwise)"] = avg_ent_to_sample
    ret["distance(avg pairwise)/dispersion(centroid)"] = avg_ent_to_sample / avg_sample_to_centroid

    # avg pairwise dist within sample
    if comb(n_samples, 2) > pairwise_sample:
        # too many pairs: draw `pairwise_sample` random pairs in one shot.
        # `second` is drawn from n-1 slots and shifted past `first`, which
        # yields a uniformly random index different from `first`.
        first = np.random.randint(n_samples, size=pairwise_sample)
        second = np.random.randint(n_samples - 1, size=pairwise_sample)
        second = second + (second >= first)
        sample_dists = _cosine_rows(sample_embeds[first], sample_embeds[second])
    else:
        # few enough pairs: compute all of them
        sample_dists = distance.pdist(sample_embeds, metric="cosine")

    avg_sample_to_sample = sample_dists.mean()

    ret["dispersion (avg pairwise)"] = avg_sample_to_sample
    ret["distance(avg pairwise)/dispersion(avg pairwise)"] = avg_ent_to_sample / avg_sample_to_sample

    return ret

In [281]:
# Ratio metrics reported for the statistical method, with the short label
# used in the method name ("<embedding>, <label>")
surprise_metric_abbrevs = {'distance(avg pairwise)/dispersion(avg pairwise)': "ap/ap",
                           'distance(centroid)/dispersion(centroid)': "c/c",
                           'distance(avg pairwise)/dispersion(centroid)': "ap/c"
                          }

stat_method_scores = defaultdict(list)
for _, row in tqdm(eval_trivia_df.iterrows(), total=len(eval_trivia_df)):
    entity = row["Article.wd"]
    fact_str = row["fact.wd"]
    # don't include current entity in sample. A NEW set is built here rather
    # than removing in place: the sample may be the memoized population, and
    # editing it would shrink that population for every later lookup
    # (including the frequency baseline).
    sample = {ent for ent in sample_by_compound_fact(fact_str) if ent != entity}
    # We can only compute surprise metrics if there are 2 or more samples (otherwise dispersion is undefined)
    has_enough_samples = len(sample) > 1
    for emb_name, embedding_dict in embedding_models.items():
        # skip recomputing already computed methods
        if f"{emb_name}, c/c" in method_to_scores:
            continue
        if has_enough_samples:
            metrics_dict = compute_surprise_metrics_for_sample(entity, sample, embedding_dict)
        for metric_name, abbrev in surprise_metric_abbrevs.items():
            # handle case where too few entities have this set of facts in WD for us to compute metrics
            score = metrics_dict[metric_name] if has_enough_samples else np.inf
            method_name = f"{emb_name}, {abbrev}"
            stat_method_scores[method_name].append(score)

method_to_scores.update(stat_method_scores)

  0%|          | 0/71 [00:00<?, ?it/s]

### Baselines

Random

In [138]:
# Two random-score baselines. A seeded generator local to this cell keeps the
# baselines (and every table built from them) identical across re-runs without
# touching numpy's global random state.
random_baseline_rng = np.random.default_rng(0)
method_to_scores["random-1"] = random_baseline_rng.random(len(eval_trivia_df))
method_to_scores["random-2"] = random_baseline_rng.random(len(eval_trivia_df))

In [320]:
# Average correlation of 1000 random score vectors with each label column,
# as a reference point for the other methods.
trivia = np.array(eval_trivia_df["goodTrivia"])
surprise = np.array(eval_trivia_df["surprising"])
known = np.array(eval_trivia_df["knew"])

trivia_rho, surprise_rho, known_rho = [], [], []
trivia_tau, surprise_tau, known_tau = [], [], []

# (labels, Spearman rho accumulator, Kendall tau accumulator) per label column
label_accumulators = (
    (trivia, trivia_rho, trivia_tau),
    (surprise, surprise_rho, surprise_tau),
    (known, known_rho, known_tau),
)

for i in range(1000):
    scores = np.random.rand(len(eval_trivia_df))
    for labels, rho_values, tau_values in label_accumulators:
        rho_values.append(spearmanr(scores, labels)[0])
        tau_values.append(kendalltau(scores, labels)[0])

# one table row: mean rho then mean tau for each label column
random_row = ["random"]
for _, rho_values, tau_values in label_accumulators:
    random_row.append(f"{np.mean(rho_values):.3f}")
    random_row.append(f"{np.mean(tau_values):.3f}")
rows = [random_row]

result_columns = ["Method",
                  "rho-goodTrivia", "tau-goodTrivia",
                  "rho-surprising", "tau-surprising",
                  "rho-knew", "tau-knew"]
display(pd.DataFrame(rows, columns=result_columns))

Unnamed: 0,Method,rho-goodTrivia,tau-goodTrivia,rho-surprising,tau-surprising,rho-knew,tau-knew
0,random,0.0,0.001,-0.001,-0.0,-0.003,-0.002


Frequency

In [155]:
# Frequency baseline: score = 1 / (number of entities sharing the fact), so
# rarer facts score higher; facts nobody has get an infinite score.
# The scores are collected in a fresh list and assigned once: indexing
# method_to_scores["frequency"] directly fails with KeyError on a plain dict,
# and appending in place would duplicate the scores whenever the cell is re-run.
frequency_scores = []
for _, row in tqdm(eval_trivia_df.iterrows(), total=len(eval_trivia_df)):
    fact_str = row["fact.wd"]
    pop_size = get_pop_size_for_compound_fact(fact_str)
    frequency_scores.append(np.inf if pop_size == 0 else (1 / pop_size))
method_to_scores["frequency"] = frequency_scores

  0%|          | 0/71 [00:00<?, ?it/s]

### LP method

In [161]:
def load_lp_embedding_model(base_dir, model_v_num, operator, dim):
    """Load the relation operators and name indexes of a link-prediction model.

    Args:
        base_dir: model folder; must contain dynamic_rel_names.json,
            entity_names_all_0.json and model/model.<model_v_num>.h5.
        model_v_num: version suffix of the model .h5 file (e.g. "v600").
        operator: "complex_diagonal" or "translation".
        dim: embedding dimension passed to the operator constructor.

    Returns:
        [operator_lhs, operator_rhs, entity_to_index, rel_index] where the two
        operators have their parameters loaded from the model file,
        entity_to_index maps entity name -> position in entity_names_all_0.json
        and rel_index maps relation name -> position in dynamic_rel_names.json.

    Raises:
        ValueError: if `operator` is not one of the supported names.
    """
    # operator name -> (operator class, names of its parameter tensors)
    operator_specs = {
        "complex_diagonal": (ComplexDiagonalDynamicOperator, ("real", "imag")),
        "translation": (TranslationDynamicOperator, ("translations",)),
    }
    # explicit check instead of `assert False`, which is stripped under -O
    if operator not in operator_specs:
        raise ValueError(
            f"unsupported operator {operator!r}; expected one of {sorted(operator_specs)}"
        )
    operator_cls, param_names = operator_specs[operator]

    # `with` closes the JSON files instead of leaking the handles
    with open(f"{base_dir}/dynamic_rel_names.json") as rel_file:
        relation_names_list = json.load(rel_file)
    with open(f"{base_dir}/entity_names_all_0.json") as ent_file:
        entity_names_list = json.load(ent_file)

    entity_to_index = {entity: i for i, entity in enumerate(entity_names_list)}
    rel_index = {rel: i for i, rel in enumerate(relation_names_list)}
    prop_count = len(relation_names_list)

    # one operator per side; both sides share the same class and layout
    operators = {
        "lhs": operator_cls(dim, prop_count),
        "rhs": operator_cls(dim, prop_count),
    }

    with h5py.File(f"{base_dir}/model/model.{model_v_num}.h5", "r") as hf:
        for side, side_operator in operators.items():
            # [...] copies the dataset into memory before the file is closed
            state_dict = {
                name: torch.from_numpy(hf[f"model/relations/0/operator/{side}/{name}"][...])
                for name in param_names
            }
            side_operator.load_state_dict(state_dict)

    return [operators["lhs"], operators["rhs"], entity_to_index, rel_index]

In [206]:
def lp_lhs(src_ent, edge, entity_to_index, rel_index,
           base_dir, model_v_num, dim, operator_lhs):
    """Apply the lhs operator of relation `edge` to the embedding of `src_ent`.

    The embedding row of `src_ent` is read from
    <base_dir>/model/embeddings_all_0.<model_v_num>.h5 and the transformed
    vector is returned as a numpy array. `dim` is accepted but not used.
    """
    row_index = entity_to_index[src_ent]
    embeddings_path = f"{base_dir}/model/embeddings_all_0.{model_v_num}.h5"
    # Load the embeddings
    with h5py.File(embeddings_path, "r") as hf:
        src_embedding = torch.from_numpy(hf["embeddings"][row_index, :])

    relation = torch.tensor(rel_index[edge])
    transformed = operator_lhs(src_embedding, relation)
    return transformed.detach().numpy()

def lp_rhs(edge, dest_ent, entity_to_index, rel_index,
           base_dir, model_v_num, dim, operator_rhs):
    """Apply the rhs operator of relation `edge` to the embedding of `dest_ent`.

    The embedding row of `dest_ent` is read from
    <base_dir>/model/embeddings_all_0.<model_v_num>.h5 and the transformed
    vector is returned as a numpy array. `dim` is accepted but not used.
    """
    row_index = entity_to_index[dest_ent]
    embeddings_path = f"{base_dir}/model/embeddings_all_0.{model_v_num}.h5"
    # Load the embeddings
    with h5py.File(embeddings_path, "r") as hf:
        dest_embedding = torch.from_numpy(hf["embeddings"][row_index, :])

    relation = torch.tensor(rel_index[edge])
    transformed = operator_rhs(dest_embedding, relation)
    return transformed.detach().numpy()

In [307]:
# Link-prediction surprise scores: for each model and each benchmark row,
# compare operator-transformed embeddings with the actual embeddings.
lp_method_scores = defaultdict(list)

distance_funcs = {"cos": distance.cosine} #, "dot": lambda v1, v2: -np.dot(v1,v2)} # omitting for conciseness

for emb_name, model_info_dict in tqdm(lp_embedding_models_info.items()):
    # models whose scores are already in method_to_scores are not recomputed
    if f"LP-{emb_name}-max-lhs-cos" in method_to_scores:
        continue

    print(emb_name)
    base_dir = model_info_dict["base_dir"]
    model_v_num = model_info_dict["model_v_num"]
    operator = model_info_dict["operator"]
    dim = model_info_dict["dim"]
    operator_lhs, operator_rhs, entity_to_index, rel_index = \
        load_lp_embedding_model(base_dir, model_v_num, operator, dim)
    embedding_dict = embedding_models[emb_name]
    uses_profile_graph = "profile" in emb_name

    for _, row in eval_trivia_df.iterrows():
        entity = row["Article.wd"]
        fact_str = row["fact.wd"].replace(" ", "") # ensure no spaces
        # drop "centx" groups, then flatten every group into single prop=val facts
        fact_groups = handle_cent_facts(fact_str.split(','))
        facts = [single.split("=") for group in fact_groups for single in group.split('|')]

        # if profile-graph, need to change prop and val accordingly
        if uses_profile_graph:
            facts = [(f"{prop}_profile", f"Q5_{prop}_{val}") for prop, val in facts]

        # report properties/values not in embeddings and keep only usable facts
        usable_facts = []
        for prop, val in facts:
            prop_known = prop in rel_index
            val_known = val in embedding_dict
            if not prop_known:
                print(f"{prop} not in {emb_name} embeddings")
            if not val_known:
                print(f"{val} not in {emb_name} embeddings")
            if prop_known and val_known:
                usable_facts.append((prop, val))
        facts = usable_facts

        if not facts:
            # fill in nans for this so we know to skip or fill it in later
            for dist_name in distance_funcs:
                for variant in ("avg-lhs", "avg-rhs", "max-lhs", "max-rhs", "centroid-rhs"):
                    lp_method_scores[f"LP-{emb_name}-{variant}-{dist_name}"].append(np.nan)
            continue

        # lhs: transformed entity embedding vs. embedding of each fact value
        lp_lhs_embs = [lp_lhs(entity, prop, entity_to_index, rel_index,
                              base_dir, model_v_num, dim, operator_lhs) for prop, _ in facts]
        lhs_embs = [embedding_dict[val] for _, val in facts]

        # rhs: transformed fact-value embedding vs. embedding of the entity
        lp_rhs_embs = [lp_rhs(prop, val, entity_to_index, rel_index,
                              base_dir, model_v_num, dim, operator_rhs) for prop, val in facts]
        rhs_emb = embedding_dict[entity]

        for dist_name, dist_func in distance_funcs.items():
            lhs_dists = [dist_func(pred, target) for pred, target in zip(lp_lhs_embs, lhs_embs)]
            rhs_dists = [dist_func(pred, rhs_emb) for pred in lp_rhs_embs]
            variant_scores = (
                # average dist (lhs and rhs)
                ("avg-lhs", np.mean(lhs_dists)),
                ("avg-rhs", np.mean(rhs_dists)),
                # max dist (lhs and rhs)
                ("max-lhs", np.max(lhs_dists)),
                ("max-rhs", np.max(rhs_dists)),
                # centroid dist (rhs)
                ("centroid-rhs", dist_func(rhs_emb, np.mean(lp_rhs_embs, axis=0))),
            )
            for variant, score in variant_scores:
                lp_method_scores[f"LP-{emb_name}-{variant}-{dist_name}"].append(score)

method_to_scores.update(lp_method_scores)

  0%|          | 0/4 [00:00<?, ?it/s]

profile-transe
Q5_P106_Q10497074 not in profile-transe embeddings
Q5_P166_Q932650 not in profile-transe embeddings
Q5_P39_Q11696 not in profile-transe embeddings
Q5_P106_Q22662561 not in profile-transe embeddings
Q5_P106_Q465501 not in profile-transe embeddings
Q5_P69_Q860450 not in profile-transe embeddings
Q5_P27_P30 not in profile-transe embeddings
Q5_P106_Q12718299 not in profile-transe embeddings
Q5_P166_Q989442 not in profile-transe embeddings
Q5_P39_Q104238320 not in profile-transe embeddings
Q5_P166_Q3724813 not in profile-transe embeddings
Q5_P166_Q1320315 not in profile-transe embeddings
Q5_P69_Q2742632 not in profile-transe embeddings
Q5_P106_Q13381572 not in profile-transe embeddings
Q5_P140_Q9268 not in profile-transe embeddings
Q5_P106_Q16947675 not in profile-transe embeddings
Q5_P106_Q15982795 not in profile-transe embeddings
Q5_P106_Q23845879 not in profile-transe embeddings
Q5_P106_Q519076 not in profile-transe embeddings
Q5_P106_Q15295720 not in profile-transe embedd

## Auto-ML supervised link prediction models

Hayden-todo: update header description here if necessary

In [None]:
# Hayden-todo: fill this out
def get_automl_lp_top1_emb(entity, prop):
    """
    get embedding predictions for the object the given subject-property pair
        entity: qnode (string)
        prop: pnode (string)
        return: the embedding of the top 1 prediction

    Not implemented yet: raises NotImplementedError so that running the scoring
    cell fails with a clear message instead of passing None on as an embedding.
    """
    raise NotImplementedError("get_automl_lp_top1_emb has not been filled out yet")
    
# Hayden-todo: fill this out
def get_automl_emb(obj):
    """
    get embeddings for the target object of each fact.
        obj: qnode (string)
        return: the embedding for the given entity obj

    Not implemented yet: raises NotImplementedError so that running the scoring
    cell fails with a clear message instead of passing None on as an embedding.
    """
    raise NotImplementedError("get_automl_emb has not been filled out yet")

Hayden-todo: after filling out the above functions, run the below cell to compute scores for your method

In [None]:
# Surprise scores for the auto-ML link prediction method: cosine distance
# between the predicted object embedding and the actual object embedding of
# every fact in a row, aggregated by mean and by max.
automl_lp_method_scores = defaultdict(list)


for _, row in tqdm(eval_trivia_df.iterrows(), total=len(eval_trivia_df)):
    entity = row["Article.wd"]
    fact_str = row["fact.wd"].replace(" ", "") # ensure no spaces
    facts = fact_str.split(',')
    # replace "centx" notation with props/vals we want to use
    facts = handle_cent_facts(facts)
    facts = [f for fact in facts for f in fact.split('|')]
    facts = [fact.split("=") for fact in facts]

    if len(facts) == 0:
        # No usable fact in this row (e.g. only "cent" facts). np.max of an
        # empty list raises, so record nan for both variants; this keeps the
        # score lists aligned with the rows of eval_trivia_df.
        automl_lp_method_scores["automl-LP-avg"].append(np.nan)
        automl_lp_method_scores["automl-LP-max"].append(np.nan)
        continue

    # Hayden-todo: if you want to change how the surprise score is calculated,
    # update the lines below function

    # get embedding predictions for the object of each fact
    automl_pred_embs = [get_automl_lp_top1_emb(entity, prop) for prop, _ in facts]
    # get embeddings for the target object of each fact
    automl_target_embs = [get_automl_emb(obj) for _, obj in facts]

    # distance between prediction and target, one value per fact
    fact_dists = [distance.cosine(pred, target)
                  for pred, target in zip(automl_pred_embs, automl_target_embs)]

    # two variations for aggregating distances below (avg and max)

    # average dist (lhs)
    automl_lp_method_scores["automl-LP-avg"].append(np.mean(fact_dists))

    # max dist (lhs)
    automl_lp_method_scores["automl-LP-max"].append(np.max(fact_dists))

# update overall method scores dict with automl method scores.
method_to_scores.update(automl_lp_method_scores)

## Compute correlation results
Computing correlation within each entity (only for entities that have multiple facts)

Hayden-todo: after running the above cells to compute scores for your methods, run the below cells to show results. The results we plan to show in the paper are the ones that measure correlation across all rows (so the cell 3 down from here).

In [298]:
# Entities with more than one evaluable row (correlation within an entity
# needs at least two facts)
entities = []
for count, ents in ents_by_count.items():
    if count > 1:
        entities.extend(ents)

In [311]:
# Per-entity rank correlation: for every entity in `entities`, correlate each
# method's scores with the goodTrivia / surprising / knew columns over that
# entity's rows only, then average the correlations over entities.

# how often a label column or a method's scores were constant within an entity
const_counts = defaultdict(int)

# method name -> list of per-entity correlation values
method_trivia_rho = defaultdict(list)
method_surprise_rho = defaultdict(list)
method_known_rho = defaultdict(list)
method_trivia_tau = defaultdict(list)
method_surprise_tau = defaultdict(list)
method_known_tau = defaultdict(list)
for ent in entities:
    # positional row indices of this entity (used with .iloc and to index the score lists)
    idxs = [i for i, e in enumerate(eval_trivia_df.loc[:,"Article.wd"]) if ent==e]
    trivia = np.array(eval_trivia_df.iloc[idxs].loc[:,"goodTrivia"])
    surprise = np.array(eval_trivia_df.iloc[idxs].loc[:,"surprising"])
    known = np.array(eval_trivia_df.iloc[idxs].loc[:,"knew"])
    
    # a label column that is constant for this entity is skipped below
    trivia_is_const = np.all(trivia[0] == trivia)
    surprise_is_const = np.all(surprise[0] == surprise)
    known_is_const = np.all(known[0] == known)
    
    if trivia_is_const:
        const_counts["trivia"] += 1
    if surprise_is_const:
        const_counts["surprise"] += 1
    if known_is_const:
        const_counts["known"] += 1
    
    for method_name, scores in method_to_scores.items():
        ent_scores = []
        for i in idxs:
            if not np.isnan(scores[i]):
                ent_scores.append(scores[i])
            else:
                # fallback to frequency
                ent_scores.append(method_to_scores["frequency"][i])
        ent_scores = np.array(ent_scores)
        # constant scores for this entity: count it and skip the method here
        ent_scores_is_const = np.all(ent_scores[0] == ent_scores)
        if ent_scores_is_const:
            const_counts[method_name] += 1
            continue
        if not trivia_is_const:
            method_trivia_rho[method_name].append(spearmanr(ent_scores, trivia)[0])
            method_trivia_tau[method_name].append(kendalltau(ent_scores, trivia)[0])
        if not surprise_is_const:
            method_surprise_rho[method_name].append(spearmanr(ent_scores, surprise)[0])
            method_surprise_tau[method_name].append(kendalltau(ent_scores, surprise)[0])
        if not known_is_const:
            method_known_rho[method_name].append(spearmanr(ent_scores, known)[0])
            method_known_tau[method_name].append(kendalltau(ent_scores, known)[0])

print("const counts:")
for name, count in const_counts.items():
    print(f"\t{name}: {count}")
    
# number of entities that contributed a correlation value, per method and label
print("number of samples evaluated:")
rows = []
for method_name in method_trivia_rho:
    trivia_count = len(method_trivia_rho[method_name])
    surprise_count = len(method_surprise_rho[method_name])
    known_count = len(method_known_rho[method_name])
    rows.append([method_name, trivia_count, surprise_count, known_count])

display(pd.DataFrame(rows, columns = ["Method", "goodTrivia", "surprising", "knew"]))

# mean of the per-entity correlations; only methods that have at least one
# goodTrivia correlation (keys of method_trivia_rho) get a row
print("correlation scores:")
rows = []
for method_name in method_trivia_rho:
    trivia_rho = np.mean(method_trivia_rho[method_name])
    surprise_rho = np.mean(method_surprise_rho[method_name])
    known_rho = np.mean(method_known_rho[method_name])
    trivia_tau = np.mean(method_trivia_tau[method_name])
    surprise_tau = np.mean(method_surprise_tau[method_name])
    known_tau = np.mean(method_known_tau[method_name])
    rows.append([method_name,
                 f"{trivia_rho:.3f}", f"{trivia_tau:.3f}",
                 f"{surprise_rho:.3f}", f"{surprise_tau:.3f}",
                 f"{known_rho:.3f}", f"{known_tau:.3f}"])

display(pd.DataFrame(rows, columns = ["Method",
                                      "rho-goodTrivia", "tau-goodTrivia",
                                      "rho-surprising", "tau-surprising",
                                      "rho-knew", "tau-knew"]))

const counts:
	known: 5
	trivia: 2
	LP-profile-transe-max-lhs-cos: 5
	transe, ap/ap: 1
	transe, c/c: 1
	transe, ap/c: 1
	profile-transe, ap/ap: 1
	profile-transe, c/c: 1
	profile-transe, ap/c: 1
	complex, ap/ap: 1
	complex, c/c: 1
	complex, ap/c: 1
	profile-complex, ap/ap: 1
	profile-complex, c/c: 1
	profile-complex, ap/c: 1
	text1024, ap/ap: 1
	text1024, c/c: 1
	text1024, ap/c: 1
	H, ap/ap: 1
	H, c/c: 1
	H, ap/c: 1
	A, ap/ap: 1
	A, c/c: 1
	A, ap/c: 1
	S, ap/ap: 1
	S, c/c: 1
	S, ap/c: 1
	LP-transe-max-lhs-cos: 2
	LP-profile-transe-max-rhs-cos: 2
	LP-profile-complex-max-lhs-cos: 4
	LP-profile-complex-max-rhs-cos: 2
	surprise: 2
	LP-transe-max-rhs-cos: 1
	LP-complex-max-lhs-cos: 1
	LP-complex-max-rhs-cos: 1
number of samples evaluated:


Unnamed: 0,Method,goodTrivia,surprising,knew
0,"transe, ap/ap",16,16,13
1,"transe, c/c",16,16,13
2,"transe, ap/c",16,16,13
3,random-2,17,17,14
4,random-1,17,17,14
5,frequency,17,17,14
6,"profile-transe, ap/ap",16,16,13
7,"profile-transe, c/c",16,16,13
8,"profile-transe, ap/c",16,16,13
9,"complex, ap/ap",16,16,13


correlation scores:


Unnamed: 0,Method,rho-goodTrivia,tau-goodTrivia,rho-surprising,tau-surprising,rho-knew,tau-knew
0,"transe, ap/ap",0.181,0.167,-0.171,-0.183,0.615,0.615
1,"transe, c/c",0.4,0.396,0.071,0.055,0.472,0.476
2,"transe, ap/c",0.4,0.396,0.071,0.055,0.472,0.476
3,random-2,0.353,0.373,0.268,0.28,0.161,0.14
4,random-1,-0.024,-0.02,0.179,0.175,-0.387,-0.386
5,frequency,0.435,0.431,0.478,0.464,-0.49,-0.487
6,"profile-transe, ap/ap",0.219,0.229,-0.128,-0.123,0.591,0.567
7,"profile-transe, c/c",0.212,0.229,-0.117,-0.112,0.567,0.55
8,"profile-transe, ap/c",0.187,0.208,-0.136,-0.134,0.601,0.581
9,"complex, ap/ap",0.275,0.292,-0.14,-0.142,0.577,0.564


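Computing correlation across all rows: here each method gets a single Spearman/Kendall coefficient computed over every row of the dataset at once, rather than the mean of the coefficients computed separately for each entity as in the previous cell.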

In [312]:
# Correlate every method's scores with the three survey labels over all rows
# of eval_trivia_df at once. Results are stored per method name in the six
# dicts below and then rendered as one table.
method_trivia_rho = {}
method_surprise_rho = {}
method_known_rho = {}
method_trivia_tau = {}
method_surprise_tau = {}
method_known_tau = {}

trivia = np.array(eval_trivia_df.loc[:,"goodTrivia"])
surprise = np.array(eval_trivia_df.loc[:,"surprising"])
known = np.array(eval_trivia_df.loc[:,"knew"])

# (label values, rho dict, tau dict) for each survey label, in display order
label_targets = [
    (trivia, method_trivia_rho, method_trivia_tau),
    (surprise, method_surprise_rho, method_surprise_tau),
    (known, method_known_rho, method_known_tau),
]


def _store_correlations(key, method_scores, keep=None):
    """Store Spearman rho and Kendall tau of method_scores vs. each label under `key`.

    key: name under which the coefficients are written into the rho/tau dicts.
    method_scores: one score per row of eval_trivia_df (must be a numpy array
        when `keep` is given).
    keep: optional boolean mask over rows; when given, only the selected rows
        of both the scores and the labels are correlated.
    """
    for labels, rho_by_method, tau_by_method in label_targets:
        if keep is None:
            xs, ys = method_scores, labels
        else:
            xs, ys = method_scores[keep], labels[keep]
        rho_by_method[key] = spearmanr(xs, ys)[0]
        tau_by_method[key] = kendalltau(xs, ys)[0]


for method_name, scores in method_to_scores.items():
    # Detect missing scores by value. A membership test such as
    # `np.nan in scores` only matches when the container holds the np.nan
    # object itself (membership checks identity before equality, and
    # NaN != NaN), so NaNs produced any other way would fall through to the
    # else branch and yield NaN correlations.
    score_arr = np.asarray(scores, dtype=float)
    missing = np.isnan(score_arr)
    if missing.any():
        present = ~missing
        # first the method without rows that are nan
        _store_correlations(f"{method_name} ({int(present.sum())} non-na)",
                            score_arr, keep=present)
        # now method with fallback to freq: rows without a score take the
        # "frequency" method's score for that row instead
        freq_scores = np.asarray(method_to_scores["frequency"], dtype=float)
        fallback_scores = np.where(missing, freq_scores, score_arr)
        _store_correlations(f"{method_name} + freq", fallback_scores)
    else:
        _store_correlations(method_name, scores)

# One table row per stored key: rho then tau for each label, as 3-decimal strings.
rows = [
    [key] + [f"{by_method[key]:.3f}"
             for _, rho_by_method, tau_by_method in label_targets
             for by_method in (rho_by_method, tau_by_method)]
    for key in method_trivia_rho
]

display(pd.DataFrame(rows, columns = ["Method",
                                      "rho-goodTrivia", "tau-goodTrivia",
                                      "rho-surprising", "tau-surprising",
                                      "rho-knew", "tau-knew"]))

Unnamed: 0,Method,rho-goodTrivia,tau-goodTrivia,rho-surprising,tau-surprising,rho-knew,tau-knew
0,"transe, ap/ap",0.455,0.326,0.335,0.236,-0.165,-0.122
1,"transe, c/c",0.466,0.325,0.351,0.241,-0.215,-0.163
2,"transe, ap/c",0.442,0.31,0.34,0.237,-0.217,-0.163
3,random-2,0.002,0.005,0.044,0.03,0.055,0.042
4,random-1,-0.003,-0.003,0.057,0.047,-0.194,-0.138
5,frequency,0.369,0.27,0.369,0.268,-0.427,-0.319
6,"profile-transe, ap/ap",0.302,0.214,0.206,0.15,-0.038,-0.033
7,"profile-transe, c/c",0.35,0.247,0.249,0.173,-0.124,-0.092
8,"profile-transe, ap/c",0.349,0.248,0.263,0.19,-0.131,-0.093
9,"complex, ap/ap",0.414,0.294,0.306,0.206,-0.122,-0.088
