## requirements
* tensorflow
* adjustText (pip install adjustText)

In [737]:
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [725]:
import pandas as pd
import os
import numpy as np
from tqdm.notebook import tqdm
import faiss
import h5py, torch
from torchbiggraph.model import ComplexDiagonalDynamicOperator, DotComparator, CosComparator
import json
from utility import kgtk_to_dataframe
from scipy.spatial import distance
from scipy.stats import spearmanr, kendalltau
from sklearn.metrics.pairwise import cosine_similarity
import random
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorboard.plugins import projector
from adjustText import adjust_text
import glob
import itertools
from collections import Counter, defaultdict
from functools import reduce, lru_cache

In [954]:
# Multiple-choice trivia quiz with ground-truth surprise annotations
# (loaded further down as `questions`).
surprise_data_file = "/data02/profiling/surprising-facts/mc_trivia_surprise_data.with_numeric_profile_qnodes.json"

# Output locations: `work_dir` holds derived tables, `store_dir` holds the
# kgtk graph cache (see the STORE environment variable set below).
work_dir = "./output/wikidata-20210215-dwd"
store_dir = f"{work_dir}/temp-surprise"

# KGTK input files that are still in use.
label_file = "./data/wikidata-20210215-dwd/labels.en.tsv.gz"
descriptions_file = "./data/wikidata-20210215-dwd/descriptions.en.tsv.gz"

# Inputs that are currently disabled.
# item_file = "./data/wikidata-20210215-dwd/claims.wikibase-item.tsv.gz"
# pagerank_file = "./data/wikidata-20210215-dwd/metadata.pagerank.directed.tsv.gz"
# embedding_files = {
#                    "Profile-ComplEx" : "./output/wikidata-20210215-dwd/profile_graph_embeddings/profile_graph_embeddings.ComplEx.tsv",
#                    "Profile-TransE" : "./output/wikidata-20210215-dwd/profile_graph_embeddings/profile_graph_embeddings.TransE.tsv"
#                   }

# Directory with the ComplEx entity-name list and the embedding matrix.
complex_dir = "/data02/profiling/wikidata-20210215-dwd-v2.complex-embeddings"
# NOTE(review): not referenced by any cell visible in this notebook section.
vector_dimension = 100


### Process params / set up variables

In [4]:
# Ensure paths are absolute.
# `store_dir` is deliberately left exactly as configured (its abspath call was
# disabled by the author), so STORE below keeps whatever form `store_dir` has.
work_dir = os.path.abspath(work_dir)
label_file = os.path.abspath(label_file)
descriptions_file = os.path.abspath(descriptions_file)

# Create directories. `exist_ok=True` makes the cell idempotent and avoids the
# check-then-create race of `if not os.path.exists(...)`.
output_dir = "{}/surprise_prediction".format(work_dir)
os.makedirs(output_dir, exist_ok=True)
os.makedirs(store_dir, exist_ok=True)

# Environment variables used by the `!kgtk ...` shell commands in this notebook.
os.environ['STORE'] = "{}/wikidata.sqlite3.db".format(store_dir)
os.environ['LABELS'] = label_file

### Load various things: profile labels, english labels, descriptions, embeddings

Load profile labels (reusing some work already done by another notebook here -- fun_fact_exploration)

In [5]:
# Profile-label table; every column is read as a string and missing cells
# become "" so later cells can compare against the empty string.
profile_labels_df = pd.read_csv(
    f"{work_dir}/explainability/profile_labels_info_joined.RELs_and_AILs.tsv",
    sep="\t",
    dtype=str,
).fillna("")

In [6]:
# Distinctiveness table: node1 = profile-label id, node2 = distinctiveness value.
label_distinctiveness_df = pd.read_csv(
    f"{work_dir}/explainability/profile_labels.RELs_and_AILs.distinctiveness.tsv",
    sep="\t",
)
label_distinctiveness_dict = dict(
    zip(label_distinctiveness_df["node1"], label_distinctiveness_df["node2"])
)

# Look up the distinctiveness of every profile label, in table order.
# A label id missing from the dict raises KeyError.
distinctivenesses = [
    label_distinctiveness_dict[label_id]
    for label_id in tqdm(profile_labels_df["id"], total=len(profile_labels_df))
]
profile_labels_df["distinctiveness"] = distinctivenesses

  0%|          | 0/621114 [00:00<?, ?it/s]

In [7]:
# English labels table; its node1/node2 columns become the keys/values of
# `labels_en_dict` in the next cell.
labels_en_df = pd.read_csv(label_file, sep='\t')

In [8]:
# node1 -> node2 lookup built from the English labels table.
labels_en_dict = dict(zip(labels_en_df["node1"], labels_en_df["node2"]))

In [9]:
# English descriptions table; its node1/node2 columns become the keys/values
# of `descriptions_dict` in the next cell.
descriptions_df = pd.read_csv(descriptions_file, sep='\t')

In [10]:
# node1 -> node2 lookup built from the English descriptions table.
descriptions_dict = dict(zip(descriptions_df["node1"], descriptions_df["node2"]))

In [11]:
def remove_lang_tag(label):
    """Strip the first character and the last four characters of `label`.

    For a value shaped like 'some text'@en this removes the opening quote and
    the trailing '@en (closing quote plus a two-letter language tag).
    Strings shorter than five characters yield "".
    """
    without_suffix = label[:-4]
    return without_suffix[1:]

In [None]:
# Build a readable "type, property, value" string for every profile label.
# Note this code assumes we are only using AILs and RELs
plab_labels = []
for _, row in tqdm(profile_labels_df.iterrows(), total=len(profile_labels_df)):
    type_label = remove_lang_tag(row["type_label"])
    property_label = remove_lang_tag(row["property_label"])
    value_node = row["node2"]
    if value_node == "":
        # No value node: describe the label by its numeric interval, followed
        # by a unit (Wikidata unit preferred over the SI unit) when one is set.
        value_label = "{}-{}".format(row["lower_bound"], row["upper_bound"])
        wd = row["wd_units"]
        si = row["si_units"]
        if wd != "":
            unit_text = remove_lang_tag(labels_en_dict[wd]) if wd in labels_en_dict else wd
            value_label = f"{value_label} {unit_text}"
        elif si != "":
            value_label = f"{value_label} {si}"
    elif value_node in labels_en_dict:
        value_label = remove_lang_tag(labels_en_dict[value_node])
    else:
        # No English label known: fall back to the raw node id.
        value_label = value_node
    plab_labels.append(f"{type_label}, {property_label}, {value_label}")
profile_labels_df["plab_label"] = plab_labels

In [None]:
# Keep only the columns that the rest of the notebook reads.
profile_labels_df = profile_labels_df[["id", "plab_label", "support", "distinctiveness"]]

Set up dictionaries mapping entities to their profile labels and profile labels to their entities, to speed up later lookups.

In [15]:
%%time
# Entity -> profile-label edge table (node1 = entity, node2 = profile-label id).
# NOTE: the ".shuffled.tsv" file read here is written by the shuffle cell in
# the "Methods for computing measure of surprise" section further down.
entity_prof_labels_df = pd.read_csv(f"{work_dir}/explainability/entity_profile_labels.RELs_and_AILs.shuffled.tsv", sep='\t')


CPU times: user 2min 2s, sys: 23.6 s, total: 2min 25s
Wall time: 2min 27s


In [17]:
# Map each entity (node1) to the list of its profile-label ids (node2).
%time ent_to_labels_dict = entity_prof_labels_df.groupby('node1')['node2'].apply(list).to_dict()

CPU times: user 10min 39s, sys: 39.7 s, total: 11min 19s
Wall time: 11min 18s


In [19]:
# Map each profile-label id (node2) to the list of entities (node1) carrying it.
%time label_to_ents_dict = entity_prof_labels_df.groupby('node2')['node1'].apply(list).to_dict()

CPU times: user 1min 25s, sys: 7.19 s, total: 1min 32s
Wall time: 1min 32s


In [20]:
# Both lookup dicts have been built, so drop the reference to the edge table.
del entity_prof_labels_df

#### Load embeddings

In [21]:
# Registry of embedding models: name -> {entity id: vector}. Filled in by the cells below.
embedding_models = {}

complex

In [22]:
# Load the entity-name list. The context manager closes the file handle,
# which the previous `json.load(open(...))` form left open.
print("loading entity names list...")
with open(f"{complex_dir}/entity_names_all_0.json") as names_file:
    entity_names_list = json.load(names_file)

# Load the embeddings: the whole "embeddings" dataset is read into memory.
print("loading all embeddings...")
with h5py.File(f"{complex_dir}/model/embeddings_all_0.v600.h5", "r") as hf:
    embeddings = hf["embeddings"][...]

# Row i of `embeddings` is paired with the i-th entity name.
print("creating complex embedding dict...")
complex_embs={}
for i in tqdm(range(len(entity_names_list))):
    complex_embs[entity_names_list[i]] = embeddings[i]

loading entity names list...
loading all embeddings...
creating complex embedding dict...


  0%|          | 0/53002670 [00:01<?, ?it/s]

In [23]:
# Register the ComplEx vectors; the key "complex" becomes part of the method names reported later.
embedding_models["complex"] = complex_embs

PCA text embeddings

In [24]:
pca_text_emb_file = "/data02/profiling/dwd-v3.text-embeddings.PCA100/faiss_index/kgtk_text_embeddings_all.PCA100.tsv"
text_emb_df = pd.read_csv(pca_text_emb_file, sep='\t')

# node1 is the entity id; node2 is the vector serialized as comma-separated numbers.
text_emb_dict = {}
for ent, serialized_vector in tqdm(zip(text_emb_df["node1"], text_emb_df["node2"])):
    text_emb_dict[ent] = np.float32(serialized_vector.split(','))

0it [00:00, ?it/s]

In [25]:
# Register the PCA-reduced text embeddings under the key "pca100_text".
embedding_models["pca100_text"] = text_emb_dict

Original text embeddings (subset)

In [26]:
# Directory whose files are globbed and loaded into `orig_embed_dict` by the next cell.
orig_embed_subsets_dir = "/data02/profiling/dwd-v3.class_subsets"

In [27]:
# Merge every file in the subsets directory into one entity -> vector dict.
# An entity that appears in several files keeps the vector from the last file read.
orig_embed_dict = {}
for filename in glob.glob(f"{orig_embed_subsets_dir}/*"):
    print(f"loading from file {filename}")
    embedding_df = pd.read_csv(filename, sep='\t')
    for ent, serialized_vector in tqdm(zip(embedding_df["node1"], embedding_df["node2"])):
        orig_embed_dict[ent] = np.float32(serialized_vector.split(','))

loading from file /data02/profiling/dwd-v3.class_subsets/Q5.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q3624078.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q532.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q23442.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q783794.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q3305213.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q11424.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q7725634.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q571.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q47461344.tsv.gz


0it [00:00, ?it/s]

loading from file /data02/profiling/dwd-v3.class_subsets/Q4830453.tsv.gz


0it [00:00, ?it/s]

In [28]:
# Register the original (non-PCA) text embeddings under the key "text1024".
embedding_models["text1024"] = orig_embed_dict

## Methods for computing measure of surprise

Shuffle entity-labels table so we can randomly sample

In [134]:
%%time
df = pd.read_csv(f"{work_dir}/explainability/entity_profile_labels.RELs_and_AILs.tsv", sep='\t', dtype=str).fillna("")
df = df.sample(frac=1).reset_index(drop=True)

CPU times: user 2min 43s, sys: 26.6 s, total: 3min 9s
Wall time: 3min 9s


In [135]:
# Peek at the first rows of the shuffled table.
df.head()

Unnamed: 0,node1,label,node2,id
0,Q19081699,P31_profile,Q2334719_P31_Q19692072,E13647036
1,Q64186296,P1436_profile,Q2668072_P1436_0.0-0.0__Q11723795,E69951718
2,Q80894539,P59_profile,Q2154519_P59_Q8910,E56344602
3,Q422894,P131_profile,Q3700011_P131_Q10365,E30885868
4,Q6236968,P735_profile,Q5_P735_Q4925477,E42678981


In [136]:
# Persist the shuffled table. This is the file read by the lookup-dict cell
# near the top of the notebook and by the `kgtk query` helper functions below.
df.to_csv(f"{work_dir}/explainability/entity_profile_labels.RELs_and_AILs.shuffled.tsv", sep='\t', index=False)

In [996]:
def get_entity_profile_labels_set(ent, ent_to_labels_dict=None):
    """Return the set of profile-label ids attached to entity `ent`.

    Parameters
    ----------
    ent : entity id to look up.
    ent_to_labels_dict : optional mapping entity id -> iterable of label ids.
        When given and it contains `ent`, the result is taken from it and no
        shell command is run.

    Returns
    -------
    set of profile-label ids. On the fallback path this is the "node2" column
    of the parsed `kgtk query` result.
    """
    # Fast path: in-memory lookup.
    if ent_to_labels_dict is not None and ent in ent_to_labels_dict:
        return set(ent_to_labels_dict[ent])
    # Fallback: IPython shell capture. `{work_dir}` and `{ent}` are interpolated
    # from Python; `$STORE` is the environment variable set in the setup cell.
    res = !kgtk query -i {work_dir}/explainability/entity_profile_labels.RELs_and_AILs.shuffled.tsv --graph-cache $STORE \
          --match 'profile_labels: (ent)-[]->(profile_label_id)' \
          --return 'distinct profile_label_id' \
          --where 'ent = "{ent}"'
    # kgtk_to_dataframe comes from the project's `utility` module; presumably it
    # parses the captured kgtk output into a DataFrame — confirm.
    return set(kgtk_to_dataframe(res).loc[:,"node2"])
    
def get_entity_profile_labels_df(profile_labels_df, ent, ent_to_labels_dict=None):
    """Return the rows of `profile_labels_df` whose "id" is a profile label of `ent`.

    The label ids come from `get_entity_profile_labels_set(ent, ent_to_labels_dict)`.
    """
    wanted_ids = get_entity_profile_labels_set(ent, ent_to_labels_dict)
    row_mask = profile_labels_df.loc[:, "id"].isin(wanted_ids)
    return profile_labels_df.loc[row_mask, :]

def get_entities_with_profile_label(label_id, ent_to_omit=None, limit=1000, label_to_ents_dict=None):
    """Return a set of at most `limit` entities that carry profile label `label_id`.

    Parameters
    ----------
    label_id : profile-label id.
    ent_to_omit : entity to exclude from the result (optional).
    limit : maximum number of entities to return.
    label_to_ents_dict : optional mapping label id -> list of entities. When
        given and it contains `label_id`, the sample is drawn from it with
        `np.random.choice` (global NumPy RNG, without replacement).

    Returns
    -------
    set of entity ids. On the fallback path this is the "node1" column of the
    parsed `kgtk query` result.
    """
    if label_to_ents_dict is not None and label_id in label_to_ents_dict:
        # Copy so that removing `ent_to_omit` does not mutate the shared dict entry.
        ents = list(label_to_ents_dict[label_id])
        if ent_to_omit is not None and ent_to_omit in ents:
            # list.remove drops only the first occurrence.
            ents.remove(ent_to_omit)
        return set(np.random.choice(ents, min(len(ents),limit), replace=False))
    # Fallback: IPython shell capture with `{...}` interpolation.
    # NOTE(review): with ent_to_omit=None the literal text "None" is presumably
    # interpolated into the where clause — confirm this is harmless.
    res = !kgtk query -i {work_dir}/explainability/entity_profile_labels.RELs_and_AILs.shuffled.tsv --graph-cache $STORE \
        --match 'profile_labels: (ent)-[]->(profile_label_id)' \
        --return 'distinct ent' \
        --where 'ent != "{ent_to_omit}" AND profile_label_id = "{label_id}"' \
        --limit {limit}
    return set(kgtk_to_dataframe(res).loc[:,"node1"])

def get_ents_of_type(ent_to_omit, ent_type, limit=1000):
    res = !kgtk query -i {work_dir}/explainability/entity_profile_labels.RELs_and_AILs.shuffled.tsv --graph-cache $STORE \
        --match 'profile_labels: (ent)-[]->(profile_label_id)' \
        --return 'distinct ent' \
        --where 'ent != "{ent_to_omit}" AND printf("%.{len(ent_type)}s", profile_label_id) = "{ent_type}"' \
        --limit {limit}
    return set(kgtk_to_dataframe(res).loc[:,"node1"])

def compute_surprise_metrics_for_sample(ent, sample_ents, embedding_dict, pairwise_disp_args,
                                        pairwise_sample=10000):
    """Compute distance/dispersion surprise metrics of `ent` relative to a sample.

    Parameters
    ----------
    ent : entity of interest; must be a key of `embedding_dict`.
    sample_ents : iterable of entity ids; ids without an embedding are skipped.
    embedding_dict : mapping entity id -> vector.
    pairwise_disp_args : dict with keys "fact_ids", "class" and "emb_name",
        forwarded to `compute_avg_pairwise_dist_in_sample` (which draws its
        own sample and caches the result).
    pairwise_sample : forwarded to `compute_avg_pairwise_dist_in_sample`.

    Returns
    -------
    dict mapping metric name -> value. All distances are cosine distances.
    """
    peer_vecs = np.array([embedding_dict[name] for name in sample_ents if name in embedding_dict])
    target_vec = embedding_dict[ent]

    # Centroid-based quantities.
    center = np.mean(peer_vecs, axis=0)
    dispersion_centroid = np.mean(np.array([distance.cosine(center, vec) for vec in peer_vecs]))
    distance_centroid = distance.cosine(center, target_vec)

    # Mean distance from the entity of interest to each sample member.
    distance_pairwise = np.mean(np.array([distance.cosine(target_vec, vec) for vec in peer_vecs]))

    # Mean distance between random pairs inside the (cached) sample.
    dispersion_pairwise = compute_avg_pairwise_dist_in_sample(
        pairwise_disp_args["fact_ids"],
        pairwise_disp_args["class"],
        pairwise_disp_args["emb_name"],
        pairwise_sample,
    )

    # Key order matters to callers that turn these entries into columns.
    return {
        "dispersion (centroid)": dispersion_centroid,
        "distance (centroid)": distance_centroid,
        "distance(centroid)/dispersion(centroid)": distance_centroid / dispersion_centroid,
        "distance (avg pairwise)": distance_pairwise,
        "distance(avg pairwise)/dispersion(centroid)": distance_pairwise / dispersion_centroid,
        "dispersion (avg pairwise)": dispersion_pairwise,
        "distance(avg pairwise)/dispersion(avg pairwise)": distance_pairwise / dispersion_pairwise,
    }

@lru_cache(maxsize=None)
def compute_avg_pairwise_dist_in_sample(fact_ids, ent_class, emb_name, pairwise_sample=10000):
    """Average cosine distance between random pairs of entities in a sample.

    Results are memoized by `lru_cache` on the argument values, so a given
    (fact_ids, ent_class, emb_name, pairwise_sample) combination is sampled
    and evaluated only once per kernel session.

    Assumptions:
        * label_to_ents_dict exists and is accessible here
        * embedding_models exists and is accessible here

    Parameters
    ----------
    fact_ids : "|"-separated profile-label ids, or None. The sample is the
        union of the entity samples of each label.
    ent_class : class id, or None. Used only when `fact_ids` is None.
    emb_name : key into the global `embedding_models`.
    pairwise_sample : per-label/per-class sample limit and also the number of
        random pairs that are averaged.

    Returns
    -------
    Mean cosine distance over `pairwise_sample` random pairs.

    Raises
    ------
    AssertionError : if both or neither of `fact_ids` / `ent_class` are given.
    ValueError : if fewer than two sampled entities have an embedding.
    """
    # Explicit raises instead of `assert` so the validation also runs under
    # `python -O` (a stripped assert would leave `sample_ents` undefined).
    if fact_ids is not None:
        if ent_class is not None:
            raise AssertionError("One of fact_ids or ent_class should be None.")
        sample_ents = set()
        for fact in fact_ids.split("|"):
            sample_ents = sample_ents | get_entities_with_profile_label(fact, "", pairwise_sample, label_to_ents_dict)
    elif ent_class is not None:
        sample_ents = get_ents_of_type("", ent_class, pairwise_sample)
    else:
        raise AssertionError("Both fact_ids and ent_class are None")

    # choose embedding model we are using
    embedding_dict = embedding_models[emb_name]

    # get embeddings (entities without an embedding are skipped)
    sample_embeds = np.array([embedding_dict[s] for s in sample_ents if s in embedding_dict])
    num_embeds = sample_embeds.shape[0]
    if pairwise_sample > 0 and num_embeds < 2:
        # A pair cannot be drawn; fail with a message that names the cause
        # rather than the generic error np.random.choice would raise.
        raise ValueError(
            f"Need at least two sampled entities with '{emb_name}' embeddings to "
            f"compute pairwise dispersion, found {num_embeds}."
        )

    # avg pairwise dist within sample: each draw picks two distinct rows
    sample_dists = []
    for _ in range(pairwise_sample):
        e1, e2 = sample_embeds[np.random.choice(num_embeds, size=2, replace=False), :]
        sample_dists.append(distance.cosine(e1, e2))
    return np.mean(sample_dists)

def compute_avg_dist_from_ent_to_sample(ent, sample_ents, embedding_dict):
    """Return the mean cosine distance from `ent` to the members of `sample_ents`.

    `ent` must be a key of `embedding_dict`; sample members without an
    embedding are skipped.
    """
    target_vec = embedding_dict[ent]
    peer_vecs = np.array([embedding_dict[name] for name in sample_ents if name in embedding_dict])
    per_peer_dists = np.array([distance.cosine(target_vec, vec) for vec in peer_vecs])
    return np.mean(per_peer_dists)

def compute_surprise_metrics_for_ent_fact(ent, fact_ids, embedding_models, label_to_ents_dict=None,
                                          sample=10000, pairwise_sample=10000):
    """Compute surprise metrics for `ent` against entities sharing any of `fact_ids`.

    The comparison sample is the union of the per-fact entity samples (each
    capped at `sample`, with `ent` itself left out).

    Returns
    -------
    dict mapping embedding-model name -> metrics dict as produced by
    `compute_surprise_metrics_for_sample`.
    """
    peers = set()
    for fact_id in fact_ids:
        peers.update(get_entities_with_profile_label(fact_id, ent, sample, label_to_ents_dict))

    # The joined fact ids (plus the model name) act as the cache key for the
    # pairwise-dispersion computation.
    joined_fact_ids = "|".join(fact_ids)

    results = {}
    for model_name, model_embeddings in embedding_models.items():
        disp_args = {"fact_ids": joined_fact_ids, "class": None, "emb_name": model_name}
        results[model_name] = compute_surprise_metrics_for_sample(
            ent, peers, model_embeddings, disp_args, pairwise_sample
        )
    return results

def compute_surprise_metrics_for_df(ent, facts_df, embedding_models, label_to_ents_dict=None, sample=10000, pairwise_sample=10000):
    """Add one surprise-metric column per (metric, embedding model) to `facts_df`.

    For every profile label in `facts_df["id"]`, entities sharing that label
    are sampled and the metrics of `ent` relative to them are computed with
    each embedding model. Columns are named "<metric> - <model name>".

    `facts_df` is modified in place and also returned.
    """
    columns = {}
    for label_id in tqdm(facts_df.loc[:, "id"]):
        peers = get_entities_with_profile_label(label_id, ent, sample, label_to_ents_dict)
        for model_name, model_embeddings in embedding_models.items():
            # label id + model name act as the cache key for pairwise dispersion
            disp_args = {"fact_ids": label_id, "class": None, "emb_name": model_name}
            label_metrics = compute_surprise_metrics_for_sample(
                ent, peers, model_embeddings, disp_args, pairwise_sample
            )
            for metric_name, value in label_metrics.items():
                columns.setdefault(f"{metric_name} - {model_name}", []).append(value)

    for column_name, values in columns.items():
        facts_df.loc[:, column_name] = values

    return facts_df

def compute_surprise_metrics_sampling_by_type(ent, ent_type, embedding_models,
                                              sample=10000, pairwise_sample=10000):
    """Compute surprise metrics for `ent` against a sample of entities of class `ent_type`.

    Returns
    -------
    dict mapping embedding-model name -> metrics dict as produced by
    `compute_surprise_metrics_for_sample`.
    """
    peers = get_ents_of_type(ent, ent_type, sample)
    # The class id (plus the model name) acts as the cache key for the
    # pairwise-dispersion computation.
    return {
        model_name: compute_surprise_metrics_for_sample(
            ent,
            peers,
            model_embeddings,
            {"fact_ids": None, "class": ent_type, "emb_name": model_name},
            pairwise_sample,
        )
        for model_name, model_embeddings in embedding_models.items()
    }

# Baselines for measuring surprise
def get_surprise_scores_random(fact_ids):
    """Random baseline: one uniform [0, 1) score per entry of `fact_ids`.

    Draws from the global NumPy random state.
    """
    num_scores = len(fact_ids)
    return np.random.rand(num_scores)

def get_surprise_scores_freq(fact_ids, profile_labels_df):
    """Frequency baseline: score each answer as 1 minus the summed support of its facts.

    Parameters
    ----------
    fact_ids : iterable of iterables of profile-label ids; one inner iterable
        per answer (an answer can consist of several facts, see the "canvas"
        answer in the mc quiz).
    profile_labels_df : DataFrame with an "id" column and a "support" column
        whose values are convertible with `float()`.

    Returns
    -------
    list with one score per answer, in input order.

    Raises
    ------
    ValueError : if a fact id does not match exactly one row of `profile_labels_df`.
    """
    # Extract the columns once instead of once per fact lookup.
    ids = profile_labels_df["id"].to_numpy()
    supports = profile_labels_df["support"].to_numpy()

    scores = []
    for f_ids in fact_ids:
        freq = 0
        for f_id in f_ids:
            matches = supports[ids == f_id]
            if len(matches) != 1:
                # Previously surfaced as an opaque float(Series) conversion error.
                raise ValueError(
                    f"Expected exactly one profile label with id {f_id!r}, found {len(matches)}."
                )
            freq += float(matches[0])
        scores.append(1 - freq)
    return scores


# plotting function
def plot_surprise_metrics(df, x_col, y_col, label_col, size_col, color_col=None):
    """Scatter-plot one point per row of `df`, sized (and optionally colored) by a column.

    Parameters
    ----------
    df : DataFrame with the columns named below.
    x_col, y_col : columns used as point coordinates and axis labels.
    label_col : column with the legend text; the part after the last ", " is
        also drawn next to the point.
    size_col : column that controls marker size (also used as the plot title).
        Values are converted to float and rescaled to the range 10..160.
    color_col : optional column with values 0, 1 or 2, mapped to
        grey / yellow / green. When None, matplotlib's default colors are used.
    """
    df = df.sort_values(size_col, ascending=False)
    labels = list(df.loc[:, label_col])
    x = list(df.loc[:, x_col])
    y = list(df.loc[:, y_col])

    # Float copy: the in-place rescaling below would fail on integer or
    # string-typed columns.
    size = np.array(df.loc[:, size_col], dtype=float)
    if size.size:
        size -= np.min(size)
        largest = np.max(size)
        # When all sizes are equal, skip scaling instead of dividing by zero.
        if largest > 0:
            size *= 150 / largest
    size += 10

    # `color` must be defined even when no color column is requested.
    color = None
    if color_col is not None:
        color_dict = {0: "grey", 1: "yellow", 2: "green"}
        color = [color_dict[c] for c in df.loc[:, color_col]]

    fig, ax = plt.subplots()
    for i in range(len(x)):
        c = color[i] if color is not None else None
        ax.scatter(x[i], y[i], c=c, s=size[i], label=labels[i])

    # Short text labels next to the points, nudged apart by adjustText.
    texts = [ax.text(x[i], y[i], labels[i].split(", ")[-1]) for i in range(len(x))]
    adjust_text(texts, x=x, y=y)

    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
    ax.set_ylabel(y_col)
    ax.set_xlabel(x_col)
    ax.set_title(size_col)
    ax.grid(True)

    fig.set_figwidth(8)
    fig.set_figheight(8)

    plt.show()

## Load MC quiz data

In [969]:
# Keep a reference to any previously loaded quiz data before the next cell
# reloads it. On a fresh kernel nothing is loaded yet, so fall back to None
# instead of raising NameError.
questions_orig = globals().get("questions")

In [975]:
# Load the quiz: a JSON document that the following cells iterate over as a
# list of question dicts.
with open(surprise_data_file) as quiz_file:
    questions = json.loads(quiz_file.read())

Add profile label ids for each answer and an empty dict for storing computed method surprise scores

In [976]:
all_profile_labels = set(profile_labels_df["id"])

# Rebuild the profile-label id "<class>_<property>_<qnode>[__<wd_units>]" for
# every answer qnode and check that it is a known profile label.
for i, q in enumerate(questions):
    for answer in q["answers"]:
        ans_fact_ids = []
        for qnode in answer["qnodes"]:
            fact_id = "_".join([q["class"], q["property"], qnode])
            if q["wd_units"] is not None:
                fact_id = fact_id + "__" + q["wd_units"]
            assert fact_id in all_profile_labels, f"{fact_id} not found in loaded profile labels\n" +\
                f"Question {i}: \'{q['lexicalized']}\'"
            ans_fact_ids.append(fact_id)
        answer["fact_ids"] = ans_fact_ids
        # filled in later with one surprise score per method
        answer["method_surprise_scores"] = {}
    # per-question correlation of each method with the ground truth
    q["method_spearman"] = {}
    q["method_kendalltau"] = {}

### Compute surprise of all facts in quiz using our method

In [992]:
# The three ratio metrics we evaluate, with the short names used in method labels.
surprise_metric_abbrevs = {'distance(avg pairwise)/dispersion(avg pairwise)': "ap/ap",
                           'distance(centroid)/dispersion(centroid)': "c/c",
                           'distance(avg pairwise)/dispersion(centroid)': "ap/c"
                          }

question_answer_pairs = [(q, answer) for q in questions for answer in q["answers"]]
for q, answer in tqdm(question_answer_pairs):
    emb_metrics_dict = compute_surprise_metrics_for_ent_fact(
        q["entity"], answer["fact_ids"], embedding_models, label_to_ents_dict
    )
    answer_scores = answer["method_surprise_scores"]
    # One score per (embedding model, metric) pair, keyed "<model>, <abbrev>".
    for emb_name, metrics_dict in emb_metrics_dict.items():
        for metric_name, abbrev in surprise_metric_abbrevs.items():
            answer_scores[f"{emb_name}, {abbrev}"] = metrics_dict[metric_name]

  0%|          | 0/118 [00:00<?, ?it/s]

## Correlation of facts within single question

### Baselines

In [998]:
num_trials = 500

for q in tqdm(questions):

    ans_surprise_gts = [ans["gt_surprise"] for ans in q["answers"]]
    fact_ids = [ans["fact_ids"] for ans in q["answers"]]

    # Random baseline: average the correlations over `num_trials` random scorings.
    trial_rhos = []
    trial_taus = []
    for trial in range(num_trials):
        random_preds = get_surprise_scores_random(fact_ids)
        r, r_pval = spearmanr(ans_surprise_gts, random_preds)
        t, t_pval = kendalltau(ans_surprise_gts, random_preds)
        trial_rhos.append(r)
        trial_taus.append(t)
    q["method_spearman"]["random"] = np.mean(trial_rhos)
    q["method_kendalltau"]["random"] = np.mean(trial_taus)

    # Frequency baseline.
    freq_preds = get_surprise_scores_freq(fact_ids, profile_labels_df)
    if all(freq_preds[0] == np.array(freq_preds)):
        # All frequencies identical, so no ranking is possible: fall back to random.
        q["method_spearman"]["frequency"] = q["method_spearman"]["random"]
        q["method_kendalltau"]["frequency"] = q["method_kendalltau"]["random"]
    else:
        r, r_pval = spearmanr(ans_surprise_gts, freq_preds)
        t, t_pval = kendalltau(ans_surprise_gts, freq_preds)
        q["method_spearman"]["frequency"] = r
        q["method_kendalltau"]["frequency"] = t

# Summary table: mean per-question correlation of each baseline.
header = ["", "Spearman", "KT"]
rows = []
for method in ["random", "frequency"]:
    mean_rho = np.mean([q["method_spearman"][method] for q in questions])
    mean_tau = np.mean([q["method_kendalltau"][method] for q in questions])
    rows.append([method, f"{mean_rho:.3f}", f"{mean_tau:.3f}"])
df = pd.DataFrame(rows, columns=header)
display(df)

  0%|          | 0/24 [00:00<?, ?it/s]

Unnamed: 0,Unnamed: 1,Spearman,KT
0,random,0.003,0.003
1,frequency,0.066,0.074


Now splitting by different question subsets

In [1004]:
# Split the questions two ways: by answer kind (qnode vs. numeric) and by
# answer count (single vs. multi).
qnode_questions, numeric_questions = [], []
single_ans_questions, multi_ans_questions = [], []
for question in questions:
    if question["is_numeric_answer"]:
        numeric_questions.append(question)
    else:
        qnode_questions.append(question)
for question in questions:
    if question["is_single_answer"]:
        single_ans_questions.append(question)
    else:
        multi_ans_questions.append(question)

In [1008]:
question_subsets = {"qnode": qnode_questions,
                    "num": numeric_questions,
                    "single": single_ans_questions,
                    "multi": multi_ans_questions
                   }
# Table header: a blank method column, then a Rho and a Tau column per subset.
header = [""] + [
    f"{stat} ({subset_name})"
    for subset_name in question_subsets
    for stat in ("Rho", "Tau")
]

In [1009]:
# Mean per-question correlation of each baseline within every question subset.
rows = []
for method in ["random", "frequency"]:
    cells = [method]
    for question_subset in question_subsets.values():
        mean_rho = np.mean([q["method_spearman"][method] for q in question_subset])
        mean_tau = np.mean([q["method_kendalltau"][method] for q in question_subset])
        cells += [f"{mean_rho:.3f}", f"{mean_tau:.3f}"]
    rows.append(cells)
df = pd.DataFrame(rows, columns=header)
display(df)

Unnamed: 0,Unnamed: 1,Rho (qnode),Tau (qnode),Rho (num),Tau (num),Rho (single),Tau (single),Rho (multi),Tau (multi)
0,random,-0.003,-0.002,0.024,0.019,0.023,0.019,-0.005,-0.003
1,frequency,0.043,0.055,0.134,0.129,0.108,0.095,0.049,0.065


### Statistical methods

#### Correlation of facts within each question

Compute per-question correlation

In [1003]:
# For every question, correlate each method's scores with the ground truth.
# The method names are taken from the first answer of the question.
for q in questions:
    answers = q["answers"]
    ans_surprise_gts = [ans["gt_surprise"] for ans in answers]
    for method_name in answers[0]["method_surprise_scores"]:
        ans_surprise_preds = [ans["method_surprise_scores"][method_name] for ans in answers]
        q["method_spearman"][method_name] = spearmanr(ans_surprise_gts, ans_surprise_preds)[0]
        q["method_kendalltau"][method_name] = kendalltau(ans_surprise_gts, ans_surprise_preds)[0]

View the average per-question correlation for each method.

In [1010]:
# Mean per-question correlation of every method (baselines included) within
# each question subset. Method names are taken from the first question.
rows = []
for method_name in questions[0]["method_spearman"]:
    cells = [method_name]
    for question_subset in question_subsets.values():
        mean_rho = np.mean([q["method_spearman"][method_name] for q in question_subset])
        mean_tau = np.mean([q["method_kendalltau"][method_name] for q in question_subset])
        cells += [f"{mean_rho:.3f}", f"{mean_tau:.3f}"]
    rows.append(cells)

df = pd.DataFrame(rows, columns=header)
display(df)
            

Unnamed: 0,Unnamed: 1,Rho (qnode),Tau (qnode),Rho (num),Tau (num),Rho (single),Tau (single),Rho (multi),Tau (multi)
0,random,-0.003,-0.002,0.024,0.019,0.023,0.019,-0.005,-0.003
1,frequency,0.043,0.055,0.134,0.129,0.108,0.095,0.049,0.065
2,"complex, ap/ap",0.551,0.457,0.481,0.42,0.53,0.465,0.535,0.44
3,"complex, c/c",0.565,0.48,0.388,0.327,0.45,0.386,0.55,0.464
4,"complex, ap/c",0.54,0.446,0.355,0.271,0.422,0.338,0.523,0.428
5,"pca100_text, ap/ap",0.463,0.397,0.39,0.285,0.4,0.289,0.463,0.402
6,"pca100_text, c/c",0.47,0.399,0.424,0.34,0.429,0.337,0.47,0.404
7,"pca100_text, ap/c",0.431,0.361,0.297,0.247,0.321,0.257,0.429,0.364
8,"text1024, ap/ap",0.563,0.491,0.542,0.466,0.589,0.505,0.545,0.476
9,"text1024, c/c",0.563,0.49,0.49,0.394,0.544,0.443,0.545,0.476


### The below sections can be skipped for now, up until "Link Prediction Method"
#### Correlation of facts across all questions (NO normalization)

In [742]:
# NOTE(review): this cell reads "pred_counts", "single_answer" and
# "ans_to_surprise_metrics_dict", whereas the cells above use "answers",
# "is_single_answer" and "method_surprise_scores" — it appears to target an
# earlier quiz-data layout; confirm before re-running.
max_count = 26 # 26 test takers
emb_name = 'text1024'
surprise_metric = 'distance(avg pairwise)/dispersion(avg pairwise)'
single_ans_gts, single_ans_preds = [], []
multi_ans_gts, multi_ans_preds = [], []
for q in questions:
    answers = list(q["pred_counts"].keys())
    if q["single_answer"]:
        gts_bucket, preds_bucket = single_ans_gts, single_ans_preds
    else:
        gts_bucket, preds_bucket = multi_ans_gts, multi_ans_preds
    # Ground-truth surprise: the number of test takers who did NOT pick the answer.
    gts_bucket.extend([max_count - count for count in q["pred_counts"].values()])
    preds_bucket.extend([q["ans_to_surprise_metrics_dict"][ans][emb_name][surprise_metric] for ans in answers])

print("correlation across all facts (no normalization)")
report_sections = [
    ("\tmulti-answer questions only", multi_ans_gts, multi_ans_preds),
    ("\tsingle-answer questions only", single_ans_gts, single_ans_preds),
    ("\tall questions", multi_ans_gts + single_ans_gts, multi_ans_preds + single_ans_preds),
]
for section_title, section_gts, section_preds in report_sections:
    print(section_title)
    rho, rho_pval = spearmanr(section_gts, section_preds)
    tau, tau_pval = kendalltau(section_gts, section_preds)
    print(f"\t\tSpearman: {rho:.3f}, pval={rho_pval:.3f}")
    print(f"\t\tKT: {tau:.3f}, pval={tau_pval:.3f}")

correlation across all facts (no normalization)
	multi-answer questions only
		Spearman: 0.274, pval=0.011
		KT: 0.182, pval=0.016
	single-answer questions only
		Spearman: 0.294, pval=0.097
		KT: 0.213, pval=0.092
	all questions
		Spearman: 0.142, pval=0.125
		KT: 0.086, pval=0.175


**Hypothesis:** Inferring surprise/unexpectedness from the number of people who chose an answer will not be directly comparable between single-answer and multi-answer questions. In multi-answer questions, every answer can be chosen by up to the total number of participants, which makes each answer similar to a separate true/false question. In single-answer questions, by contrast, picking one answer means the participant cannot choose any other. As a result, even if no answer is very surprising, some answers may be chosen by very few people simply because a similar answer seemed like a safer bet.

**Observation:** The above result supports this hypothesis.

**What to do about this:** Either evaluate single and multi answers separately or come up with another surprise-score-inference method that is more comparable across the two kinds of questions.

Let's check if the effect we are seeing above is due to the decrease in number of facts being compared...

In [743]:
# Number of facts in each group.
for list_name, values in (("single_ans_gts", single_ans_gts), ("multi_ans_gts", multi_ans_gts)):
    print(f"len({list_name}): {len(values)}")

len(single_ans_gts): 33
len(multi_ans_gts): 85


In [744]:
# Mix the single-answer facts with an equally sized slice of the multi-answer
# facts (33 = len(single_ans_gts), printed above).
spearmanr(multi_ans_gts[:33] + single_ans_gts, multi_ans_preds[:33] + single_ans_preds)

SpearmanrResult(correlation=-0.03149116526040072, pvalue=0.8018067539647361)

In [745]:
# The first 33 multi-answer facts on their own, for comparison.
spearmanr(multi_ans_gts[:33], multi_ans_preds[:33])

SpearmanrResult(correlation=0.17886051056359697, pvalue=0.3192897690462346)

In [746]:
# The single-answer facts on their own, for comparison.
spearmanr(single_ans_gts,single_ans_preds)

SpearmanrResult(correlation=0.2939105369545495, pvalue=0.09687748220540489)

Safe to say it is not due to decrease in size.

#### Correlation of facts across all questions (with normalization)

Start by gathering samples for each class we are dealing with

In [699]:
# ent_class_to_sample = {}
# for q in tqdm(questions):
#     ent_class = q["class"]
#     if ent_class not in ent_class_to_sample:
#         ent_class_to_sample[ent_class] = get_ents_of_type("", ent_class, limit=10000)

  0%|          | 0/24 [00:00<?, ?it/s]

Now we can compute dispersion of each class once

In [None]:
# class_dispersion_dict = {}
# for ent_class, sample_ents in tqdm(ent_class_to_sample.items()):
#     class_dispersion_dict[ent_class] = compute_avg_pairwise_dist_in_sample(sample_ents, embedding_dict, pariwise_sample=10000)

# compute_avg_dist_from_ent_to_sample(ent, sample_ents, embedding_dict)

Ignore the two commented-out cells above; the functions were changed to use `lru_cache` instead.

In [730]:
# Surprise of each question's entity relative to a sample of entities of its class.
for q in tqdm(questions):
    class_metrics = compute_surprise_metrics_sampling_by_type(q["entity"], q["class"], embedding_models)
    q["class_surprise_metrics_dict"] = class_metrics

  0%|          | 0/24 [00:00<?, ?it/s]

In [748]:
# NOTE(review): this cell reads "pred_counts", "single_answer" and
# "ans_to_surprise_metrics_dict", whereas the cells above use "answers",
# "is_single_answer" and "method_surprise_scores" — it appears to target an
# earlier quiz-data layout; confirm before re-running.
max_count = 26 # 26 test takers
emb_name = 'text1024'
surprise_metric = 'distance(avg pairwise)/dispersion(avg pairwise)'
single_ans_gts = []
single_ans_preds = []
multi_ans_gts = []
multi_ans_preds = []
for q in questions:
    answers = list(q["pred_counts"].keys())
    # Ground-truth surprise: the number of test takers who did NOT pick the answer.
    # (A stray `ans_surprise_gts.extend(...)` that mutated a leftover variable
    # from another cell, and was never read here, has been removed.)
    q_ans_surprise_gts = [max_count - count for count in q["pred_counts"].values()]

    # Normalize each fact's surprise by how surprising the entity is within its class.
    surprise_of_ent_in_class = q["class_surprise_metrics_dict"][emb_name][surprise_metric]
    q_ans_surprise_preds = []
    for ans in answers:
        unnormalized_surprise = q["ans_to_surprise_metrics_dict"][ans][emb_name][surprise_metric]
        q_ans_surprise_preds.append(unnormalized_surprise / surprise_of_ent_in_class)

    if q["single_answer"]:
        single_ans_gts.extend(q_ans_surprise_gts)
        single_ans_preds.extend(q_ans_surprise_preds)
    else:
        multi_ans_gts.extend(q_ans_surprise_gts)
        multi_ans_preds.extend(q_ans_surprise_preds)

print("correlation across all facts (WITH normalization)")
report_sections = [
    ("\tmulti-answer questions only", multi_ans_gts, multi_ans_preds),
    ("\tsingle-answer questions only", single_ans_gts, single_ans_preds),
    ("\tall questions", multi_ans_gts + single_ans_gts, multi_ans_preds + single_ans_preds),
]
for section_title, section_gts, section_preds in report_sections:
    print(section_title)
    rho, rho_pval = spearmanr(section_gts, section_preds)
    tau, tau_pval = kendalltau(section_gts, section_preds)
    print(f"\t\tSpearman: {rho:.3f}, pval={rho_pval:.3f}")
    print(f"\t\tKT: {tau:.3f}, pval={tau_pval:.3f}")

correlation across all facts (WITH normalization)
	multi-answer questions only
		Spearman: 0.354, pval=0.001
		KT: 0.246, pval=0.001
	single-answer questions only
		Spearman: 0.191, pval=0.287
		KT: 0.150, pval=0.235
	all questions
		Spearman: 0.186, pval=0.044
		KT: 0.122, pval=0.055


**Observation:** As I expected, normalization helps when comparing facts about different entities for the multi-answer questions. However, it appears to hurt on the single-answer questions. Why could this be? The p-value for the single-answer correlation is much larger here, so the drop may simply be due to chance.

**Note** I do not think we can say that normalization helps in general here since the increase in correlation on "all-questions" could be due to the higher number of multi-answer question facts.

Look at what normalized values look like

In [736]:
# Show one table per question, walking the questions in ascending order of their
# per-question Spearman correlation. Columns are the answers ordered by ground-truth
# surprise; rows are the ground truth, the raw prediction and the normalized prediction.
for q in sorted(questions, key=lambda question: question["spearman"][emb_name][surprise_metric]):
    q_spearman = q["spearman"][emb_name][surprise_metric]
    answers = list(q["pred_counts"])
    ans_surprise_gts = [max_count - q["pred_counts"][ans] for ans in answers]
    ans_surprise_preds = [
        q["ans_to_surprise_metrics_dict"][ans][emb_name][surprise_metric]
        for ans in answers
    ]
    # Normalizer: surprise of the question's entity within its class.
    surprise_of_ent_in_class = q["class_surprise_metrics_dict"][emb_name][surprise_metric]
    ans_surprise_preds_norm = [pred / surprise_of_ent_in_class for pred in ans_surprise_preds]
    gt_order = np.argsort(ans_surprise_gts)
    rows = [[""] + [answers[idx] for idx in gt_order]]
    rows.append(["gt surprise"] + [str(ans_surprise_gts[idx]) for idx in gt_order])
    rows.append(["pred surprise"] + [f"{ans_surprise_preds[idx]:.2f}" for idx in gt_order])
    rows.append(["normalized pred"] + [f"{ans_surprise_preds_norm[idx]:.2f}" for idx in gt_order])
    # rows[0] is the header row, the rest is the table body.
    display(pd.DataFrame(rows[1:], columns=rows[0]))
    print(f"spearman r: {q_spearman}")

Unnamed: 0,Unnamed: 1,Singer-songwriter,Dancer,Entrepreneur,Model,Voice Actor
0,gt surprise,1.0,7.0,9.0,11.0,14.0
1,pred surprise,1.48,1.32,1.6,1.12,1.47
2,normalized pred,1.19,1.06,1.28,0.9,1.18


spearman r: -0.3


Unnamed: 0,Unnamed: 1,Painter,Engineer,Chemist,Zoologist,Diplomat
0,gt surprise,0.0,5.0,14.0,19.0,20.0
1,pred surprise,1.66,1.17,1.32,1.36,1.21
2,normalized pred,1.66,1.17,1.32,1.36,1.22


spearman r: -0.3


Unnamed: 0,Unnamed: 1,> 240,> 210 and <= 240,> 195 and <= 210,> 1 and <= 181,> 181 and <= 195
0,gt surprise,15.0,15.0,22.0,26.0,26.0
1,pred surprise,2.82,1.39,1.38,1.65,1.45
2,normalized pred,2.01,0.99,0.99,1.18,1.03


spearman r: 0.0


Unnamed: 0,Unnamed: 1,Amateur Wrestling,Boxing,Rugby,Baseball,American Football
0,gt surprise,8.0,17.0,22.0,23.0,24.0
1,pred surprise,4.3,2.17,2.57,2.73,3.74
2,normalized pred,3.85,1.94,2.3,2.44,3.35


spearman r: 0.0


Unnamed: 0,Unnamed: 1,Republican Party,Democratic Party,Communist Party of the Soviet Union,National Socialist German Workers' Party,Social Democratic Party of Germany
0,gt surprise,0.0,10.0,26.0,26.0,26.0
1,pred surprise,1.6,1.7,1.76,1.52,1.83
2,normalized pred,1.14,1.21,1.25,1.08,1.31


spearman r: 0.3354101966249684


Unnamed: 0,Unnamed: 1,Basketball Player,Television Actor,Writer,Screenwriter,Researcher
0,gt surprise,0.0,13.0,22.0,23.0,25.0
1,pred surprise,1.91,1.61,1.66,1.69,3.91
2,normalized pred,1.67,1.41,1.45,1.48,3.42


spearman r: 0.39999999999999997


Unnamed: 0,Unnamed: 1,62.5 to 65.0,55.0 to 60.0,60.0 to 62.5,65.25 to 67.0
0,gt surprise,16.0,19.0,21.0,22.0
1,pred surprise,0.86,1.0,1.01,0.87
2,normalized pred,0.98,1.14,1.15,1.0


spearman r: 0.39999999999999997


Unnamed: 0,Unnamed: 1,Romantic Comedy,Fantasy Film,Film Based on a Novel,Drama,Musical Film
0,gt surprise,8.0,10.0,15.0,23.0,23.0
1,pred surprise,1.02,1.09,1.08,1.11,1.06
2,normalized pred,0.93,1.0,0.98,1.01,0.97


spearman r: 0.46169025843831935


Unnamed: 0,Unnamed: 1,8.4 to 27.9,2.6 to 5.7,1.5 to 2.5,0.3 to 1.4,0.0 to 0.2
0,gt surprise,15.0,15.0,24.0,25.0,25.0
1,pred surprise,1.37,1.09,1.13,1.16,1.62
2,normalized pred,1.1,0.88,0.91,0.94,1.31


spearman r: 0.47434164902525683


Unnamed: 0,Unnamed: 1,Voice Actor,Musician,Politician,Writer,Chess Player
0,gt surprise,4.0,16.0,22.0,23.0,25.0
1,pred surprise,1.19,1.81,1.64,1.54,5.48
2,normalized pred,1.07,1.62,1.47,1.38,4.9


spearman r: 0.6


Unnamed: 0,Unnamed: 1,"438,000,000 to 1,590,000,000","1,610,000,000 to 6,745,000,000","113,000,000 to 427,800,000","6,764,000,000 to 217,267,000,000","1 to 108,589,000"
0,gt surprise,15.0,20.0,21.0,22.0,26.0
1,pred surprise,0.9,0.87,0.85,0.95,0.98
2,normalized pred,0.75,0.72,0.7,0.79,0.81


spearman r: 0.6


Unnamed: 0,Unnamed: 1,Singer,Politician,Film Director,Architect,Sport Cyclist
0,gt surprise,0.0,13.0,18.0,26.0,26.0
1,pred surprise,1.83,1.88,1.82,2.0,3.38
2,normalized pred,1.41,1.45,1.4,1.54,2.6


spearman r: 0.6668859288553503


Unnamed: 0,Unnamed: 1,76.6 to 85.4,73.9 to 76.6,70.2 to 73.8,63.3 to 69.9,51.8 to 63.2
0,gt surprise,18.0,18.0,19.0,23.0,26.0
1,pred surprise,1.11,1.12,1.11,1.28,1.44
2,normalized pred,1.12,1.12,1.11,1.28,1.44


spearman r: 0.6668859288553503


Unnamed: 0,Unnamed: 1,Television Presenter,Writer,Television Actor,Film Actor,Film Producer
0,gt surprise,4.0,9.0,14.0,16.0,19.0
1,pred surprise,1.17,1.64,1.23,1.37,1.78
2,normalized pred,0.96,1.35,1.0,1.13,1.46


spearman r: 0.7


Unnamed: 0,Unnamed: 1,History Painting,Cityscape,Landscape Art,Portrait,Self-Portrait
0,gt surprise,6.0,19.0,21.0,23.0,26.0
1,pred surprise,0.82,0.78,0.8,0.86,1.39
2,normalized pred,1.08,1.04,1.06,1.14,1.84


spearman r: 0.7


Unnamed: 0,Unnamed: 1,Russian,English,German,Swedish,Spanish
0,gt surprise,1.0,13.0,19.0,20.0,26.0
1,pred surprise,1.14,1.12,1.16,1.43,1.32
2,normalized pred,0.98,0.96,1.0,1.23,1.14


spearman r: 0.7999999999999999


Unnamed: 0,Unnamed: 1,Politician,Military Officer,Painter,Rugby Union Player,Singer
0,gt surprise,1.0,12.0,16.0,24.0,26.0
1,pred surprise,1.61,1.45,2.25,2.66,2.34
2,normalized pred,1.26,1.14,1.76,2.09,1.84


spearman r: 0.7999999999999999


Unnamed: 0,Unnamed: 1,Landscape,Sky,Mountain,Virgin Mary,Bridge
0,gt surprise,10.0,11.0,15.0,18.0,21.0
1,pred surprise,1.15,1.09,1.22,1.29,1.23
2,normalized pred,1.01,0.95,1.06,1.13,1.07


spearman r: 0.7999999999999999


Unnamed: 0,Unnamed: 1,Oil Paint,Canvas,Cardboard,Paper,Tempera
0,gt surprise,0.0,5.0,24.0,24.0,25.0
1,pred surprise,1.6,1.63,1.7,1.85,1.72
2,normalized pred,1.0,1.02,1.06,1.15,1.07


spearman r: 0.8207826816681234


Unnamed: 0,Unnamed: 1,London,New York City,Hamburg,Paris,Rome
0,gt surprise,8.0,20.0,24.0,26.0,26.0
1,pred surprise,1.2,1.35,1.34,1.37,1.54
2,normalized pred,0.95,1.07,1.06,1.09,1.22


spearman r: 0.8720815992723809


Unnamed: 0,Unnamed: 1,Singer-songwriter,Film Producer,Entrepreneur,Author,Painter
0,gt surprise,1.0,12.0,14.0,19.0,26.0
1,pred surprise,1.54,1.59,1.55,1.94,2.14
2,normalized pred,1.23,1.27,1.24,1.55,1.71


spearman r: 0.8999999999999998


Unnamed: 0,Unnamed: 1,English,French,German,Russian,Swedish
0,gt surprise,2.0,12.0,15.0,24.0,25.0
1,pred surprise,1.11,1.29,1.23,1.44,1.47
2,normalized pred,0.96,1.12,1.07,1.25,1.27


spearman r: 0.8999999999999998


Unnamed: 0,Unnamed: 1,Switzerland,South Africa,United States of America,France,South Korea
0,gt surprise,9.0,18.0,20.0,20.0,26.0
1,pred surprise,1.07,1.1,1.16,1.19,1.31
2,normalized pred,1.03,1.06,1.11,1.15,1.26


spearman r: 0.9746794344808963


Unnamed: 0,Unnamed: 1,4,3,2 or fewer,5 or more
0,gt surprise,16.0,19.0,20.0,23.0
1,pred surprise,0.97,0.99,0.99,0.99
2,normalized pred,0.82,0.84,0.84,0.84


spearman r: 1.0


#### Correlation of true facts (according to WD)

#### Correlation of true facts (according to Google)

## Link prediction method

In [907]:
import h5py, torch
from torchbiggraph.model import ComplexDiagonalDynamicOperator, DotComparator, CosComparator
from sklearn.metrics.pairwise import cosine_similarity

In [900]:
# NOTE: these two settings repeat the values already assigned in the configuration
# cell at the top of the notebook; keep both places in sync when changing them.
# Directory holding the ComplEx model files loaded below.
complex_dir = "/data02/profiling/wikidata-20210215-dwd-v2.complex-embeddings"
# Dimension passed to the relation operators and used to reshape entity embeddings.
vector_dimension = 100

In [901]:
%%time
# Load everything needed for ComplEx link prediction: relation/entity name lists,
# the per-relation operators, the entity embedding matrix and name -> index lookups.
print("loading relation names...")
# Context managers make sure the JSON file handles are closed after reading.
with open(f"{complex_dir}/dynamic_rel_names.json") as rel_names_file:
    relation_names_list = json.load(rel_names_file)
print("loading entity names...")
with open(f"{complex_dir}/entity_names_all_0.json") as entity_names_file:
    entity_names_list = json.load(entity_names_file)
prop_count = len(relation_names_list)

# operators
print("setting up operators...")
operator_lhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
operator_rhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
comparator = DotComparator()
cos_comparator = CosComparator()
with h5py.File(f"{complex_dir}/model/model.v600.h5", "r") as hf:
    # `[...]` reads each dataset fully into memory before the file is closed.
    operator_state_dict_lhs = {
        "real": torch.from_numpy(hf["model/relations/0/operator/lhs/real"][...]),
        "imag": torch.from_numpy(hf["model/relations/0/operator/lhs/imag"][...]),
    }
    operator_state_dict_rhs = {
        "real": torch.from_numpy(hf["model/relations/0/operator/rhs/real"][...]),
        "imag": torch.from_numpy(hf["model/relations/0/operator/rhs/imag"][...]),
    }

print("loading operator state...")
operator_lhs.load_state_dict(operator_state_dict_lhs)
operator_rhs.load_state_dict(operator_state_dict_rhs)

# Load the embeddings
print("loading all embeddings...")
with h5py.File(f"{complex_dir}/model/embeddings_all_0.v600.h5", "r") as hf:
    complex_embeddings_for_lp = torch.from_numpy(hf["embeddings"][...])


# Position of each name in its list; complex_link_predict uses these positions to
# index the embedding matrix and the relation operator.
print("creating entity-to-index dict...")
entity_to_index = {entity: i for i, entity in enumerate(entity_names_list)}


print("creating relation-to-index dict...")
rel_index = {rel: i for i, rel in enumerate(relation_names_list)}

loading relation names...
loading entity names...
setting up operators...
loading operator state...
loading all embeddings...
creating entity-to-index dict...
creating relation-to-index dict...
CPU times: user 42.1 s, sys: 36.9 s, total: 1min 19s
Wall time: 1min 38s


In [903]:
def complex_link_predict(head, relation=None):
    ''' Return the ComplEx embedding used to compare `head` against candidate tails.

        Without a relation, this is the stored embedding of the head entity.
        With a relation, the head embedding is passed through `operator_lhs`
        for that relation and the transformed vector is returned.

        :param head: subject Qnode; looked up in `entity_to_index`
        :param relation: optional property; looked up in `rel_index`
        :return: numpy array (presumably of length `vector_dimension` -- TODO confirm)
    '''
    head_embedding = complex_embeddings_for_lp[entity_to_index[head], :]
    if relation is None:
        return head_embedding.detach().numpy()
    relation_idx = torch.tensor([rel_index[relation]])
    # The operator is called on a batch of one, so take row 0 of its result.
    transformed = operator_lhs(head_embedding.view(1, vector_dimension), relation_idx)
    return transformed.detach().numpy()[0]

In [917]:
# Sanity check on the first question: cosine similarity between the link-prediction
# embedding for (entity, prop) and the ComplEx embedding of every candidate answer.
q = questions[0]
pred_emb = complex_link_predict(q["entity"], q["prop"])
for ans, qnode in q["ans_to_qnode"].items():
    ans_emb = embedding_models["complex"][qnode]
    # cosine_similarity expects 2-D inputs, hence the added leading axis.
    sim_matrix = cosine_similarity(ans_emb[np.newaxis, :], pred_emb[np.newaxis, :])
    cos_sim = sim_matrix[0][0]
    print(f"{ans}: {cos_sim:.3f}")

English: 0.032
German: 0.032
Russian: 0.094
Swedish: 0.005
Spanish: -0.056


In [936]:
# For every question whose answers all have ComplEx embeddings, store the cosine
# similarity between each answer and the link-prediction embedding for
# (entity, prop) in q["ans_cos_sim_to_lp"]. Other questions are printed and skipped.
print("printing skipped questions...")
for q in questions:
    # An answer may correspond to several qnodes joined by "|"; split once up front.
    ans_to_qnodes = {ans: qnode.split("|") for ans, qnode in q["ans_to_qnode"].items()}
    # skip questions that have non-qnode answers (can't predict)
    # Every qnode of every answer is checked, so a multi-qnode first answer does not
    # cause a wrongful skip and a missing later qnode does not raise a KeyError.
    if any(
        ans_qnode not in embedding_models["complex"]
        for ans_qnodes in ans_to_qnodes.values()
        for ans_qnode in ans_qnodes
    ):
        print(f'\t{q["lexicalized"]}')
        continue
    q["ans_cos_sim_to_lp"] = {}
    pred_emb = complex_link_predict(q["entity"], q["prop"])
    for ans, ans_qnodes in ans_to_qnodes.items():
        # if multiple qnodes correspond to answer, use average of their cosine sims
        # *** There may be other better approaches, e.g. weight these by support
        # (with a single qnode the mean is just that qnode's cosine similarity)
        ans_embs = [embedding_models["complex"][ans_qnode] for ans_qnode in ans_qnodes]
        cos_sim = np.mean(cosine_similarity(ans_embs, pred_emb[None,:]))
        q["ans_cos_sim_to_lp"][ans] = cos_sim

printing skipped questions...
	How many children does Arnold Schwarzenegger have?
	What is Donald Trump's mass in pounds circa. 2019?
	What is the life expectancy in years of Australia circa. 2016?
	What is the retirement age in Colombia? The answer for either men or women will be accepted.
	What percentage of the territory of Canada inside its coast line and international boundaries is water?
	What was the total revenue in euros of the business "Adidas" circa. 2014?


In [937]:
# Per-question rank correlation between ground-truth surprise and the link-prediction
# surprise (negated cosine similarity), for the questions that were not skipped.
for q in questions:
    if "ans_cos_sim_to_lp" in q:
        answers = q["pred_counts"].keys()
        ans_surprise_gts = [max_count - q["pred_counts"][ans] for ans in answers]
        # Higher similarity to the predicted tail means a less surprising answer.
        ans_surprise_preds = [-1*q["ans_cos_sim_to_lp"][ans] for ans in answers]
        q["complex_link_prediction"] = {}
        r, r_pval = spearmanr(ans_surprise_gts, ans_surprise_preds)
        t, t_pval = kendalltau(ans_surprise_gts, ans_surprise_preds)
        q["complex_link_prediction"].update({"spearman": r, "kendalltau": t})

# Average the per-question correlations of each method over the same subset of
# questions and show them side by side.
header = ["", "Spearman", "KT"]
rows = []
for method in ["complex_link_prediction", "random", "freq"]:
    method_spearmans = [q[method]['spearman'] for q in questions if "ans_cos_sim_to_lp" in q]
    method_kendalltaus = [q[method]['kendalltau'] for q in questions if "ans_cos_sim_to_lp" in q]
    r = np.mean(method_spearmans, axis=0)
    t = np.mean(method_kendalltaus, axis=0)
    rows.append([method, f"{r:.3f}", f"{t:.3f}"])
df = pd.DataFrame(rows, columns=header)
display(df)

Unnamed: 0,Unnamed: 1,Spearman,KT
0,complex_link_prediction,0.475,0.413
1,random,-0.0,-0.002
2,freq,0.043,0.055
