In [20]:
import os
from kgtk.configure_kgtk_notebooks import ConfigureKGTK
from kgtk.functions import kgtk, kypher
from gensim.models import KeyedVectors
import tempfile
import pandas as pd
import numpy as np
import h5py, torch
from torchbiggraph.model import ComplexDiagonalDynamicOperator, DotComparator, CosComparator
import json
import time

In [21]:
# List the files written to the graph-embeddings output directory.
ls /tmp/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/output

dynamic_rel_count.txt   entities_output.tsv      model/
dynamic_rel_names.json  entity_count_all_0.txt   relation_types_tf.tsv
edges_partitioned/      entity_names_all_0.json


In [22]:
# Read the tab-separated embeddings table into a DataFrame.
embeddings_path = '/tmp/projects/tutorial-graph-embeddings/arnold.embeddings.augmented.100.tsv'
embeddings = pd.read_csv(embeddings_path, sep='\t')

In [23]:
# Display the loaded table: columns node1, label and node2, where node2 holds
# the embedding as a comma-separated string of floats.
embeddings

Unnamed: 0,node1,label,node2
0,Q3533911,graph_embeddings,"0.003262148,0.045550559,0.005054940,-0.4102364..."
1,Q263739,graph_embeddings,"0.057031732,-0.298023194,0.098767348,-0.142428..."
2,Q6447867,graph_embeddings,"0.014380087,0.034869879,0.091715589,-0.3172882..."
3,Q25241406,graph_embeddings,"-0.120991871,0.044525474,-0.116639368,-0.31421..."
4,Q20814143,graph_embeddings,"-0.026510656,0.151938215,0.016863573,-0.529662..."
...,...,...,...
4594144,Q985395,graph_embeddings,"0.057080340,-0.214824334,-0.446722478,-0.24329..."
4594145,Q7446683,graph_embeddings,"0.015740165,-0.329518527,0.029669814,-0.382958..."
4594146,Q16253932,graph_embeddings,"-0.015442700,-0.044929687,0.087926723,-0.60552..."
4594147,Q16203563,graph_embeddings,"0.001286899,-0.102949277,-0.152381063,0.198757..."


## For ComplEx Embeddings

In [50]:
# Load the ComplEx run: relation/entity name lists, the per-relation operator
# weights stored in the model checkpoint, and name -> list-position lookup tables.
output_dir = "/tmp/projects/tutorial-graph-embeddings/temp.tutorial-graph-embeddings/output"

# Context managers close the JSON files as soon as they are parsed.
with open(f"{output_dir}/dynamic_rel_names.json") as rel_names_file:
    relation_names_list = json.load(rel_names_file)
with open(f"{output_dir}/entity_names_all_0.json") as entity_names_file:
    entity_names_list = json.load(entity_names_file)
prop_count = len(relation_names_list)
vector_dimension = 100
# operators
operator_lhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
operator_rhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
comparator = DotComparator()
cos_comparator = CosComparator()
with h5py.File(f"{output_dir}/model/model.v100.h5", "r") as hf:
    # `[...]` reads each dataset into memory, so the tensors stay valid after
    # the HDF5 file is closed.
    operator_state_dict_lhs = {
        "real": torch.from_numpy(hf["model/relations/0/operator/lhs/real"][...]),
        "imag": torch.from_numpy(hf["model/relations/0/operator/lhs/imag"][...]),
    }

    operator_state_dict_rhs = {
        "real": torch.from_numpy(hf["model/relations/0/operator/rhs/real"][...]),
        "imag": torch.from_numpy(hf["model/relations/0/operator/rhs/imag"][...]),
    }

operator_lhs.load_state_dict(operator_state_dict_lhs)
operator_rhs.load_state_dict(operator_state_dict_rhs)

# Map each entity / relation name to its position in the corresponding list.
entity_to_index = {entity: i for i, entity in enumerate(entity_names_list)}
rel_index = {rel: i for i, rel in enumerate(relation_names_list)}

In [51]:
# Argument order: relation (a1 = real part, a2 = imaginary part), subject (b), object (c).
def scalar_triple_dot(a1, a2, b, c):
    """Score a (relation, subject, object) triple with a ComplEx-style trilinear form.

    ``b`` and ``c`` are split at the midpoint of ``b``; the first half is paired
    with ``a1`` as the real part and the second half as the imaginary part:

        (a1*b_re).c_re + (a1*b_im).c_im + (a2*b_re).c_im - (a2*b_im).c_re

    The split point is derived from ``len(b)`` instead of being fixed at 50, so
    vectors of any even length work (100-dimensional input behaves as before).

    Args:
        a1: real part of the relation weights, length ``len(b) // 2``.
        a2: imaginary part of the relation weights, length ``len(b) // 2``.
        b: subject vector (real half followed by imaginary half).
        c: object vector, same length and layout as ``b``.

    Returns:
        The scalar score, as produced by the vectors' ``.dot`` method.
    """
    half = len(b) // 2
    b_re, b_im = b[:half], b[half:]
    c_re, c_im = c[:half], c[half:]
    return (a1 * b_re).dot(c_re) + (a1 * b_im).dot(c_im) + (a2 * b_re).dot(c_im) - (a2 * b_im).dot(c_re)

In [52]:
#link prediction

def link_predict(node1, label, candidates, topn = 25):
    """Rank candidate objects for the query ``(node1, label, ?)`` by ComplEx score.

    Reads the module-level ``embeddings``, ``entity_to_index``, ``rel_index`` and
    ``operator_state_dict_lhs`` tables and scores with ``scalar_triple_dot``.

    Args:
        node1: subject entity id; must be a key of ``entity_to_index``.
        label: relation id; must be a key of ``rel_index``.
        candidates: iterable of candidate object entity ids.
        topn: maximum number of rows to return.

    Returns:
        DataFrame with columns ``node1`` (the candidate id) and ``score``,
        sorted by descending score and truncated to ``topn`` rows.

    Raises:
        KeyError: if ``node1``, ``label`` or a candidate is missing from the
            lookup tables.
    """
    # NOTE(review): assumes row i of `embeddings` belongs to the entity whose
    # index in entity_to_index is i — confirm against how the TSV was written.
    vectors = embeddings.node2

    # The subject vector and relation weights do not depend on the candidate,
    # so parse / look them up once instead of once per candidate.
    emb_s = torch.tensor([float(x) for x in vectors[entity_to_index[node1]].split(',')])
    rel_pos = rel_index[label]
    rel_real = operator_state_dict_lhs['real'][rel_pos]
    rel_imag = operator_state_dict_lhs['imag'][rel_pos]

    nodes = []
    scores = []
    for cand in candidates:
        emb_o = torch.tensor([float(x) for x in vectors[entity_to_index[cand]].split(',')])
        scores.append(float(scalar_triple_dot(rel_real, rel_imag, emb_s, emb_o)))
        nodes.append(cand)
    df_temp = pd.DataFrame(list(zip(nodes, scores)), columns = ['node1', 'score'])
    return df_temp.sort_values(by = 'score', ascending = False).reset_index(drop = True).head(topn)

In [None]:
# Evaluate ComplEx link prediction on the test triples.
# results[j] counts the triples whose true object (node2) appears within the
# first hit_cutoffs[j] predictions, i.e. hits@1 / @3 / @10 / @100 / @1000.
hit_cutoffs = [1, 3, 10, 100, 1000]
results = [0,0,0,0,0]
skipped = 0  # triples for which link_predict raised (e.g. a name missing from the lookup tables)
df = pd.read_csv('wikidata/wikidata5m_transductive_test.tsv', sep = '\t')
candidates = list(set(df.node2))
candidate_set = set(candidates)  # constant-time membership test instead of scanning the list per row
for i in range(0, len(df)):
    answer = df['node2'][i]
    if answer not in candidate_set:
        # Only reachable when `candidates` is a subset of the test objects:
        # make sure the true answer is among the candidates.
        cand_to_use = candidates[0:499]
        cand_to_use.append(answer)
    else:
        # link_predict only iterates over its candidates, so no per-row copy is needed.
        cand_to_use = candidates
    try:
        temp = link_predict(df['node1'][i], df['label'][i], cand_to_use, topn = 1000)
    except Exception:
        # Best effort: skip triples that cannot be scored, but count them and
        # let KeyboardInterrupt propagate so the long loop can be stopped.
        skipped += 1
        continue
    # No per-triple printing of `temp`: it floods the cell output on a full test set.
    ranked = list(temp.node1)
    for j, cutoff in enumerate(hit_cutoffs):
        if answer in ranked[0:cutoff]:
            results[j] += 1

In [None]:
# Hit counts from the evaluation loop: [hits@1, hits@3, hits@10, hits@100, hits@1000].
results

In [None]:
# List the model directory of the wikidata5m_embed run.
ls wikidata5m_embed/output/model

## For DistMult Embeddings

In [None]:
# Argument order: relation (r), subject (s), object (o); real-valued trilinear score.
def scalar_triple_dot_real(r, s, o):
    """Return the dot product of the element-wise product ``r * s`` with ``o``."""
    weighted_subject = r * s
    return weighted_subject.dot(o)

In [None]:
# Load the DistMult run: the embeddings table, the relation/entity name lists,
# the per-relation diagonal weights, and name -> list-position lookup tables.
embed_dir = "wikidata5m_embed"

embeddings = pd.read_csv(f'{embed_dir}/dismult.tsv', sep = '\t')
# Context managers close the JSON files as soon as they are parsed.
with open(f"{embed_dir}/output/dynamic_rel_names.json") as rel_names_file:
    relation_names_list = json.load(rel_names_file)
with open(f"{embed_dir}/output/entity_names_all_0.json") as entity_names_file:
    entity_names_list = json.load(entity_names_file)
prop_count = len(relation_names_list)
vector_dimension = 100
# operators
# NOTE(review): these operator objects never receive weights in this cell;
# scoring reads operator_state_dict_lhs directly. Kept so the names stay defined.
operator_lhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
operator_rhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
comparator = DotComparator()
cos_comparator = CosComparator()
with h5py.File(f"{embed_dir}/output/model/model.v100.h5", "r") as hf:
    # `[...]` reads the dataset into memory before the HDF5 file is closed.
    operator_state_dict_lhs = {
        "transition": torch.from_numpy(hf["model/relations/0/operator/lhs/diagonals"][...]),
    }

# Map each entity / relation name to its position in the corresponding list.
entity_to_index = {entity: i for i, entity in enumerate(entity_names_list)}
rel_index = {rel: i for i, rel in enumerate(relation_names_list)}

In [None]:
def link_predict_dismult(node1, label, candidates, topn = 25):
    """Rank candidate objects for the query ``(node1, label, ?)`` by DistMult score.

    Reads the module-level ``embeddings``, ``entity_to_index``, ``rel_index`` and
    ``operator_state_dict_lhs`` tables and scores with ``scalar_triple_dot_real``.

    Args:
        node1: subject entity id; must be a key of ``entity_to_index``.
        label: relation id; must be a key of ``rel_index``.
        candidates: iterable of candidate object entity ids.
        topn: maximum number of rows to return.

    Returns:
        DataFrame with columns ``node1`` (the candidate id) and ``score``,
        sorted by descending score and truncated to ``topn`` rows.

    Raises:
        KeyError: if ``node1``, ``label`` or a candidate is missing from the
            lookup tables.
    """
    # NOTE(review): assumes row i of `embeddings` belongs to the entity whose
    # index in entity_to_index is i — confirm against how the TSV was written.
    vectors = embeddings.node2

    # Subject vector and relation weights are the same for every candidate:
    # parse / look them up once rather than inside the loop.
    emb_s = torch.tensor([float(x) for x in vectors[entity_to_index[node1]].split(',')])
    rel_weights = operator_state_dict_lhs['transition'][rel_index[label]]

    nodes = []
    scores = []
    for cand in candidates:
        emb_o = torch.tensor([float(x) for x in vectors[entity_to_index[cand]].split(',')])
        scores.append(float(scalar_triple_dot_real(rel_weights, emb_s, emb_o)))
        nodes.append(cand)
    df_temp = pd.DataFrame(list(zip(nodes, scores)), columns = ['node1', 'score'])
    return df_temp.sort_values(by = 'score', ascending = False).reset_index(drop = True).head(topn)

In [None]:
# Number of distinct entity names in the name -> index table.
len(entity_to_index)

In [None]:
#candidates = embeddings.sample(500).node1

# Spot check: score `candidates` as objects of (Q170564, P161, ?) and show the
# rows returned by link_predict_dismult (default topn).
# NOTE(review): `candidates` is not defined in this cell and must already exist
# in the kernel; `df` is reused here for the prediction table.
df = link_predict_dismult('Q170564', 'P161', candidates)
df

In [None]:

# Evaluate DistMult link prediction on the test triples.
# results[j] counts the triples whose true object (node2) appears within the
# first hit_cutoffs[j] predictions, i.e. hits@1 / @3 / @10 / @100 / @1000.
# NOTE(review): `candidates` is not defined in this cell and must already exist in the kernel.
hit_cutoffs = [1, 3, 10, 100, 1000]
results = [0,0,0,0,0]
skipped = 0  # triples for which link_predict_dismult raised (e.g. a name missing from the lookup tables)
df = pd.read_csv('wikidata/wikidata5m_transductive_test.tsv', sep = '\t')
candidate_set = set(candidates)  # constant-time membership test instead of scanning per row
for i in range(0, len(df)):
    answer = df['node2'][i]
    if answer not in candidate_set:
        # True answer is not among the candidates: take the first 499 and add it.
        cand_to_use = list(candidates)[0:499]
        cand_to_use.append(answer)
    else:
        # link_predict_dismult only iterates over its candidates, so no per-row copy is needed.
        cand_to_use = candidates
    try:
        temp = link_predict_dismult(df['node1'][i], df['label'][i], cand_to_use, topn = 1000)
    except Exception:
        # Best effort: skip triples that cannot be scored, but count them and
        # let KeyboardInterrupt propagate so the long loop can be stopped.
        skipped += 1
        continue
    # No per-triple printing of `temp`: it floods the cell output on a full test set.
    ranked = list(temp.node1)
    for j, cutoff in enumerate(hit_cutoffs):
        if answer in ranked[0:cutoff]:
            results[j] += 1

In [None]:
# Hit counts from the evaluation loop: [hits@1, hits@3, hits@10, hits@100, hits@1000].
results

## For TransE Embeddings

In [None]:
# Load the TransE run: the embeddings table, the relation/entity name lists,
# the per-relation translation vectors, and name -> list-position lookup tables.
embed_dir = "wikidata5m_embed_transe"

embeddings = pd.read_csv(f'{embed_dir}/transe.tsv', sep = '\t')
# Context managers close the JSON files as soon as they are parsed.
with open(f"{embed_dir}/output/dynamic_rel_names.json") as rel_names_file:
    relation_names_list = json.load(rel_names_file)
with open(f"{embed_dir}/output/entity_names_all_0.json") as entity_names_file:
    entity_names_list = json.load(entity_names_file)
prop_count = len(relation_names_list)
vector_dimension = 100
# operators
# NOTE(review): these operator objects never receive weights in this cell;
# scoring reads operator_state_dict_lhs directly. Kept so the names stay defined.
operator_lhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
operator_rhs = ComplexDiagonalDynamicOperator(vector_dimension, prop_count)
comparator = DotComparator()
cos_comparator = CosComparator()
with h5py.File(f"{embed_dir}/output/model/model.v100.h5", "r") as hf:
    # `[...]` reads the dataset into memory before the HDF5 file is closed.
    operator_state_dict_lhs = {
        "transition": torch.from_numpy(hf["model/relations/0/operator/lhs/translations"][...]),
    }

# Map each entity / relation name to its position in the corresponding list.
entity_to_index = {entity: i for i, entity in enumerate(entity_names_list)}
rel_index = {rel: i for i, rel in enumerate(relation_names_list)}

In [None]:
# Argument order: relation (r), subject (s), object (o).
def transe_score(r, s, o):
    """Return ``np.linalg.norm(r + s - o)`` (the 2-norm for 1-D input).

    This is a distance: a smaller value means ``s + r`` lies closer to ``o``.
    """
    residual = r + s - o
    return np.linalg.norm(residual)

In [None]:
def link_predict_transe(node1, label, candidates, topn = 25):
    """Rank candidate objects for the query ``(node1, label, ?)`` by TransE distance.

    Reads the module-level ``embeddings``, ``entity_to_index``, ``rel_index`` and
    ``operator_state_dict_lhs`` tables and scores with ``transe_score``.

    ``transe_score`` is the norm of ``r + s - o``, i.e. a distance, so the best
    candidates are the ones with the SMALLEST value. Results are therefore
    sorted in ascending order; a descending sort would put the worst candidates
    first.

    Args:
        node1: subject entity id; must be a key of ``entity_to_index``.
        label: relation id; must be a key of ``rel_index``.
        candidates: iterable of candidate object entity ids.
        topn: maximum number of rows to return.

    Returns:
        DataFrame with columns ``node1`` (the candidate id) and ``score`` (the
        distance), best candidate first, truncated to ``topn`` rows.

    Raises:
        KeyError: if ``node1``, ``label`` or a candidate is missing from the
            lookup tables.
    """
    # NOTE(review): assumes row i of `embeddings` belongs to the entity whose
    # index in entity_to_index is i — confirm against how the TSV was written.
    vectors = embeddings.node2

    # Subject vector and relation translation are the same for every candidate:
    # parse / look them up once rather than inside the loop.
    emb_s = torch.tensor([float(x) for x in vectors[entity_to_index[node1]].split(',')])
    rel_translation = operator_state_dict_lhs['transition'][rel_index[label]]

    nodes = []
    scores = []
    for cand in candidates:
        emb_o = torch.tensor([float(x) for x in vectors[entity_to_index[cand]].split(',')])
        scores.append(float(transe_score(rel_translation, emb_s, emb_o)))
        nodes.append(cand)
    df_temp = pd.DataFrame(list(zip(nodes, scores)), columns = ['node1', 'score'])
    # Ascending: lower distance = better match.
    return df_temp.sort_values(by = 'score', ascending = True).reset_index(drop = True).head(topn)

In [None]:
#candidates = embeddings.sample(500).node1

# Spot check: score `candidates` as objects of (Q170564, P161, ?) and show the
# 5 rows returned by link_predict_transe.
# NOTE(review): `candidates` is not defined in this cell and must already exist
# in the kernel; `df` is reused here for the prediction table.
df = link_predict_transe('Q170564', 'P161', candidates, topn = 5)

df

In [None]:
# Evaluate TransE link prediction on the test triples.
# results[j] counts the triples whose true object (node2) appears within the
# first hit_cutoffs[j] predictions, i.e. hits@1 / @3 / @10 / @100 / @1000.
# NOTE(review): `candidates` is not defined in this cell and must already exist in the kernel.
hit_cutoffs = [1, 3, 10, 100, 1000]
results = [0,0,0,0,0]
skipped = 0  # triples for which link_predict_transe raised (e.g. a name missing from the lookup tables)
df = pd.read_csv('wikidata/wikidata5m_transductive_test.tsv', sep = '\t')
candidate_set = set(candidates)  # constant-time membership test instead of scanning per row
for i in range(0, len(df)):
    answer = df['node2'][i]
    if answer not in candidate_set:
        # True answer is not among the candidates: take the first 499 and add it.
        cand_to_use = list(candidates)[0:499]
        cand_to_use.append(answer)
    else:
        # link_predict_transe only iterates over its candidates, so no per-row copy is needed.
        cand_to_use = candidates
    try:
        temp = link_predict_transe(df['node1'][i], df['label'][i], cand_to_use, topn = 1000)
    except Exception:
        # Best effort: skip triples that cannot be scored, but count them and
        # let KeyboardInterrupt propagate so the long loop can be stopped.
        skipped += 1
        continue
    # No per-triple printing of `temp`: it floods the cell output on a full test set.
    ranked = list(temp.node1)
    for j, cutoff in enumerate(hit_cutoffs):
        if answer in ranked[0:cutoff]:
            results[j] += 1

In [None]:
# Hit counts from the evaluation loop: [hits@1, hits@3, hits@10, hits@100, hits@1000].
results

## Next steps
 
 
1. Sample 500 random candidate answers from all node2 values in the test set. For each link prediction, add the correct answer to the candidates. Record the hit rates.

2. Sample 100 random candidate answers from all node2 values in the test set. For each link prediction, add the correct answer to the candidates. Record the hit rates. Repeat this 10 times.

3. For each link prediction, use every test-set entity of the correct type as a candidate. (For example, if the correct answer is US, then we use all countries as candidates.) Record the hit rates for each property type.

KGTK graph-embeddings parameters:

DistMult:
kgtk graph-embeddings -op DistMult -ot kgtk --batch_size 128 --num_epochs 200 --learning_rate 0.003 --dimension 200 --retain_temporary_data True -T kgtk_emb/dismult1 -i wikidata/wikidata5m_transductive_train.tsv -o kgtk_emb/dismult1/vector.tsv --log kgtk_emb/dismult1/ge.log.txt

TransE:
kgtk graph-embeddings -op TransE -ot kgtk --batch_size 1024 --num_epochs 1500 --learning_rate 0.0001 --dimension 1000 --retain_temporary_data True -T kgtk_emb/transe2 -i wikidata/wikidata5m_transductive_train.tsv -o kgtk_emb/transe2/vector.tsv --log kgtk_emb/transe2/ge.log.txt


ComplEx:
kgtk graph-embeddings -op ComplEx -ot kgtk --batch_size 1000 --num_epochs 200 --learning_rate 0.003 --dimension 200 --retain_temporary_data True -T kgtk_emb/complex2 -i wikidata/wikidata5m_transductive_train.tsv -o kgtk_emb/complex2/vector.tsv --log kgtk_emb/complex2/ge.log.txt


In [1]:
# Show the kernel's current working directory (relative paths resolve against it).
pwd

'/nas/home/shicuoxi'