In [1]:
import numpy as np
import pandas as pd
import networkx as nx

In [2]:
# load entity2id
# Every line is split on a tab into exactly two fields; the second one is
# parsed as the integer id of the first.
with open("data/Freebase/FB15k/entity2id.txt", 'r', encoding='utf-8') as f:
    entity2id = {
        name: int(number)
        for name, number in (row.strip().split('\t') for row in f)
    }

In [3]:
# load relation2id
# Every line is split on a tab into exactly two fields; the second one is
# parsed as the integer id of the first.
with open("data/Freebase/FB15k/relation2id.txt", 'r', encoding='utf-8') as f:
    relation2id = {
        name: int(number)
        for name, number in (row.strip().split('\t') for row in f)
    }

In [4]:
# load type2id
# Every line is split on a tab into exactly two fields; the second one is
# parsed as the integer id of the first.
with open("data/Freebase/FB15kET/type2id.txt", 'r', encoding='utf-8') as f:
    type2id = {
        name: int(number)
        for name, number in (row.strip().split('\t') for row in f)
    }

In [5]:
# load training set of FB15k
# Each line holds "head<TAB>relation<TAB>tail"; the three names are mapped to
# their integer ids and stored as (head_id, relation_id, tail_id).
train_triplet = []
# The id maps were read as UTF-8, so this file is decoded the same way instead
# of relying on the platform's default encoding.
with open('data/Freebase/FB15k/freebase_mtr100_mte100-train.txt', 'r', encoding='utf-8') as f:
    for line in f:  # stream the file instead of materialising readlines()
        head, rel, tail = line.strip().split("\t")
        train_triplet.append((entity2id[head], relation2id[rel], entity2id[tail]))

In [6]:
# load training set of FB15kET
# train_e2t maps an entity id to the list of its type ids (each type kept
# once, in first-seen order); pair_train counts every line that was read.
train_e2t = {}
pair_train = 0
# The id maps were read as UTF-8, so this file is decoded the same way instead
# of relying on the platform's default encoding.
with open("data/Freebase/FB15kET/FB15k_Entity_Type_train.txt", 'r', encoding='utf-8') as f:
    for line in f:
        pair_train += 1
        entity_name, type_name = line.strip().split("\t")
        known_types = train_e2t.setdefault(entity2id[entity_name], [])
        type_id = type2id[type_name]
        if type_id not in known_types:
            known_types.append(type_id)

In [7]:
# load validation set of FB15kET
# dev_e2t maps an entity id to the list of its type ids (each type kept once,
# in first-seen order); pair_dev counts every line that was read.
dev_e2t = {}
pair_dev = 0
# The id maps were read as UTF-8, so this file is decoded the same way instead
# of relying on the platform's default encoding.
with open("data/Freebase/FB15kET/FB15k_Entity_Type_valid_clean.txt", 'r', encoding='utf-8') as f:
    for line in f:
        pair_dev += 1
        entity_name, type_name = line.strip().split("\t")
        known_types = dev_e2t.setdefault(entity2id[entity_name], [])
        type_id = type2id[type_name]
        if type_id not in known_types:
            known_types.append(type_id)

In [8]:
# load test set of FB15kET
# test_e2t maps an entity id to the list of its type ids; pair_test counts
# every line that was read.
# NOTE(review): there is no membership check before the append, so a pair that
# occurs twice in the file is stored twice -- confirm that this is intended.
test_e2t = {}
pair_test = 0
# The id maps were read as UTF-8, so this file is decoded the same way instead
# of relying on the platform's default encoding.
with open("data/Freebase/FB15kET/FB15k_Entity_Type_test_clean.txt", 'r', encoding='utf-8') as f:
    for line in f:
        pair_test += 1
        entity_name, type_name = line.strip().split("\t")
        test_e2t.setdefault(entity2id[entity_name], []).append(type2id[type_name])

In [9]:
# inverse xxx2id
# Swap keys and values of each name -> id map to obtain id -> name lookups.
id2entity = {idx: name for name, idx in entity2id.items()}
id2type = {idx: name for name, idx in type2id.items()}
id2relation = {idx: name for name, idx in relation2id.items()}

In [10]:
# add self-loop relation
# The self-loop gets the first id after the existing relation ids, and one
# (e, self_rid, e) triple is created for every entity that has training types.
self_rid = len(relation2id)
self_triplet = [(node, self_rid, node) for node in train_e2t]

In [11]:
# count dictionary A, realize with p_h and p_t
#   p_h[(head_type, relation)][tail_type] -> number of co-occurrences
#   p_t[(relation, tail_type)][head_type] -> number of co-occurrences
p_h = {}
p_t = {}
self_r = len(relation2id)  # not read anywhere in this cell
for head, rel, tail in train_triplet + self_triplet:
    # ensure the knowledge is known: both ends need training types
    if head not in train_e2t or tail not in train_e2t:
        continue
    for head_type in train_e2t[head]:
        for tail_type in train_e2t[tail]:
            tails_seen = p_h.setdefault((head_type, rel), {})
            tails_seen[tail_type] = tails_seen.get(tail_type, 0) + 1

            heads_seen = p_t.setdefault((rel, tail_type), {})
            heads_seen[head_type] = heads_seen.get(head_type, 0) + 1

In [12]:
# count dictionary Q_e and Q'_e
def _totals_and_frames(table):
    """Return the per-key totals of `table` and convert its values to DataFrames.

    `table` maps a key to either a ``{type_id: count}`` dict or to the
    single-column DataFrame this function stores in its place. Values that are
    still dicts are replaced in place by a DataFrame indexed by type id.
    Values that already are DataFrames are left untouched, so the cell can be
    executed again without failing on ``DataFrame.values`` (an attribute, not
    a method).

    Returns:
        dict mapping every key of `table` to the sum of its counts.
    """
    totals = {}
    for key, counts in table.items():
        if isinstance(counts, pd.DataFrame):
            # Already converted by an earlier run of this cell.
            totals[key] = int(counts.values.sum())
        else:
            totals[key] = sum(counts.values())
            # Replacing the value of an existing key is safe while iterating.
            table[key] = pd.DataFrame(list(counts.values()), index=counts.keys())
    return totals


h_count = _totals_and_frames(p_h)
t_count = _totals_and_frames(p_t)

In [13]:
# evaluation
def evaluation(data_name='dev'):
    """Compute filtered ranking metrics for entity typing on one split.

    The function reads the module-level score matrix ``entity2type`` and the
    label dictionaries ``train_e2t``, ``dev_e2t`` and ``test_e2t`` at call
    time. For every entity of the chosen split the types are ranked by
    descending score. The filtered rank of a target type is its raw rank minus
    the number of *distinct* other known types of that entity (taken from all
    three splits) that are ranked above it.

    Args:
        data_name: ``'dev'`` to score ``dev_e2t`` or ``'test'`` to score
            ``test_e2t``.

    Returns:
        Tuple ``(MRR, Hits@1, Hits@3, Hits@10)`` averaged over all
        (entity, type) pairs of the split. All four values are ``0.0`` when
        the split contains no pair.

    Raises:
        ValueError: if `data_name` is neither ``'dev'`` nor ``'test'``.
    """
    if data_name == 'dev':
        data_set = dev_e2t
    elif data_name == 'test':
        data_set = test_e2t
    else:
        raise ValueError(
            "data_name must be 'dev' or 'test', got {!r}".format(data_name))

    fmrr = 0.0
    fhit10 = fhit3 = fhit1 = 0
    num_of_e2t = 0

    for entity_, target_types in data_set.items():
        type_arg = np.argsort(-entity2type[entity_])
        # Inverse permutation: rank_of[type_id] is the 1-based raw rank of that
        # type, computed once instead of scanning type_arg for every label.
        rank_of = np.empty(len(type_arg), dtype=np.int64)
        rank_of[type_arg] = np.arange(1, len(type_arg) + 1)

        # Distinct known types of this entity across all splits. Using a set
        # keeps a type that is listed in several splits (or twice in one) from
        # being subtracted more than once.
        known_types = set()
        for labels in (train_e2t, test_e2t, dev_e2t):
            known_types.update(labels.get(entity_, ()))
        known_ranks = sorted(int(rank_of[type_label]) for type_label in known_types)

        for type_label in target_types:
            rank = int(rank_of[type_label])      # raw rank
            # Position in the sorted list == number of known types ranked above.
            frank = rank - known_ranks.index(rank)  # filtered rank, always >= 1

            num_of_e2t += 1
            fmrr += 1.0 / frank
            if frank <= 10:
                fhit10 += 1
            if frank <= 3:
                fhit3 += 1
            if frank <= 1:
                fhit1 += 1

    if num_of_e2t == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return 0.0, 0.0, 0.0, 0.0

    return fmrr/num_of_e2t, fhit1/num_of_e2t, fhit3/num_of_e2t, fhit10/num_of_e2t

In [14]:
# initialize M entity-type
# One row per entity id and one column per type id, all scores starting at 0.
num_entities, num_types = len(entity2id), len(type2id)
entity2type = np.zeros((num_entities, num_types))
# Dataset statistics collected while loading the files.
print("FB15k:",len(train_triplet),"triples")
print("FB15kET:",pair_train,"train pairs, ", pair_dev,"valid pairs,", pair_test, "test pairs")

FB15k: 483142 triples
FB15kET: 136618 train pairs,  15749 valid pairs, 15780 test pairs


In [15]:
# Weight applied to the contribution of self-loop triples; every other
# relation contributes with weight 1.
omega = 3

# Reset the score matrix first: the loop below only ever adds to it, so
# re-running this cell (e.g. after changing omega) would otherwise accumulate
# on top of the scores of the previous run.
entity2type[:] = 0

for i, (h, r, t) in enumerate(train_triplet+self_triplet):
    r_weight = omega if r == self_rid else 1

    # Known head types vote for the types of the tail entity.
    if h in train_e2t:
        normalize_h = len(train_e2t[h])
        for type_h in train_e2t[h]:
            if (type_h, r) in p_h:
                df = p_h[(type_h, r)]
                t_key = list(df.index)          # candidate tail type ids
                t_value = df.values.reshape(-1)  # their co-occurrence counts
                entity2type[t][t_key] += t_value/h_count[(type_h, r)]/normalize_h * r_weight

    # Known tail types vote for the types of the head entity.
    if t in train_e2t:
        normalize_t = len(train_e2t[t])
        for type_t in train_e2t[t]:
            if (r, type_t) in p_t:
                df = p_t[(r, type_t)]
                h_key = list(df.index)          # candidate head type ids
                h_value = df.values.reshape(-1)  # their co-occurrence counts
                entity2type[h][h_key] += h_value/t_count[(r, type_t)]/normalize_t * r_weight

    if (i+1)%10000 == 0:
        print(i+1,"triples have been calculated")

10000 triples have been calculated
20000 triples have been calculated
30000 triples have been calculated
40000 triples have been calculated
50000 triples have been calculated
60000 triples have been calculated
70000 triples have been calculated
80000 triples have been calculated
90000 triples have been calculated
100000 triples have been calculated
110000 triples have been calculated
120000 triples have been calculated
130000 triples have been calculated
140000 triples have been calculated
150000 triples have been calculated
160000 triples have been calculated
170000 triples have been calculated
180000 triples have been calculated
190000 triples have been calculated
200000 triples have been calculated
210000 triples have been calculated
220000 triples have been calculated
230000 triples have been calculated
240000 triples have been calculated
250000 triples have been calculated
260000 triples have been calculated
270000 triples have been calculated
280000 triples have been calculated
2

In [16]:
# Score the validation split first, then the test split.
dev_evaluation, test_evaluation = (evaluation(split) for split in ('dev', 'test'))

print("results of validation：",dev_evaluation)
print("results of test：",test_evaluation)

results of validation： (0.5659360550033553, 0.4707600482570322, 0.6088640548606261, 0.7573814210426059)
results of test： (0.5681619527719328, 0.470595690747782, 0.6145754119138149, 0.7641951837769329)


### Case Study

In [17]:
# build graph
# Directed multigraph over entity ids: one edge head -> tail per training
# triple, with the relation id used as the edge key.
G = nx.MultiDiGraph()
for head, rel, tail in train_triplet:
    G.add_edge(head, tail, key=rel)

In [18]:
raza_murad = 9670  # 'Raza Murad'
print("entity_id:",raza_murad,",mid_id:",id2entity[raza_murad],",entity: Raza Murad")
# in neighbor, then out neighbor
for edge_view in (G.in_edges(raza_murad), G.out_edges(raza_murad)):
    print(edge_view)

entity_id: 9670 ,mid_id: /m/03d63lb ,entity: Raza Murad
[(6802, 9670)]
[(9670, 13605), (9670, 6802), (9670, 3492)]


In [19]:
# Neighbours of entity 9670 found in the previous cell.
jodhaa_akbar, india, actor = 6802, 13605, 3492
print("entity_id:",jodhaa_akbar,",mid_id:",id2entity[jodhaa_akbar],",entity: Jodhaa Akbar")   #'Jodhaa Akbar'
print("entity_id:",india,",mid_id:",id2entity[india],",entity: India")   #'India'
print("entity_id:",actor,",mid_id:",id2entity[actor],",entity: actor")   #'actor'

entity_id: 6802 ,mid_id: /m/09fn1w ,entity: Jodhaa Akbar
entity_id: 13605 ,mid_id: /m/03rk0 ,entity: India
entity_id: 3492 ,mid_id: /m/02hrh1q ,entity: actor


In [20]:
# Show the edge keys (relation ids) stored between each (source, target) pair.
for src, dst in ((6802, 9670), (9670, 13605), (9670, 6802), (9670, 3492)):
    print(G[src][dst])

{1196: {}}
{875: {}}
{669: {}}
{22: {}}


In [21]:
# Relation ids read off the edge keys printed in the previous cell.
starring_rid, film_rid, nationality_rid, profession_rid = 1196, 669, 875, 22
print("(Jodhaa Akbar,",id2relation[starring_rid],",Raza Murad)")
print("(Raza Murad,",id2relation[film_rid],",Jodhaa Akbar)")
print("(Raza Murad,",id2relation[nationality_rid],",India)")
print("(Raza Murad,",id2relation[profession_rid],",actor)")

(Jodhaa Akbar, /film/film/starring./film/performance/actor ,Raza Murad)
(Raza Murad, /film/actor/film./film/performance/film ,Jodhaa Akbar)
(Raza Murad, /people/person/nationality ,India)
(Raza Murad, /people/person/profession ,actor)


In [22]:
# Jodhaa Akbar ETIs
# Print "<type id> <type name>" for every training type of entity 6802.
jodhaa_akbar_types = train_e2t[6802]
for type_id in jodhaa_akbar_types:
    print(type_id, id2type[type_id])

845 /award/award_nominated_work
3718 /base/ovguide/bollywood_films
3364 /base/filmfareawards/topic
580 /film/film
1967 /media_common/netflix_title
841 /base/ovguide/topic
3827 /award/award_winning_work


In [23]:
# Raza Murad ETIs
# Print "<type id> <type name>" for the training types of entity 9670 and
# then for its test types, each list preceded by the split name.
for split_name, split_labels in (('train', train_e2t), ('test', test_e2t)):
    print(split_name)
    for type_id in split_labels[9670]:
        print(type_id,id2type[type_id])

train
2039 /people/person
3321 /award/award_nominee
3572 /film/actor
test
131 /influence/influence_node


In [24]:
# calculate the conditional probability of triple
#(Jodhaa Akbar, /film/film/starring./film/performance/actor, -)
# For every training type of the head entity (6802), add the normalised
# co-occurrence counts of relation 1196 to the score of each tail type.
temp_array = np.zeros(len(type2id))
for type_ in train_e2t[6802]:
    query = (type_, 1196)
    # The membership test does not depend on the candidate type, so it is done
    # once per query instead of once per type id.
    if query not in h_count:
        continue
    tail_counts = p_h[query]
    # One indexed add replaces the scan over every type id; the index holds
    # each tail type at most once, so no update is lost.
    temp_array[list(tail_counts.index)] += tail_counts.values.reshape(-1)/h_count[query]

#### The top-ranked types are /people/person (2039), /film/actor (3572), /award/award_nominee (3321), /tv/tv_actor (130), and so on.

In [25]:
# Ids of the ten highest-scoring types, best first.
top10 = np.argsort(-temp_array)[:10]
print(top10)

[2039 3572 3321  130  131 2019  736 3385 2948 2233]
