In [1]:
%load_ext autoreload
%autoreload 2
import os
import sys
import time
import csv

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import pykeen
import torch

from pykeen.pipeline import pipeline
from ogb.linkproppred import LinkPropPredDataset


In [2]:
dataset = 'OGBBioKG'
mapping_dir = '/home/gebhart/.data/pykeen/datasets/ogbbiokg/ogbl_biokg/mapping'

In [3]:
df = pd.read_csv(os.path.join(mapping_dir,'relidx2relname.csv.gz'), compression='gzip')

In [4]:
df

Unnamed: 0,rel idx,rel name
0,0,disease-protein
1,1,drug-disease
2,2,drug-drug_acquired_metabolic_disease
3,3,drug-drug_bacterial_infectious_disease
4,4,drug-drug_benign_neoplasm
5,5,drug-drug_cancer
6,6,drug-drug_cardiovascular_system_disease
7,7,drug-drug_chromosomal_disease
8,8,drug-drug_cognitive_disorder
9,9,drug-drug_cryptorchidism


In [5]:
relnames = np.loadtxt(os.path.join(mapping_dir,'relidx2relname.csv.gz'), delimiter=',', skiprows=1, usecols=1, dtype=np.str)
relnames.shape

(51,)

In [6]:
dset = LinkPropPredDataset(name='ogbl-biokg', root='/home/gebhart/.data/pykeen/datasets/ogbbiokg')

In [7]:
dset.graph['num_nodes_dict']

{'disease': 10687,
 'drug': 10533,
 'function': 45085,
 'protein': 17499,
 'sideeffect': 9969}

In [8]:
edge_split = dset.get_edge_split()

In [9]:
x = edge_split['train']
x

{'head_type': ['disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',
  'disease',

In [10]:
x.keys()

dict_keys(['head_type', 'head', 'relation', 'tail_type', 'tail'])

In [11]:
from ogb.linkproppred import Evaluator

evaluator = Evaluator(name = 'ogbl-biokg')
print(evaluator.expected_input_format) 
print(evaluator.expected_output_format)

==== Expected input format of Evaluator for ogbl-biokg
{'y_pred_pos': y_pred_pos, 'y_pred_neg': y_pred_neg}
- y_pred_pos: numpy ndarray or torch tensor of shape (num_edge, ). Torch tensor on GPU is recommended for efficiency.
- y_pred_neg: numpy ndarray or torch tensor of shape (num_edge, num_nodes_neg). Torch tensor on GPU is recommended for efficiency.
y_pred_pos is the predicted scores for positive edges.
y_pred_neg is the predicted scores for negative edges. It needs to be a 2d matrix.
y_pred_pos[i] is ranked among y_pred_neg[i].
Note: As the evaluation metric is ranking-based, the predicted scores need to be different for different edges.
==== Expected output format of Evaluator for ogbl-biokg
{'hits@1_list': hits@1_list, 'hits@3_list': hits@3_list, 
'hits@10_list': hits@10_list, 'mrr_list': mrr_list}
- mrr_list (list of float): list of scores for calculating MRR 
- hits@1_list (list of float): list of scores for calculating Hits@1 
- hits@3_list (list of float): list of scores to

In [12]:
ds = pykeen.datasets.get_dataset(dataset=dataset)

training = ds.training.mapped_triples
training

tensor([[    0,     0, 80530],
        [    1,     0, 82314],
        [    2,     0, 66642],
        ...,
        [83803,    38, 64632],
        [83803,    38, 64911],
        [83803,    38, 64912]])

In [14]:
ds.entity_to_id
id_to_entity = {v:k for k,v in ds.entity_to_id.items()}
id_to_entity

{0: 'disease:0',
 1: 'disease:1',
 2: 'disease:10',
 3: 'disease:100',
 4: 'disease:1000',
 5: 'disease:10000',
 6: 'disease:10001',
 7: 'disease:10002',
 8: 'disease:10003',
 9: 'disease:10004',
 10: 'disease:10005',
 11: 'disease:10006',
 12: 'disease:10007',
 13: 'disease:10008',
 14: 'disease:10009',
 15: 'disease:1001',
 16: 'disease:10010',
 17: 'disease:10011',
 18: 'disease:10012',
 19: 'disease:10013',
 20: 'disease:10014',
 21: 'disease:10015',
 22: 'disease:10016',
 23: 'disease:10017',
 24: 'disease:10018',
 25: 'disease:10019',
 26: 'disease:1002',
 27: 'disease:10020',
 28: 'disease:10021',
 29: 'disease:10022',
 30: 'disease:10023',
 31: 'disease:10024',
 32: 'disease:10025',
 33: 'disease:10026',
 34: 'disease:10027',
 35: 'disease:10028',
 36: 'disease:10029',
 37: 'disease:1003',
 38: 'disease:10030',
 39: 'disease:10031',
 40: 'disease:10032',
 41: 'disease:10033',
 42: 'disease:10034',
 43: 'disease:10035',
 44: 'disease:10036',
 45: 'disease:10037',
 46: 'disease:1

In [29]:
mapped = []
for i in range(training.shape[0]):
    t = training[i]
    t0 = id_to_entity[t[0].item()]
    t2 = id_to_entity[t[2].item()]
    mapped.append([t0[:t0.find(':')], str(t[1].item()), t2[:t2.find(':')]])
mapped = np.array(mapped, dtype=np.str)

In [30]:
mapped[np.isin(mapped[:,1], [str(j) for j in range(2,40)])]

array([['drug', '29', 'drug'],
       ['drug', '29', 'drug'],
       ['drug', '29', 'drug'],
       ...,
       ['protein', '38', 'function'],
       ['protein', '38', 'function'],
       ['protein', '38', 'function']], dtype='<U10')

In [31]:
mapped[mapped[:,1] == '38']

array([['protein', '38', 'function'],
       ['protein', '38', 'function'],
       ['protein', '38', 'function'],
       ...,
       ['protein', '38', 'function'],
       ['protein', '38', 'function'],
       ['protein', '38', 'function']], dtype='<U10')

In [38]:
np.unique(mapped[(mapped[:,0] == 'drug') & (mapped[:,2] == 'drug')][:,1])

array(['10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '2',
       '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '3',
       '30', '31', '32', '33', '34', '4', '45', '47', '48', '49', '5',
       '50', '6', '7', '8', '9'], dtype='<U10')

In [39]:
ds.relation_to_id

{'0': 0,
 '1': 1,
 '10': 2,
 '11': 3,
 '12': 4,
 '13': 5,
 '14': 6,
 '15': 7,
 '16': 8,
 '17': 9,
 '18': 10,
 '19': 11,
 '2': 12,
 '20': 13,
 '21': 14,
 '22': 15,
 '23': 16,
 '24': 17,
 '25': 18,
 '26': 19,
 '27': 20,
 '28': 21,
 '29': 22,
 '3': 23,
 '30': 24,
 '31': 25,
 '32': 26,
 '33': 27,
 '34': 28,
 '35': 29,
 '36': 30,
 '37': 31,
 '38': 32,
 '39': 33,
 '4': 34,
 '40': 35,
 '41': 36,
 '42': 37,
 '43': 38,
 '44': 39,
 '45': 40,
 '46': 41,
 '47': 42,
 '48': 43,
 '49': 44,
 '5': 45,
 '50': 46,
 '6': 47,
 '7': 48,
 '8': 49,
 '9': 50}