In [None]:
# import sys
# !{sys.executable} -m pip install pykeen

In [1]:
import matplotlib # For graph plotting
from pykeen.pipeline import pipeline

# Import pykeen library
import pykeen
from pykeen.models.predict import get_head_prediction_df
from pykeen.models.predict import get_all_prediction_df
from pykeen.models.predict import get_tail_prediction_df
from pykeen.models.predict import get_relation_prediction_df

# 'Nations' dataset ships & installs with pykeen
# In Windows, it is found under the python directory, for example:
# C:\Users\USER\AppData\Roaming\Python\Python38\site-packages\pykeen\datasets\nations
from pykeen.datasets import Nations

In [None]:
#from pykeen.pipeline import pipeline
dataset = Nations()
training_triples_factory = dataset.training

In [21]:
dataset

Nations(training_path="C:\Users\user\AppData\Local\Programs\Python\Python39\Lib\site-packages\pykeen\datasets\nations\train.txt", testing_path="C:\Users\user\AppData\Local\Programs\Python\Python39\Lib\site-packages\pykeen\datasets\nations\test.txt", validation_path="C:\Users\user\AppData\Local\Programs\Python\Python39\Lib\site-packages\pykeen\datasets\nations\valid.txt")

In [24]:
with open(r'C:\Users\user\AppData\Local\Programs\Python\Python39\Lib\site-packages\pykeen\datasets\nations\test.txt', ) as f:
    test = f.readlines()
test

['poland\tngoorgs3\tussr\n',
 'india\tintergovorgs\tisrael\n',
 'israel\tcommonbloc1\tcuba\n',
 'usa\tindependence\tchina\n',
 'burma\treldiplomacy\tussr\n',
 'uk\tpprotests\tussr\n',
 'indonesia\tmilitaryactions\tuk\n',
 'poland\tweightedunvote\tnetherlands\n',
 'china\tintergovorgs3\tindia\n',
 'burma\trelintergovorgs\tusa\n',
 'israel\tngoorgs3\tbrazil\n',
 'india\tintergovorgs\tegypt\n',
 'netherlands\tngo\tindia\n',
 'ussr\tbooktranslations\tusa\n',
 'usa\tmilitaryalliance\tnetherlands\n',
 'india\tcommonbloc1\tpoland\n',
 'cuba\treldiplomacy\tegypt\n',
 'usa\tweightedunvote\tisrael\n',
 'cuba\tngoorgs3\tisrael\n',
 'china\tembassy\tburma\n',
 'netherlands\tembassy\tindia\n',
 'egypt\tembassy\tuk\n',
 'india\trelexports\tussr\n',
 'indonesia\taccusation\tuk\n',
 'egypt\tofficialvisits\tussr\n',
 'brazil\tintergovorgs\tcuba\n',
 'china\trelexports\tcuba\n',
 'ussr\trelintergovorgs\tbrazil\n',
 'usa\tngoorgs3\tnetherlands\n',
 'jordan\treltreaties\tindonesia\n',
 'indonesia\tviolent

In [2]:
# Pick a model, TransE in our case
from pykeen.models import TransE
model = TransE(triples_factory=training_triples_factory)


# Pick an optimizer from Torch
# Adam: Adaptive Moment Estimation is an algorithm for optimization technique for gradient descent
from torch.optim import Adam
optimizer = Adam(params=model.get_grad_params())

No random seed is specified. This may lead to non-reproducible results.


* Pick a training approach (sLCWA or LCWA)  
* Whenever we have a knowledge graph we need to make certain assumptions to draw inferences from it.


* Closed World Assumption is one such assumption. It assumes that if a link is not present between two entities, then that link is false or the probability of a relationship between these entities is always zero. 


* We can immediately see problems with this assumption.   
* Once we assume this, we can’t predict any new links in the knowledge graph.   
* Collecting Real-world Data is a challenging task and lots of relationships are not captured in the knowledge graph. This assumption turns all the missing data into false values.  


* Local Closed World Assumption(LCWA) on the other hand, solves this problem by specifying a predicate over areas that says whether the area of the knowledge graph is complete or not.   
* Stochastic Local Closed World Assumption(sLCWA) is a stochastic version of the LCWA.  

In [4]:
from pykeen.training import SLCWATrainingLoop
training_loop = SLCWATrainingLoop(
    model=model,
    triples_factory=training_triples_factory,
    optimizer=optimizer,
)

In [5]:
# ## Experiment training with different epochs and batch sizes

# Train it now
_ = training_loop.train(
    triples_factory=training_triples_factory,
    num_epochs=5,
    batch_size=256,
)

Training epochs on cuda:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

In [6]:
# Pick an evaluator
# 'Nations' dataset also has a test set that can be used for evaluation  
from pykeen.evaluation import RankBasedEvaluator
evaluator = RankBasedEvaluator()

# Get triples to test
mapped_triples = dataset.testing.mapped_triples

In [7]:
## Evaluate the results obtained by printing and inspecting it


evaluation_results = evaluator.evaluate(model, mapped_triples, batch_size=256, additional_filter_triples=[dataset.training.mapped_triples])
print(evaluation_results)

Evaluating on cuda:   0%|          | 0.00/201 [00:00<?, ?triple/s]

RankBasedMetricResults(arithmetic_mean_rank={'head': {'realistic': 5.134328358208955, 'optimistic': 5.134328358208955, 'pessimistic': 5.134328358208955}, 'tail': {'realistic': 5.288557213930348, 'optimistic': 5.288557213930348, 'pessimistic': 5.288557213930348}, 'both': {'realistic': 5.211442786069652, 'optimistic': 5.211442786069652, 'pessimistic': 5.211442786069652}}, geometric_mean_rank={'head': {'realistic': 4.366388522795558, 'optimistic': 4.366388522795558, 'pessimistic': 4.366388522795558}, 'tail': {'realistic': 4.581606279479136, 'optimistic': 4.581606279479136, 'pessimistic': 4.581606279479136}, 'both': {'realistic': 4.472703106029479, 'optimistic': 4.472703106029479, 'pessimistic': 4.472703106029479}}, median_rank={'head': {'realistic': 5.0, 'optimistic': 5.0, 'pessimistic': 5.0}, 'tail': {'realistic': 5.0, 'optimistic': 5.0, 'pessimistic': 5.0}, 'both': {'realistic': 5.0, 'optimistic': 5.0, 'pessimistic': 5.0}}, harmonic_mean_rank={'head': {'realistic': 3.723962162527889, 'o

In [8]:
## PyKeen pipeline provides a high-level entry point to access the models. It is an alternative to the above


result = pipeline(
   training= dataset.training,
   testing=  dataset.testing ,
   model = 'TransE',
   model_kwargs=dict(embedding_dim=2),
   random_seed=1,
   device='cpu',    
)

Training epochs on cpu:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cpu:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cpu:   0%|          | 0/7 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Currently automatic memory optimization only supports GPUs, but you're using a CPU. Therefore, the batch_size will be set to the default value.
INFO:pykeen.evaluation.evaluator:No evaluation batch_size provided. Setting batch_size to '32'.


Evaluating on cpu:   0%|          | 0.00/201 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.08s seconds


In [13]:
# ## Explore 'RotatE' instead of 'TransE' as a new model and note your observations


# First comes the 'TransE'
model= result.model

# Predict the tail
df = get_tail_prediction_df(result.model, 'brazil', 'accusation', triples_factory=result.training)
display(df)

Unnamed: 0,tail_id,tail_label,score,in_training
12,12,usa,-4.593426,False
5,5,india,-4.795978,False
11,11,uk,-4.809861,False
9,9,netherlands,-4.810874,False
10,10,poland,-4.901611,False
13,13,ussr,-4.947122,False
0,0,brazil,-4.955655,False
2,2,china,-5.020988,False
6,6,indonesia,-5.076213,False
4,4,egypt,-5.145313,False


In [14]:
# train new 'RotatE' model
result = pipeline(
    dataset='Nations',
    model='RotatE', epochs= 5
)
model= result.model

# Then predict the tail
df = get_tail_prediction_df(result.model, 'brazil', 'accusation', triples_factory=result.training)
display(df)



Training epochs on cuda:   0%|          | 0/5 [00:00<?, ?epoch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

Training batches on cuda:   0%|          | 0/7 [00:00<?, ?batch/s]

INFO:pykeen.evaluation.evaluator:Starting batch_size search for evaluation now...
INFO:pykeen.evaluation.evaluator:Concluded batch_size search with batch_size=201.


Evaluating on cuda:   0%|          | 0.00/201 [00:00<?, ?triple/s]

INFO:pykeen.evaluation.evaluator:Evaluation took 0.09s seconds


Unnamed: 0,tail_id,tail_label,score,in_training
12,12,usa,-4.470839,False
11,11,uk,-4.592726,False
13,13,ussr,-4.75182,False
10,10,poland,-4.886285,False
9,9,netherlands,-4.959569,False
2,2,china,-4.986444,False
7,7,israel,-4.991738,False
0,0,brazil,-5.0306,False
4,4,egypt,-5.067224,False
1,1,burma,-5.082107,False


In [15]:
df = get_head_prediction_df(result.model, 'accusation', 'brazil', triples_factory=result.training)
display(df)


Unnamed: 0,head_id,head_label,score,in_training
12,12,usa,-4.594889,False
11,11,uk,-4.79248,False
9,9,netherlands,-4.906074,False
10,10,poland,-5.026696,False
0,0,brazil,-5.0306,False
7,7,israel,-5.036422,False
2,2,china,-5.047518,False
13,13,ussr,-5.072237,False
5,5,india,-5.073205,False
6,6,indonesia,-5.121127,False


In [16]:
df = get_relation_prediction_df(result.model, 'brazil', 'uk', triples_factory=result.training)
display(df)



Unnamed: 0,relation_id,relation_label,score,in_training
2,2,attackembassy,-4.432677,False
46,46,timesincewar,-4.530785,False
45,45,timesinceally,-4.551143,True
8,8,commonbloc2,-4.551418,True
49,49,treaties,-4.554054,False
1,1,aidenemy,-4.567594,False
16,16,expeldiplomats,-4.571178,False
38,38,relintergovorgs,-4.584063,True
40,40,relstudents,-4.584818,False
11,11,duration,-4.592298,False


In [17]:
# Get scores for *all* triples
df = get_all_prediction_df(model, triples_factory=result.training)
display(df)



scoring:   0%|          | 0.00/770 [00:00<?, ?batch/s]

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,11,uk,32,relbooktranslations,11,uk,-3.828930,False
1,12,usa,32,relbooktranslations,12,usa,-3.948582,False
2,11,uk,34,releconomicaid,11,uk,-4.016978,False
3,11,uk,10,dependent,12,usa,-4.025353,False
4,11,uk,37,relexports,11,uk,-4.042844,False
...,...,...,...,...,...,...,...,...
10775,1,burma,36,relexportbooks,1,burma,-5.927726,False
10776,1,burma,8,commonbloc2,1,burma,-5.933246,False
10777,8,jordan,42,reltreaties,1,burma,-5.958352,False
10778,1,burma,54,weightedunvote,1,burma,-6.039948,False


In [18]:
# Get scores for top 15 triples
top_df = get_all_prediction_df(model, k=15, triples_factory=result.training)
display(top_df)



scoring:   0%|          | 0.00/770 [00:00<?, ?batch/s]

Unnamed: 0,head_id,head_label,relation_id,relation_label,tail_id,tail_label,score,in_training
0,11,uk,32,relbooktranslations,11,uk,-3.82893,False
1,12,usa,32,relbooktranslations,12,usa,-3.948582,False
2,11,uk,34,releconomicaid,11,uk,-4.016978,False
3,11,uk,10,dependent,12,usa,-4.025353,False
4,11,uk,37,relexports,11,uk,-4.042844,False
5,12,usa,27,ngo,11,uk,-4.043432,True
6,12,usa,25,negativebehavior,12,usa,-4.077655,False
7,12,usa,17,exportbooks,12,usa,-4.078964,False
8,11,uk,19,independence,12,usa,-4.08471,True
9,12,usa,18,exports3,11,uk,-4.091799,False
