<a href="https://colab.research.google.com/github/unt-iialab/info5731_spring2021/blob/main/covid-19/covid_19_kg_mining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Preliminaries

In [11]:
# Colab only: run this cell when the notebook is hosted on Google Colab.

# Colab-specific line magic that requests the TensorFlow 1.x runtime.
# NOTE(review): presumably this has to run before TensorFlow is first imported — confirm.
%tensorflow_version 1.x

In [12]:
import tensorflow as tf 

# Report which TensorFlow build this runtime provides.
print('TensorFlow  version: {}'.format(tf.__version__))

# Ask TensorFlow for the GPU device name; continue only when it is the
# first GPU device, otherwise abort the notebook with an error.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')

TensorFlow  version: 1.15.2
Found GPU at: /device:GPU:0


In [13]:
%%capture 
# Install AmpliGraph library
! pip install ampligraph

# Required to visualize embeddings with tensorboard projector, comment out if not required!
! pip install --user tensorboard

# Required to plot text on embedding clusters, comment out if not required!
! pip install --user git+https://github.com/Phlya/adjustText

In [14]:
# All imports used in this tutorial 
%tensorflow_version 1.x
import ampligraph
import numpy as np
import pandas as pd
import tensorflow as tf
from ampligraph.datasets import load_fb15k_237
from ampligraph.evaluation import train_test_split_no_unseen, evaluate_performance, mr_score, mrr_score, hits_at_n_score
from ampligraph.discovery import query_topn, discover_facts, find_clusters
from ampligraph.latent_features import TransE, ComplEx, HolE, DistMult, ConvE, ConvKB
from ampligraph.utils import save_model, restore_model

def display_aggregate_metrics(ranks):
    """Print aggregate ranking metrics for ``ranks``, one metric per line.

    The metrics are printed in this order: Mean Rank, Mean Reciprocal Rank,
    then Hits@N for N = 1, 10 and 100.

    Parameters
    ----------
    ranks
        Ranks to summarise; passed unchanged to ``mr_score``, ``mrr_score``
        and ``hits_at_n_score``.

    Returns
    -------
    None
        The function only prints.
    """
    print('Mean Rank:', mr_score(ranks))
    print('Mean Reciprocal Rank:', mrr_score(ranks))
    # The three Hits@N lines differ only in the cut-off N.
    for n in (1, 10, 100):
        print('Hits@{}:'.format(n), hits_at_n_score(ranks, n))

# Show which AmpliGraph release is installed in this runtime.
print('Ampligraph version: {version}'.format(version=ampligraph.__version__))

Ampligraph version: 1.3.2


# 2. Loading a Knowledge Graph dataset

In [15]:
from ampligraph.datasets import load_fb15k_237, load_wn18rr, load_yago3_10

In [16]:
import pandas as pd

# Remote CSV with the knowledge-graph triples; it is read without a header row.
URL = 'https://ampgraphenc.s3-eu-west-1.amazonaws.com/datasets/freebase-237-merged-and-remapped.csv'

# Read the file and label its three columns in the same call.
dataset = pd.read_csv(URL, header=None, names=['subject', 'predicate', 'object'])

# Preview the first five rows.
dataset.head(5)

Unnamed: 0,subject,predicate,object
0,"queens college, city university of new york",/education/educational_institution/students_gr...,carol leifer
1,digital equipment corporation,/business/business_operation/industry,computer hardware
2,/m/0drtv8,/award/award_ceremony/awards_presented./award/...,laurence mark
3,the departed,/award/award_winning_work/awards_won./award/aw...,leonardo dicaprio
4,marilyn manson,/people/person/profession,actor


In [17]:
# Prints the (rows, columns) shape tuple of the triples table, not a bare count.
print('Total triples in the KG:', dataset.shape)

Total triples in the KG: (310079, 3)


## 2.1 Create training, validation and test splits

In [18]:
from ampligraph.evaluation import train_test_split_no_unseen
# Step 1: hold out 500 triples as the validation set; everything else stays
# in `test_train` for the second split.
test_train, X_valid = train_test_split_no_unseen(dataset.values, test_size=500, seed=0)

# Step 2: from the remaining triples, hold out 1000 as the test set; what is
# left over is the training set.
X_train, X_test = train_test_split_no_unseen(test_train, test_size=1000, seed=0)

# Report the shape of the full dataset and of every split.
for split_name, split_data in (
    ('Total triples:', dataset),
    ('Size of train:', X_train),
    ('Size of valid:', X_valid),
    ('Size of test:', X_test),
):
    print(split_name, split_data.shape)

Total triples: (310079, 3)
Size of train: (308579, 3)
Size of valid: (500, 3)
Size of test: (1000, 3)


# 3. Model Training

In [19]:
from ampligraph.latent_features import TransE

# Train a KGE model
# All TransE hyperparameters are gathered in one mapping so they can be read
# (and tuned) in a single place before the model is built.
transe_config = {
    'k': 300,
    'epochs': 100,
    'eta': 1,
    'loss': 'multiclass_nll',
    'initializer': 'xavier',
    'initializer_params': {'uniform': False},
    'regularizer': 'LP',
    'regularizer_params': {'lambda': 0.001, 'p': 3},
    'optimizer': 'adam',
    'optimizer_params': {'lr': 0.0001},
    'seed': 0,
    'batches_count': 10,
    'verbose': True,
}
model = TransE(**transe_config)

# Fit the embeddings on the training split only.
model.fit(X_train)

# ----------------------
# Evaluate:
# Filtered evaluation with ranking strategy assigning worst rank to break ties

# Write the trained model to disk and load it back from the same file; the
# later cells restore it from 'TransE.pkl' as well.
from ampligraph.utils import save_model, restore_model
save_model(model, 'TransE.pkl')
model = restore_model('TransE.pkl')

# The filter is the union of all three splits, stacked row-wise.
X_filter = np.concatenate((X_train, X_valid, X_test), axis=0)

# Rank every test triple with the restored model, using the filter above.
ranks = evaluate_performance(
    X_test,
    model=model,
    filter_triples=X_filter,
)

# Ranks are computed per triple, so compare the two shapes.
print('Test set:', X_test.shape)
print('Size of ranks:', ranks.shape)

# Collapse the per-triple ranks into single-number summary metrics.
display_aggregate_metrics(ranks)
# ----------------------

Average Loss:   0.057714: 100%|██████████| 100/100 [00:14<00:00,  6.69epoch/s]
100%|██████████| 1000/1000 [00:05<00:00, 176.01it/s]


Test set: (1000, 3)
Size of ranks: (1000, 2)
Mean Rank: 671.9875
Mean Reciprocal Rank: 0.17562707148570464
Hits@1: 0.109
Hits@10: 0.304
Hits@100: 0.568


# 4. Knowledge Discovery

## 4.1 Triple completion

``` 
    <head, relation, ?> 
    <head, ?,        tail>
    <?,    relation, tail>
```

In [20]:
# Predict tail: complete <head, relation, ?>

from ampligraph.discovery import query_topn

# restore the previously saved model to save time
model = restore_model('TransE.pkl')

# Head and relation are fixed; the tail is left open (None) so that it is the
# element being predicted. No restriction on candidate entities or relations.
tail_query = {
    'head': 'missy elliott',
    'relation': '/people/person/profession',
    'tail': None,
    'ents_to_consider': None,
    'rels_to_consider': None,
}
triples, scores = query_topn(model, top_n=10, **tail_query)

# Print each returned triple next to its score.
for score, triple in zip(scores, triples):
    print('Score: {} \t {} '.format(score, triple))

Score: -9.816028594970703 	 ['missy elliott' '/people/person/profession' 'pianist'] 
Score: -9.915382385253906 	 ['missy elliott' '/people/person/profession' 'songwriter'] 
Score: -10.31920051574707 	 ['missy elliott' '/people/person/profession' 'record producer'] 
Score: -10.337419509887695 	 ['missy elliott' '/people/person/profession' 'musician'] 
Score: -10.363887786865234 	 ['missy elliott' '/people/person/profession' 'singer-songwriter'] 
Score: -10.407577514648438 	 ['missy elliott' '/people/person/profession' 'presenter'] 
Score: -10.422386169433594 	 ['missy elliott' '/people/person/profession' 'bandleader'] 
Score: -10.45327377319336 	 ['missy elliott' '/people/person/profession' 'model'] 
Score: -10.494043350219727 	 ['missy elliott' '/people/person/profession' 'poet'] 
Score: -10.511186599731445 	 ['missy elliott' '/people/person/profession' 'composer'] 


In [21]:
# Predict relation: complete <head, ?, tail>

# Head and tail are fixed; the relation is left open (None) so that it is the
# element being predicted. No restriction on candidate entities or relations.
relation_query = {
    'head': 'the departed',
    'relation': None,
    'tail': '/m/086k8',
    'ents_to_consider': None,
    'rels_to_consider': None,
}
triples, scores = query_topn(model, top_n=10, **relation_query)

# Print each returned triple next to its score.
for score, triple in zip(scores, triples):
    print('Score: {} \t {} '.format(score, triple))

Score: -7.314762592315674 	 ['the departed' '/film/film/production_companies' '/m/086k8'] 
Score: -7.363862991333008 	 ['the departed'
 '/award/award_winning_work/awards_won./award/award_honor/award_winner'
 '/m/086k8'] 
Score: -7.653811454772949 	 ['the departed' '/education/educational_institution/campuses' '/m/086k8'] 
Score: -7.687613487243652 	 ['the departed' '/location/hud_county_place/place' '/m/086k8'] 
Score: -7.924491882324219 	 ['the departed'
 '/education/educational_institution_campus/educational_institution'
 '/m/086k8'] 
Score: -8.341957092285156 	 ['the departed' '/film/film/produced_by' '/m/086k8'] 
Score: -8.381094932556152 	 ['the departed'
 '/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for'
 '/m/086k8'] 
Score: -8.501145362854004 	 ['the departed' '/film/film/written_by' '/m/086k8'] 
Score: -8.549186706542969 	 ['the departed'
 '/base/popstra/celebrity/breakup./base/popstra/breakup/participant'
 '/m/086k8'] 
Score: -8.555376052856