# Knowledge Graph Embedding Partitioning

In [1]:
import glob
import os
import shutil
from os import listdir
from os.path import join

import matplotlib.pyplot as plt
import pandas as pd
from evaluation_framework.manager import FrameworkManager

from src.utils import *
from src.alignment import *

## Downloading and preparing the data

In [3]:
# Read the download links, one URL per line. Surrounding whitespace (including
# the trailing newline) is stripped and blank lines are skipped, so that no
# empty URL is handed to get_all_data.
with open("links2016-04.txt", "r") as link_file:
    links = [line.strip() for line in link_file if line.strip()]

get_all_data(links)

100%|██████████| 223M/223M [00:19<00:00, 11.3MiB/s] 


Decompressing: data\article_categories_en.ttl


100%|██████████| 42.8M/42.8M [00:03<00:00, 10.8MiB/s]


Decompressing: data\instance_types_en.ttl


100%|██████████| 134M/134M [00:20<00:00, 6.66MiB/s] 


Decompressing: data\instance_types_transitive_en.ttl


100%|██████████| 167M/167M [00:14<00:00, 11.1MiB/s] 


Decompressing: data\mappingbased_objects_en.ttl


100%|██████████| 42.3M/42.3M [00:03<00:00, 11.1MiB/s]


Decompressing: data\skos_categories_en.ttl


## Setting up jRDF2Vec

In [7]:
# Fetch the jRDF2Vec 1.3-SNAPSHOT jar from the project's GitHub repository and
# store it locally as jrdf2vec.jar (the file name all later cells invoke).
download("https://github.com/dwslab/jRDF2Vec/blob/jars/jars/jrdf2vec-1.3-SNAPSHOT.jar?raw=true", "jrdf2vec.jar")

100%|██████████| 24.8M/24.8M [00:02<00:00, 9.27MiB/s]


In [2]:
# Run jRDF2Vec's installation check. As the recorded output shows, this starts
# a local Python (Flask) server and connects to it on 127.0.0.1:1808.
!java -jar jrdf2vec.jar -checkInstallation

Using server port: 1808
06 Juni 2023 04:32:47 DEBUG [main] (RequestAddCookies.java:123) - CookieSpec selected: default
06 Juni 2023 04:32:47 DEBUG [main] (RequestAuthCache.java:77) - Auth cache not set in the context
06 Juni 2023 04:32:47 DEBUG [main] (PoolingHttpClientConnectionManager.java:267) - Connection request: [route: {}->http://127.0.0.1:1808][total available: 0; route allocated: 0 of 2; total allocated: 0 of 20]
06 Juni 2023 04:32:47 DEBUG [main] (PoolingHttpClientConnectionManager.java:312) - Connection leased: [id: 0][route: {}->http://127.0.0.1:1808][total available: 0; route allocated: 1 of 2; total allocated: 1 of 20]
06 Juni 2023 04:32:47 DEBUG [main] (MainClientExec.java:234) - Opening connection {}->http://127.0.0.1:1808
06 Juni 2023 04:32:47 DEBUG [main] (DefaultHttpClientConnectionOperator.java:139) - Connecting to /127.0.0.1:1808
 * Serving Flask app 'python_server'
 * Debug mode: off
06 Juni 2023 04:32:49 DEBUG [main] (DefaultHttpClientConnectionOperator.java:146)

## Running jRDF2Vec
### Baseline of the complete graph
#### First the walks

In [2]:
!java -Xmx32g -jar jrdf2vec.jar -graph ./data/data_copewm/mappingbased-objects_lang=en.nq -onlyWalks -walkDirectory ./walks/walks_complete > ./logs/walk_log_complete.txt

#### and then the embedding

In [3]:
# Train only (-onlyTraining) a 100-dimensional embedding from the walks stored
# in the given walk directory; stdout is redirected to the embedding log.
!java -Xmx32g -jar jrdf2vec.jar -onlyTraining -dimension 100 -walkDirectory ./walks/walks_complete_nq > ./logs/embedding_log_complete.txt

### Horizontal partitions
#### First the walks

In [20]:
# Create one walk directory per horizontal partition (0-9). makedirs with
# exist_ok=True also creates missing parent folders and keeps the cell
# re-runnable when the directories already exist.
for i in range(10):
    os.makedirs("./walks/walks_horizontal/walks_horizontal_{0}".format(i), exist_ok=True)

In [3]:
# Hand every entry of the horizontal data folder to the decompress helper.
horizontal_data_dir = "./data/data_horizontal/"
for entry in listdir(horizontal_data_dir):
    decompress(join(horizontal_data_dir, entry))

Decompressing: ./data/data_horizontal/chunk0.nq
Decompressing: ./data/data_horizontal/chunk1.nq
Decompressing: ./data/data_horizontal/chunk2.nq
Decompressing: ./data/data_horizontal/chunk3.nq
Decompressing: ./data/data_horizontal/chunk4.nq
Decompressing: ./data/data_horizontal/chunk5.nq
Decompressing: ./data/data_horizontal/chunk6.nq
Decompressing: ./data/data_horizontal/chunk7.nq
Decompressing: ./data/data_horizontal/chunk8.nq
Decompressing: ./data/data_horizontal/chunk9.nq


In [4]:
for i in range(0,10):
    !java -Xmx32g -jar jrdf2vec.jar -graph "./data/data_horizontal/chunk{i}.nq" -onlyWalks -walkDirectory "./walks/walks_horizontal/walks_horizontal_{i}" > "./logs/horizontal_logs/walk_log_horizontal_{i}.txt"

#### and then the embedding

In [2]:
# Create one model directory per horizontal partition (0-9). makedirs with
# exist_ok=True also creates the missing parent folders and keeps the cell
# re-runnable when the directories already exist.
for i in range(10):
    os.makedirs("./models/model_horizontal/model_horizontal_{0}".format(i), exist_ok=True)

In [24]:
# Train the 100-dimensional embedding for ONE horizontal partition. `nr` is set
# by hand and this cell is re-run once per partition: according to the original
# note, running the training in a loop would start all runs at the same time.
nr = 9
!java -Xmx32g -jar jrdf2vec.jar -onlyTraining -dimension 100 -walkDirectory "./walks/walks_horizontal/walks_horizontal_{nr}" > "./logs/horizontal_logs/embedding_log_horizontal_{nr}.txt"

In [25]:
# Move the training results of partition `nr` out of its walk directory into
# the matching model directory: every file matching m* (presumably the model
# files) plus vectors.txt.
walk_dir = f"./walks/walks_horizontal/walks_horizontal_{nr}"
model_dir = f"./models/model_horizontal/model_horizontal_{nr}/"

for model_file in glob.glob(f"{walk_dir}/m*"):
    shutil.move(model_file, model_dir)
shutil.move(f"{walk_dir}/vectors.txt", model_dir)

'./models/model_horizontal/model_horizontal_9/vectors.txt'

In [49]:
# Create one walk directory per subject-based partition (0-9). makedirs with
# exist_ok=True also creates the missing ./walks parent and keeps the cell
# re-runnable when the directories already exist.
for i in range(10):
    os.makedirs("./walks/walks_subject_{0}".format(i), exist_ok=True)

In [50]:
# Hand every entry of the subject-based data folder to the decompress helper.
subject_data_dir = "./data/data_subject/"
for entry in listdir(subject_data_dir):
    decompress(join(subject_data_dir, entry))

Decompressing: ./data/data_subject/chunk0.nq
Decompressing: ./data/data_subject/chunk1.nq
Decompressing: ./data/data_subject/chunk2.nq
Decompressing: ./data/data_subject/chunk3.nq
Decompressing: ./data/data_subject/chunk4.nq
Decompressing: ./data/data_subject/chunk5.nq
Decompressing: ./data/data_subject/chunk6.nq
Decompressing: ./data/data_subject/chunk7.nq
Decompressing: ./data/data_subject/chunk8.nq
Decompressing: ./data/data_subject/chunk9.nq


In [51]:
for i in range(0,10):
    !java -Xmx32g -jar jrdf2vec.jar -graph "./data/data_subject/chunk{i}.nq" -onlyWalks -walkDirectory "./walks/walks_subject_{i}" > "./logs/walk_log_subject_{i}.txt"

#### and then the embeddings

In [52]:
# Create one model directory per subject-based partition (0-9). makedirs with
# exist_ok=True also creates the missing ./models parent and keeps the cell
# re-runnable when the directories already exist.
for i in range(10):
    os.makedirs("./models/model_subject_{0}".format(i), exist_ok=True)

In [79]:
# Train the 100-dimensional embedding for ONE subject-based partition. `nr` is
# set by hand and this cell is re-run once per partition: according to the
# original note, running the training in a loop would start all runs at the
# same time.
nr = 6
!java -Xmx32g -jar jrdf2vec.jar -onlyTraining -dimension 100 -walkDirectory "./walks/walks_subject_{nr}" > "./logs/embedding_log_subject_{nr}.txt"

In [80]:
# Move the training results of partition `nr` out of its walk directory into
# the matching model directory: every file matching m* (presumably the model
# files) plus vectors.txt.
walk_dir = f"./walks/walks_subject_{nr}"
model_dir = f"./models/model_subject_{nr}/"

for model_file in glob.glob(f"{walk_dir}/m*"):
    shutil.move(model_file, model_dir)
shutil.move(f"{walk_dir}/vectors.txt", model_dir)

'./models/model_subject_6/vectors.txt'

### Min Edge Cut Partitions
#### First the walks

In [2]:
# Create one walk directory per min-edge-cut partition (0-9). makedirs with
# exist_ok=True also creates the missing ./walks parent and keeps the cell
# re-runnable when the directories already exist.
for i in range(10):
    os.makedirs("./walks/walks_minedgecut_{0}".format(i), exist_ok=True)

In [3]:
# Hand every entry of the min-edge-cut data folder to the decompress helper.
minedgecut_data_dir = "./data/data_minedgecut/"
for entry in listdir(minedgecut_data_dir):
    decompress(join(minedgecut_data_dir, entry))

Decompressing: ./data/data_minedgecut/chunk0.nq
Decompressing: ./data/data_minedgecut/chunk1.nq
Decompressing: ./data/data_minedgecut/chunk2.nq
Decompressing: ./data/data_minedgecut/chunk3.nq
Decompressing: ./data/data_minedgecut/chunk4.nq
Decompressing: ./data/data_minedgecut/chunk5.nq
Decompressing: ./data/data_minedgecut/chunk6.nq
Decompressing: ./data/data_minedgecut/chunk7.nq
Decompressing: ./data/data_minedgecut/chunk8.nq
Decompressing: ./data/data_minedgecut/chunk9.nq


In [4]:
for i in range(0,10):
    !java -Xmx32g -jar jrdf2vec.jar -graph "./data/data_minedgecut/chunk{i}.nq" -onlyWalks -walkDirectory "./walks/walks_minedgecut_{i}" > "./logs/walk_log_minedgecut_{i}.txt"

#### and then the embeddings

In [5]:
# Create one model directory per min-edge-cut partition (0-9). makedirs with
# exist_ok=True also creates the missing ./models parent and keeps the cell
# re-runnable when the directories already exist.
for i in range(10):
    os.makedirs("./models/model_minedgecut_{0}".format(i), exist_ok=True)

In [24]:
# Train the 100-dimensional embedding for ONE min-edge-cut partition. `nr` is
# set by hand and this cell is re-run once per partition: according to the
# original note, running the training in a loop would start all runs at the
# same time.
nr = 9
!java -Xmx32g -jar jrdf2vec.jar -onlyTraining -dimension 100 -walkDirectory "./walks/walks_minedgecut_{nr}" > "./logs/embedding_log_minedgecut_{nr}.txt"

In [25]:
# Move the training results of partition `nr` out of its walk directory into
# the matching model directory: every file matching m* (presumably the model
# files) plus vectors.txt.
walk_dir = f"./walks/walks_minedgecut_{nr}"
model_dir = f"./models/model_minedgecut_{nr}/"

for model_file in glob.glob(f"{walk_dir}/m*"):
    shutil.move(model_file, model_dir)
shutil.move(f"{walk_dir}/vectors.txt", model_dir)

'./models/model_minedgecut_9/vectors.txt'

### Alignment

In [6]:
# NOTE(review): the recorded run of this cell ended in
# `TypeError: cannot unpack non-iterable int object`. Unlike the other model
# paths in this notebook, the argument has no ./models/... prefix -- confirm
# which path and arguments absolute_orientation expects.
absolute_orientation("model_horizontal_1/vectors.txt")

TypeError: cannot unpack non-iterable int object

## Evaluation with GEval
If the flag that produces a GEval-compatible txt file was not set earlier, a model or vector file can still be converted into a txt file like this:

In [None]:
# Template command: replace ./path-to-your-model-or-vector-file with the actual
# model or vector file before running.
!java -jar jrdf2vec.jar -generateTextVectorFile ./path-to-your-model-or-vector-file

Helper function to evaluate a model one task at a time:

In [2]:
def evaluate(model_path: str = None, task: str = None, vector_size: int = 100):
    """Run a single GEval task on one txt vector file.

    Args:
        model_path: Path to the txt vector file to evaluate. Required.
        task: Name of the task to run; one of Classification, Regression,
            Clustering, EntityRelatedness, DocumentSimilarity or
            SemanticAnalogies. Required.
        vector_size: Dimensionality of the vectors in the file. Defaults to
            100, the dimension used for all trainings in this notebook.

    Raises:
        AssertionError: If ``task`` or ``model_path`` is None.
    """
    assert task is not None, "No task given, please choose out of Classification, Regression, Clustering, EntityRelatedness, DocumentSimilarity or SemanticAnalogies."
    assert model_path is not None, "No path to a model given, please specify one."

    evaluation_manager = FrameworkManager()
    # Sequential run of exactly one task, with the framework's debugging mode on.
    evaluation_manager.evaluate(
        model_path,
        parallel=False,
        tasks=[task],
        debugging_mode=True,
        vector_file_format="txt",
        vector_size=vector_size,
    )

### The embedding of the complete graph

In [2]:
# Evaluate the embedding of the complete graph. No `tasks` filter is passed, so
# the framework's default task selection applies.
complete_vectors = join("models", "model_complete_nq", "vectors.txt")
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(
    complete_vectors,
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.7461904761904762
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.6164285714285714
Classification model initialized
Classification training...
Classification C45 None accuracy 0.598095238095238
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.497857

### Of the horizontal partitions

In [11]:
# Merge the per-partition vector files into one file. The horizontal partition
# models are created and filled under ./models/model_horizontal/model_horizontal_<i>/,
# so the pattern has to descend into that folder: assuming the pattern is
# expanded as a glob, `*` does not match across path separators and the former
# "./models/model_horizontal_*/" pattern would not find them.
# Third argument: presumably controls whether duplicate entities are kept
# (True here, False for the `_nd` variant) -- verify against combine_vectors.
combine_vectors("./models/model_horizontal/model_horizontal_*/vectors.txt", "./models/model_horizontal/vectors.txt", True)

In [5]:
# Count repeated entities in the combined horizontal vector file: a line whose
# first space-separated token has been seen before increases `count`.
entities = set()
count = 0

with open("./models/model_horizontal/vectors.txt", "r") as infile:
    for line in infile:
        entity, _, _ = line.partition(' ')
        if entity in entities:
            count += 1
            continue
        entities.add(entity)

print(count)

3090514


In [12]:
# Merge the per-partition vector files into the `_nd` variant. The horizontal
# partition models are created and filled under
# ./models/model_horizontal/model_horizontal_<i>/, so the pattern has to descend
# into that folder: assuming the pattern is expanded as a glob, `*` does not
# match across path separators and the former "./models/model_horizontal_*/"
# pattern would not find them.
# Third argument False: presumably drops duplicate entities ("nd") -- verify
# against combine_vectors.
combine_vectors("./models/model_horizontal/model_horizontal_*/vectors.txt", "./models/model_horizontal/vectors_nd.txt", False)

In [8]:
# Evaluate the combined horizontal vectors (vectors.txt). No `tasks` filter is
# passed, so the framework's default task selection applies.
horizontal_vectors = join("models", "model_horizontal", "vectors.txt")
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(
    horizontal_vectors,
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.3682241623177096
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.5883065892796175
Classification model initialized
Classification training...
Classification C45 None accuracy 0.4615275813295616
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.495556747793006
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.495556747793006
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.4955567

In [2]:
# Evaluate the `_nd` variant of the combined horizontal vectors. No `tasks`
# filter is passed, so the framework's default task selection applies.
horizontal_nd_vectors = join("models", "model_horizontal", "vectors_nd.txt")
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(
    horizontal_nd_vectors,
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.4447619047619048
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.6071428571428571
Classification model initialized
Classification training...
Classification C45 None accuracy 0.4357142857142858
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.49785

### Of the min edge cut partitions

In [13]:
# Merge the vector files of all min-edge-cut partitions into one file.
# Third argument: presumably controls whether duplicate entities are kept
# (True here, False for the `_nd` variant) -- verify against combine_vectors.
combine_vectors("./models/model_minedgecut_*/vectors.txt", "./models/model_minedgecut/vectors.txt", True)

In [15]:
# Merge the vector files of all min-edge-cut partitions into the `_nd` variant.
# Third argument False: presumably drops duplicate entities ("nd") -- verify
# against combine_vectors.
combine_vectors("./models/model_minedgecut_*/vectors.txt", "./models/model_minedgecut/vectors_nd.txt", False)

In [8]:
# Classification task on the combined min-edge-cut vectors (vectors.txt).
evaluate(join("models", "model_minedgecut", "vectors.txt"), "Classification")

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.3765949306551889
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.5597513151602104
Classification model initialized
Classification training...
Classification C45 None accuracy 0.4711262553802008
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4960808225729316
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4960808225729316
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.49853

In [4]:
# Regression task on the combined min-edge-cut vectors (vectors.txt).
evaluate(join("models", "model_minedgecut", "vectors.txt"), "Regression")

Start evaluation...
TXT data manager initialized
Created evaluation manager
Regression data manager initialized
Regression task manager initialized
Regression : Ignored data: 3
Regression : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Regression model initialized
Regression training...
LR None root_mean_squared_error 20.24121006374304
Regression model initialized
Regression training...
KNN K=3 root_mean_squared_error 20.204437570786272
Regression model initialized
Regression training...
M5 None root_mean_squared_error 26.718012033830426
Regression model initialized
Regression training...
LR None root_mean_squared_error 20.265947268268835
Regression model initialized
Regression training...
KNN K=3 root_mean_squared_error 20.48915561682966
Regression model initialized
Regression training...
M5 None root_mean_squared_error 27.581166457346775
Regression model initialized
Regression training...
LR None root_

In [4]:
# Clustering task on the combined min-edge-cut vectors (vectors.txt).
evaluate(join("models", "model_minedgecut", "vectors.txt"), "Clustering")

Start evaluation...
TXT data manager initialized
Created evaluation manager
Clustering data manager initialized
Clustering task manager initialized
Clustering: Ignored data : 353
Clustering : Ignored data: ['http://dbpedia.org/resource/Zongo', 'http://dbpedia.org/resource/Kumenan,_Okayama', 'http://dbpedia.org/resource/Bisheaba', "http://dbpedia.org/resource/St_Michael's_Church,_Michael_Patnam", 'http://dbpedia.org/resource/Hirata,_Shimane', 'http://dbpedia.org/resource/Shere,_Nigeria', 'http://dbpedia.org/resource/Shrinking_cities', 'http://dbpedia.org/resource/Hayashima,_Okayama', 'http://dbpedia.org/resource/Lekhgaun,_Seti', 'http://dbpedia.org/resource/Sarigam_INA', 'http://dbpedia.org/resource/Ausa', 'http://dbpedia.org/resource/Bhinder,_Udaipur', 'http://dbpedia.org/resource/James_Nelson_Lee', 'http://dbpedia.org/resource/Albany,_Decatur', 'http://dbpedia.org/resource/Al-Qusayr,_Egypt', 'http://dbpedia.org/resource/Kotwa,_India', 'http://dbpedia.org/resource/Asuke,_Aichi', 'http:

In [5]:
# EntityRelatedness task on the combined min-edge-cut vectors (vectors.txt).
evaluate(join("models", "model_minedgecut", "vectors.txt"), "EntityRelatedness")

Start evaluation...
TXT data manager initialized
Created evaluation manager
Entity relatedness data manager initialized
Entity relatedness task manager initialized
Entity relatedness: Ignored data: 0
Entity relatedness: Ignored data: 1
Entity relatedness : Ignored data: http://dbpedia.org/resource/Infinite_Loop_(street)
Entity relatedness: Ignored data: 1
Entity relatedness : Ignored data: http://dbpedia.org/resource/Bing
Entity relatedness: Ignored data: 1
Entity relatedness : Ignored data: http://dbpedia.org/resource/Crunchie
Entity relatedness: Ignored data: 0
Entity relatedness: Ignored data: 2
Entity relatedness : Ignored data: http://dbpedia.org/resource/IBM_DB2
Entity relatedness : Ignored data: http://dbpedia.org/resource/Smarter_Planet
Entity relatedness: Ignored data: 0
Entity relatedness: Ignored data: 1
Entity relatedness : Ignored data: http://dbpedia.org/resource/Rusty_Ryan
Entity relatedness: Ignored data: 0
Entity relatedness: Ignored data: 0
Entity relatedness: Ignored

In [3]:
# DocumentSimilarity task on the combined min-edge-cut vectors (vectors.txt).
evaluate(join("models", "model_minedgecut", "vectors.txt"), "DocumentSimilarity")

Start evaluation...
TXT data manager initialized
Created evaluation manager
Document similarity data manager initialized
Document Similarity task manager initialized
Document similarity: Ignored data : 50
Document similarity : Ignored data: ['http://dbpedia.org/resource/Interim_leader', 'http://dbpedia.org/resource/Interim_leader', 'http://dbpedia.org/resource/Interim', 'http://dbpedia.org/resource/Adenosine_monophosphate', 'http://dbpedia.org/resource/Earnings_growth', 'http://dbpedia.org/resource/Earnings', 'http://dbpedia.org/resource/Zimbabwe_Republic_Police', 'http://dbpedia.org/resource/Terrorist_training_camp', 'http://dbpedia.org/resource/Kurdistan_Uyezd', 'http://dbpedia.org/resource/Race_and_ethnicity_in_the_United_States_Census', 'http://dbpedia.org/resource/Race_and_ethnicity_in_the_United_States_Census', 'http://dbpedia.org/resource/Tahir_Jalil_Habbush_al-Tikriti', 'http://dbpedia.org/resource/Abbas_Khalaf', 'http://dbpedia.org/resource/Smoking_gun', 'http://dbpedia.org/re

In [3]:
# SemanticAnalogies task on the combined min-edge-cut vectors (vectors.txt).
evaluate(join("models", "model_minedgecut", "vectors.txt"), "SemanticAnalogies")

Start evaluation...
TXT data manager initialized
Created evaluation manager
Semantic analogies data manager initialized
SemanticAnalogies task manager initialized
Semantic analogies:0 ignored quadruples
SemanticAnalogies model initialized
SemanticAnalogies : ACCURACY TOP 2: 0.65% (329/506)
Semantic analogies:0 ignored quadruples
SemanticAnalogies model initialized
SemanticAnalogies : ACCURACY TOP 2: 0.36% (1621/4524)
Semantic analogies:112 ignored quadruples
Semantic analogies: Ignored quadruplet ['http://dbpedia.org/resource/Algeria', 'http://dbpedia.org/resource/Canada', 'http://dbpedia.org/resource/Dinar', 'http://dbpedia.org/resource/Dollar']
Semantic analogies: Ignored quadruplet ['http://dbpedia.org/resource/Algeria', 'http://dbpedia.org/resource/United_States', 'http://dbpedia.org/resource/Dinar', 'http://dbpedia.org/resource/Dollar']
Semantic analogies: Ignored quadruplet ['http://dbpedia.org/resource/Angola', 'http://dbpedia.org/resource/Canada', 'http://dbpedia.org/resource/A

In [2]:
# Evaluate the `_nd` variant of the combined min-edge-cut vectors. No `tasks`
# filter is passed, so the framework's default task selection applies.
minedgecut_nd_vectors = join("models", "model_minedgecut", "vectors_nd.txt")
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(
    minedgecut_nd_vectors,
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.45404761904761903
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.5650000000000001
Classification model initialized
Classification training...
Classification C45 None accuracy 0.5407142857142857
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.4978

: 

### Of the subject based partitions

**TODO:** The evaluation currently fails at the last classification task; the remaining tasks have not been tested yet.

In [2]:
# Merge the vector files of all subject-based partitions into one file.
# NOTE(review): unlike the horizontal and min-edge-cut calls, no third argument
# is passed here, so combine_vectors uses its default -- confirm that this is
# the intended variant.
combine_vectors("./models/model_subject_*/vectors.txt", "./models/model_subject/vectors.txt")

In [2]:
# Evaluate the combined subject-based vectors (vectors.txt). No `tasks` filter
# is passed, so the framework's default task selection applies.
subject_vectors = join("models", "model_subject", "vectors.txt")
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(
    subject_vectors,
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.33646102343376383
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.5807914873266379
Classification model initialized
Classification training...
Classification C45 None accuracy 0.4598469631755141
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4960808225729316
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4960808225729316
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.4980