# Knowledge Graph Embedding Partitioning

In [2]:
import glob
import os
import shutil
from os import listdir
from os.path import join, dirname

import matplotlib.pyplot as plt
import pandas as pd
from evaluation_framework.manager import FrameworkManager
import evaluation_framework as geval

from src.utils import *
from src.alignment import *


## Getting the data and getting it ready to be used

In [3]:
# Collect the download links (one per line, without the trailing newline)
# and hand the whole list to get_all_data.
links = []
with open("links2016-04.txt", "r") as link_file:
    for raw_link in link_file:
        if raw_link.endswith("\n"):
            raw_link = raw_link[:-1]
        links.append(raw_link)

get_all_data(links)

100%|██████████| 223M/223M [00:19<00:00, 11.3MiB/s] 


Decompressing: data\article_categories_en.ttl


100%|██████████| 42.8M/42.8M [00:03<00:00, 10.8MiB/s]


Decompressing: data\instance_types_en.ttl


100%|██████████| 134M/134M [00:20<00:00, 6.66MiB/s] 


Decompressing: data\instance_types_transitive_en.ttl


100%|██████████| 167M/167M [00:14<00:00, 11.1MiB/s] 


Decompressing: data\mappingbased_objects_en.ttl


100%|██████████| 42.3M/42.3M [00:03<00:00, 11.1MiB/s]


Decompressing: data\skos_categories_en.ttl


## Setting up jRDF2vec

In [7]:
# Download the jRDF2Vec 1.3-SNAPSHOT jar and store it locally as jrdf2vec.jar.
jrdf2vec_jar_url = "https://github.com/dwslab/jRDF2Vec/blob/jars/jars/jrdf2vec-1.3-SNAPSHOT.jar?raw=true"
download(jrdf2vec_jar_url, "jrdf2vec.jar")

100%|██████████| 24.8M/24.8M [00:02<00:00, 9.27MiB/s]


In [2]:
# Run jRDF2Vec with its -checkInstallation flag before starting any long job.
!java -jar jrdf2vec.jar -checkInstallation

Using server port: 1808
06 Juni 2023 04:32:47 DEBUG [main] (RequestAddCookies.java:123) - CookieSpec selected: default
06 Juni 2023 04:32:47 DEBUG [main] (RequestAuthCache.java:77) - Auth cache not set in the context
06 Juni 2023 04:32:47 DEBUG [main] (PoolingHttpClientConnectionManager.java:267) - Connection request: [route: {}->http://127.0.0.1:1808][total available: 0; route allocated: 0 of 2; total allocated: 0 of 20]
06 Juni 2023 04:32:47 DEBUG [main] (PoolingHttpClientConnectionManager.java:312) - Connection leased: [id: 0][route: {}->http://127.0.0.1:1808][total available: 0; route allocated: 1 of 2; total allocated: 1 of 20]
06 Juni 2023 04:32:47 DEBUG [main] (MainClientExec.java:234) - Opening connection {}->http://127.0.0.1:1808
06 Juni 2023 04:32:47 DEBUG [main] (DefaultHttpClientConnectionOperator.java:139) - Connecting to /127.0.0.1:1808
 * Serving Flask app 'python_server'
 * Debug mode: off
06 Juni 2023 04:32:49 DEBUG [main] (DefaultHttpClientConnectionOperator.java:146)

## Running jRDF2vec
### Baseline of the complete graph
#### First the walks

In [2]:
!java -Xmx32g -jar jrdf2vec.jar -graph ./data/data_copewm/mappingbased-objects_lang=en.nq -onlyWalks -walkDirectory ./walks/walks_complete > ./logs/walk_log_complete.txt

#### and then the embedding

In [3]:
# Train 100-dimensional embeddings (-onlyTraining) from the walks stored in
# ./walks/walks_complete_nq; stdout is redirected to the embedding log.
!java -Xmx32g -jar jrdf2vec.jar -onlyTraining -dimension 100 -walkDirectory ./walks/walks_complete_nq > ./logs/embedding_log_complete.txt

### Horizontal partitions
#### First the walks

In [20]:
# Create one walk output directory per horizontal partition (0-9).
# os.makedirs with exist_ok=True also creates missing parent folders and
# makes the cell safe to re-run (os.mkdir raises if the directory exists).
for i in range(0, 10):
    os.makedirs("./walks/walks_horizontal/walks_horizontal_{0}".format(i), exist_ok=True)

In [3]:
# Run decompress on every entry of the horizontal-partition data folder.
horizontal_data_dir = "./data/data_horizontal/"
for archive_name in listdir(horizontal_data_dir):
    decompress(join(horizontal_data_dir, archive_name))

Decompressing: ./data/data_horizontal/chunk0.nq
Decompressing: ./data/data_horizontal/chunk1.nq
Decompressing: ./data/data_horizontal/chunk2.nq
Decompressing: ./data/data_horizontal/chunk3.nq
Decompressing: ./data/data_horizontal/chunk4.nq
Decompressing: ./data/data_horizontal/chunk5.nq
Decompressing: ./data/data_horizontal/chunk6.nq
Decompressing: ./data/data_horizontal/chunk7.nq
Decompressing: ./data/data_horizontal/chunk8.nq
Decompressing: ./data/data_horizontal/chunk9.nq


In [4]:
for i in range(0,10):
    # One walk-generation run per horizontal chunk: IPython substitutes {i}
    # into the shell command, so chunk i gets its own walk directory and log file.
    !java -Xmx32g -jar jrdf2vec.jar -graph "./data/data_horizontal/chunk{i}.nq" -onlyWalks -walkDirectory "./walks/walks_horizontal/walks_horizontal_{i}" > "./logs/horizontal_logs/walk_log_horizontal_{i}.txt"

#### and then the embedding

In [2]:
# Create one model directory per horizontal partition (0-9).
# os.makedirs with exist_ok=True also creates missing parent folders and
# makes the cell safe to re-run (os.mkdir raises if the directory exists).
for i in range(0, 10):
    os.makedirs("./models/model_horizontal/model_horizontal_{0}".format(i), exist_ok=True)

In [24]:
# Train the embedding for ONE horizontal partition, selected by `nr`.
# This is not wrapped in a loop: launching the runs from a loop would have
# them all run at the same time. Presumably `nr` is changed by hand and the
# cell re-run for each partition — confirm.
nr = 9
!java -Xmx32g -jar jrdf2vec.jar -onlyTraining -dimension 100 -walkDirectory "./walks/walks_horizontal/walks_horizontal_{nr}" > "./logs/horizontal_logs/embedding_log_horizontal_{nr}.txt"

In [25]:
# Move the training artefacts of partition `nr` (every file starting with "m",
# plus vectors.txt) from its walk directory into its model directory.
horizontal_walk_dir = "./walks/walks_horizontal/walks_horizontal_{0}".format(nr)
horizontal_model_dir = "./models/model_horizontal/model_horizontal_{0}/".format(nr)
for artefact in glob.glob(horizontal_walk_dir + "/m*"):
    shutil.move(artefact, horizontal_model_dir)
shutil.move(horizontal_walk_dir + "/vectors.txt", horizontal_model_dir)

'./models/model_horizontal/model_horizontal_9/vectors.txt'

### Min Edge Cut Partitions
#### First the walks

In [2]:
# Create one walk output directory per min-edge-cut partition (0-9).
# os.makedirs with exist_ok=True also creates missing parent folders and
# makes the cell safe to re-run (os.mkdir raises if the directory exists).
for i in range(0, 10):
    os.makedirs("./walks/walks_minedgecut/walks_minedgecut_{0}".format(i), exist_ok=True)

In [3]:
# Run decompress on every entry of the min-edge-cut data folder.
minedgecut_data_dir = "./data/data_minedgecut/"
for archive_name in listdir(minedgecut_data_dir):
    decompress(join(minedgecut_data_dir, archive_name))

Decompressing: ./data/data_minedgecut/chunk0.nq
Decompressing: ./data/data_minedgecut/chunk1.nq
Decompressing: ./data/data_minedgecut/chunk2.nq
Decompressing: ./data/data_minedgecut/chunk3.nq
Decompressing: ./data/data_minedgecut/chunk4.nq
Decompressing: ./data/data_minedgecut/chunk5.nq
Decompressing: ./data/data_minedgecut/chunk6.nq
Decompressing: ./data/data_minedgecut/chunk7.nq
Decompressing: ./data/data_minedgecut/chunk8.nq
Decompressing: ./data/data_minedgecut/chunk9.nq


In [4]:
for i in range(0,10):
    !java -Xmx32g -jar jrdf2vec.jar -graph "./data/data_minedgecut/chunk{i}.nq" -onlyWalks -walkDirectory "./walks/walks_mindedgecut/walks_minedgecut_{i}" > "./logs/minedgecut_logs/walk_log_minedgecut_{i}.txt"

#### and then the embeddings

In [2]:
# Create one model directory per min-edge-cut partition (0-9).
# os.makedirs with exist_ok=True also creates missing parent folders and
# makes the cell safe to re-run (os.mkdir raises if the directory exists).
for i in range(0, 10):
    os.makedirs("./models/model_minedgecut/model_minedgecut_{0}".format(i), exist_ok=True)

In [21]:
# Train the embedding for ONE min-edge-cut partition, selected by `nr`.
# This is not wrapped in a loop: launching the runs from a loop would have
# them all run at the same time. Presumably `nr` is changed by hand and the
# cell re-run for each partition — confirm.
nr = 9
!java -Xmx32g -jar jrdf2vec.jar -onlyTraining -dimension 100 -walkDirectory "./walks/walks_minedgecut/walks_minedgecut_{nr}" > "./logs/minedgecut_logs/embedding_log_minedgecut_{nr}.txt"

In [22]:
# Move the training artefacts of partition `nr` (every file starting with "m",
# plus vectors.txt) from its walk directory into its model directory.
minedgecut_walk_dir = "./walks/walks_minedgecut/walks_minedgecut_{0}".format(nr)
minedgecut_model_dir = "./models/model_minedgecut/model_minedgecut_{0}/".format(nr)
for artefact in glob.glob(minedgecut_walk_dir + "/m*"):
    shutil.move(artefact, minedgecut_model_dir)
shutil.move(minedgecut_walk_dir + "/vectors.txt", minedgecut_model_dir)

'./models/model_minedgecut/model_minedgecut_9/vectors.txt'

## Alignment

### For the horizontal partitions

In [4]:
# Call absolute_orientation for partitions 1-9, each time together with the
# vectors of partition 0; the third path is where the aligned result is
# presumably written — confirm against src.alignment.
# NOTE(review): these paths have no leading "models" component, unlike the
# combine_aligned_vectors / filter_vector_file calls in this notebook —
# confirm that absolute_orientation resolves them relative to ./models.
horizontal_partition0 = join("model_horizontal", "model_horizontal_0", "vectors.txt")
for part in range(1, 10):
    part_vectors = join("model_horizontal", "model_horizontal_{0}".format(part), "vectors.txt")
    part_aligned = join("model_horizontal", "aligned", "vectors_aligned{0}.txt".format(part))
    absolute_orientation(horizontal_partition0, part_vectors, part_aligned)

In [2]:
# Combine the aligned horizontal partition files with the vectors of partition 0.
# NOTE(review): the meaning of the trailing False flag is defined by
# combine_aligned_vectors in the src helpers — confirm there.
horizontal_aligned_glob = "./models/model_horizontal/aligned/vectors_aligned*.txt"
horizontal_partition0_vectors = "./models/model_horizontal/model_horizontal_0/vectors.txt"
horizontal_combined_path = "./models/model_horizontal/vectors_aligned0.txt"
combine_aligned_vectors(
    horizontal_aligned_glob, horizontal_partition0_vectors, horizontal_combined_path, False
)

In [3]:
# Filter the combined aligned horizontal vectors into a separate "_filtered" file.
horizontal_model_root = join("models", "model_horizontal")
filter_vector_file(
    join(horizontal_model_root, "vectors_aligned0.txt"),
    join(horizontal_model_root, "vectors_aligned0_filtered.txt"),
)

### For the min edge cut partitions

In [3]:
# Call absolute_orientation for partitions 1-9, each time together with the
# vectors of partition 0; the third path is where the aligned result is
# presumably written — confirm against src.alignment.
# NOTE(review): these paths have no leading "models" component, unlike the
# combine_aligned_vectors / filter_vector_file calls in this notebook —
# confirm that absolute_orientation resolves them relative to ./models.
minedgecut_partition0 = join("model_minedgecut", "model_minedgecut_0", "vectors.txt")
for part in range(1, 10):
    part_vectors = join("model_minedgecut", "model_minedgecut_{0}".format(part), "vectors.txt")
    part_aligned = join("model_minedgecut", "aligned", "vectors_aligned{0}.txt".format(part))
    absolute_orientation(minedgecut_partition0, part_vectors, part_aligned)

In [4]:
# Combine the aligned min-edge-cut partition files with the vectors of partition 0.
# NOTE(review): the meaning of the trailing False flag is defined by
# combine_aligned_vectors in the src helpers — confirm there.
minedgecut_aligned_glob = "./models/model_minedgecut/aligned/vectors_aligned*.txt"
minedgecut_partition0_vectors = "./models/model_minedgecut/model_minedgecut_0/vectors.txt"
minedgecut_combined_path = "./models/model_minedgecut/vectors_aligned0.txt"
combine_aligned_vectors(
    minedgecut_aligned_glob, minedgecut_partition0_vectors, minedgecut_combined_path, False
)

In [5]:
# Filter the combined aligned min-edge-cut vectors into a separate "_filtered" file.
minedgecut_model_root = join("models", "model_minedgecut")
filter_vector_file(
    join(minedgecut_model_root, "vectors_aligned0.txt"),
    join(minedgecut_model_root, "vectors_aligned0_filtered.txt"),
)

## Evaluation with GEval
If the flag for getting the txt file compatible with GEval was not set earlier, it's possible to turn the model or vector file into a txt file like this:

In [None]:
# Template command: replace ./path-to-your-model-or-vector-file with the real
# model or vector file before running (the path below is a placeholder).
!java -jar jrdf2vec.jar -generateTextVectorFile ./path-to-your-model-or-vector-file

Method to evaluate the models task by task:

In [2]:
def evaluate(model_path: str = None, task: str = None, vector_size: int = 100):
    """Run a single GEval task against one embedding file.

    Args:
        model_path: Path to the vector file; it is read with
            ``vector_file_format="txt"``.
        task: Name of the task to run, one of Classification, Regression,
            Clustering, EntityRelatedness, DocumentSimilarity or
            SemanticAnalogies.
        vector_size: Dimensionality of the vectors in the file. Defaults to
            100, the value that was previously hard-coded here.

    Raises:
        AssertionError: If ``task`` or ``model_path`` is ``None``.
    """
    # Validate the arguments before the evaluation manager is created.
    assert task is not None, "No task given, please choose out of Classification, Regression, Clustering, EntityRelatedness, DocumentSimilarity or SemanticAnalogies."
    assert model_path is not None, "No path to a model given, please specify one."

    evaluation_manager = FrameworkManager()
    evaluation_manager.evaluate(
        model_path,
        parallel=False,
        tasks=[task],  # restrict the run to the one requested task
        debugging_mode=True,
        vector_file_format="txt",
        vector_size=vector_size
    )

### The embedding of the complete graph

In [2]:
# Evaluate the embedding of the complete graph. No `tasks` argument is
# passed, so GEval's own default task selection applies.
complete_vectors = join("models", "model_complete_nq", "vectors.txt")
geval_options = dict(
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(complete_vectors, **geval_options)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.7461904761904762
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.6164285714285714
Classification model initialized
Classification training...
Classification C45 None accuracy 0.598095238095238
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.497857

### Of the horizontal partitions

In [11]:
# Combine the vector files of all horizontal partitions into one file.
# NOTE(review): the trailing True flag is defined by combine_vectors in the
# src helpers — presumably it controls duplicate handling; confirm there.
horizontal_vectors_glob = "./models/model_horizontal/model_horizontal_*/vectors.txt"
horizontal_vectors_out = "./models/model_horizontal/vectors.txt"
combine_vectors(horizontal_vectors_glob, horizontal_vectors_out, True)

In [5]:
# Count how many lines of the combined horizontal vector file start with an
# entity (the text before the first space) that already appeared earlier.
entities = set()
count = 0

with open("./models/model_horizontal/vectors.txt", "r") as vector_file:
    for row in vector_file:
        entity = row.split(' ', 1)[0]
        if entity in entities:
            count += 1
        else:
            entities.add(entity)

print(count)

3090514


In [23]:
# Combine the horizontal partition vectors again, this time into vectors_nd.txt
# with the flag set to False.
# NOTE(review): the flag's meaning is defined by combine_vectors in the src
# helpers — presumably it controls duplicate handling; confirm there.
horizontal_vectors_glob = "./models/model_horizontal/model_horizontal_*/vectors.txt"
horizontal_nd_out = "./models/model_horizontal/vectors_nd.txt"
combine_vectors(horizontal_vectors_glob, horizontal_nd_out, False)

In [24]:
# Evaluate the combined (unaligned) horizontal partition vectors. No `tasks`
# argument is passed, so GEval's own default task selection applies.
horizontal_nd_vectors = join("models", "model_horizontal", "vectors_nd.txt")
geval_options = dict(
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(horizontal_nd_vectors, **geval_options)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.4830952380952381
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.621904761904762
Classification model initialized
Classification training...
Classification C45 None accuracy 0.5214285714285714
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.497857

### Of the min edge cut partitions

In [13]:
# Combine the vector files of all min-edge-cut partitions into one file.
# NOTE(review): the trailing True flag is defined by combine_vectors in the
# src helpers — presumably it controls duplicate handling; confirm there.
minedgecut_vectors_glob = "./models/model_minedgecut/model_minedgecut_*/vectors.txt"
minedgecut_vectors_out = "./models/model_minedgecut/vectors.txt"
combine_vectors(minedgecut_vectors_glob, minedgecut_vectors_out, True)

In [2]:
# Combine the min-edge-cut partition vectors again, this time into
# vectors_nd.txt with the flag set to False.
# NOTE(review): the flag's meaning is defined by combine_vectors in the src
# helpers — presumably it controls duplicate handling; confirm there.
minedgecut_vectors_glob = "./models/model_minedgecut/model_minedgecut_*/vectors.txt"
minedgecut_nd_out = "./models/model_minedgecut/vectors_nd.txt"
combine_vectors(minedgecut_vectors_glob, minedgecut_nd_out, False)

In [3]:
# Evaluate the combined (unaligned) min-edge-cut partition vectors. No `tasks`
# argument is passed, so GEval's own default task selection applies.
minedgecut_nd_vectors = join("models", "model_minedgecut", "vectors_nd.txt")
geval_options = dict(
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(minedgecut_nd_vectors, **geval_options)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.45404761904761903
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.5788095238095237
Classification model initialized
Classification training...
Classification C45 None accuracy 0.49309523809523803
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.497

### Of the aligned partitions

#### First the horizontal partitions

In [4]:
# Evaluate the aligned and filtered horizontal partition vectors. No `tasks`
# argument is passed, so GEval's own default task selection applies.
horizontal_aligned_vectors = join("models", "model_horizontal", "vectors_aligned0_filtered.txt")
geval_options = dict(
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(horizontal_aligned_vectors, **geval_options)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.4878571428571429
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.6171428571428572
Classification model initialized
Classification training...
Classification C45 None accuracy 0.4738095238095238
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.49785

#### Then the min edge cut partitions

In [6]:
# Evaluate the aligned and filtered min-edge-cut partition vectors. No `tasks`
# argument is passed, so GEval's own default task selection applies.
minedgecut_aligned_vectors = join("models", "model_minedgecut", "vectors_aligned0_filtered.txt")
geval_options = dict(
    parallel=False,
    debugging_mode=True,
    vector_file_format="txt",
    vector_size=100,
)
evaluation_manager = FrameworkManager()
evaluation_manager.evaluate(minedgecut_aligned_vectors, **geval_options)

Start evaluation...
TXT data manager initialized
Created evaluation manager
Classification data manager initialized
Classification task manager initialized.
Classification : Ignored data: 3
Classification : Ignored data: ['http://dbpedia.org/resource/Omuta', 'http://dbpedia.org/resource/Katsuyama', 'http://dbpedia.org/resource/Lome']
Classification model initialized
Classification training...
Classification NB None accuracy 0.46380952380952384
Classification model initialized
Classification training...
Classification KNN K=3 accuracy 0.5883333333333334
Classification model initialized
Classification training...
Classification C45 None accuracy 0.5316666666666667
Classification model initialized
Classification training...
Classification SVM C=0.001 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.01 accuracy 0.4978571428571429
Classification model initialized
Classification training...
Classification SVM C=0.1 accuracy 0.4978