In [1]:
import warnings

# Suppress every FutureWarning for the rest of the kernel session. This runs in
# the first cell, before the remaining libraries are imported.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from datetime import datetime 
import time
from os import stat
import pandas as pd
from time import sleep
import pickle


# Load the pickled collection of entities that the filtering cell below uses to
# decide which embedding rows to keep.
# NOTE(review): pickle.load can execute arbitrary code while deserializing —
# only load a pickle file that you produced yourself or otherwise trust.
with open('geval_entities.pickle', 'rb') as file:
    geval_entities = pickle.load(file)

In [3]:
# Create the output directory for the filtered embedding files.
# Unlike the shell `mkdir`, this does nothing when the directory already exists,
# so the cell can be re-run safely and behaves the same on every platform.
import os

os.makedirs("geval_embeddings", exist_ok=True)

In [4]:
# Base names of the embedding models to evaluate.
embedding_models = [
    'rdf2vec-cbow',
    'rdf2vec-cbow-oa',
    'rdf2vec-sg',
    'rdf2vec-sg-oa',
    'non-rdf2vec-ComplEx',
    'non-rdf2vec-DistMult',
    'non-rdf2vec-RESCAL',
    'non-rdf2vec-RotatE',
    'non-rdf2vec-TransE-L1',
    'non-rdf2vec-TransE-L2',
    'non-rdf2vec-TransR',
]

# Suffixes of the form '-<dimensions>-<variant>'. The dimension field is parsed
# back out of the file name later, so keep this exact layout.
model_variants = [
    '-200-original',
    '-200-avgbin',
    '-128-autoencoded',
    '-256-autoencoded',
    '-512-autoencoded',
]

# Every model/variant combination, ordered variant-major: all models for the
# first variant, then all models for the second variant, and so on.
embedding_txt_filenames = [em+mv for mv in model_variants for em in embedding_models]

# Membership is tested once per embedding row. When the pickled entities are a
# plain sequence, build a hash-based lookup once instead of scanning it per row.
# Assumes the entries are hashable (they are compared against string tokens).
if isinstance(geval_entities, (list, tuple)):
    entity_lookup = frozenset(geval_entities)
else:
    entity_lookup = geval_entities

for embedding_txt_filename in embedding_txt_filenames:
    source_path = f"embeddings/{embedding_txt_filename}.txt"
    target_path = f"geval_embeddings/geval-{embedding_txt_filename}.txt"

    rows_count = 0
    # Stream the source file line by line rather than loading it whole. The
    # source is opened first, so a missing input raises before any output file
    # is created.
    with open(source_path, "r") as source_file, open(target_path, "w") as target_file:
        for line in source_file:
            # Only the first whitespace-separated token (the entity) is needed,
            # so do not split the vector components. Blank lines yield no
            # fields and are skipped.
            fields = line.split(maxsplit=1)
            if fields and fields[0] in entity_lookup:
                target_file.write(line)
                rows_count += 1

    print(f"geval_embeddings/geval-{embedding_txt_filename}.txt : {rows_count} entities filtered")

geval_embeddings/geval-rdf2vec-cbow-200-original.txt : 21568 entities filtered
geval_embeddings/geval-rdf2vec-cbow-oa-200-original.txt : 21535 entities filtered
geval_embeddings/geval-rdf2vec-sg-200-original.txt : 18204 entities filtered
geval_embeddings/geval-rdf2vec-sg-oa-200-original.txt : 21535 entities filtered
geval_embeddings/geval-non-rdf2vec-ComplEx-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-DistMult-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-RESCAL-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-RotatE-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-TransE-L1-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-TransE-L2-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-TransR-200-original.txt : 21672 entities filtered
geval_embeddings/geval-rdf2vec-cbow-200-avgbin.txt : 21568 entities filtered
g

In [5]:
from evaluation_framework.manager import FrameworkManager

# Single manager instance for the whole notebook; run_task_for_all_vectors reads
# it as a global rather than taking it as a parameter.
# NOTE(review): the recorded cell output shows "Start evaluation..." — presumably
# printed by this constructor; confirm against the evaluation_framework source.
evaluation_manager = FrameworkManager()

Start evaluation...


In [6]:
def run_task_for_all_vectors(task_name, embedding_txt_filenames, similarity_metric='manhattan', top_k=10):
    """Run one evaluation task on every filtered embedding file and time each run.

    For each name in ``embedding_txt_filenames`` the file
    ``geval_embeddings/geval-<name>.txt`` is evaluated through the notebook-level
    ``evaluation_manager``, which must already be defined when this is called.

    Args:
        task_name: Name of the single task to run; passed as ``tasks=[task_name]``.
        embedding_txt_filenames: Iterable of embedding names whose
            second-to-last ``-``-separated field is the vector size
            (e.g. ``rdf2vec-cbow-200-original`` -> 200).
        similarity_metric: Forwarded unchanged to ``evaluation_manager.evaluate``.
        top_k: Forwarded unchanged to ``evaluation_manager.evaluate``.

    Returns:
        A list with one dict per embedding file, with the keys
        ``embedding_file``, ``task_name``, ``file_size`` (``st_size`` of the
        filtered file) and ``time_to_run`` (a ``datetime.timedelta``).

    Raises:
        FileNotFoundError: If a filtered embedding file does not exist.
        IndexError: If a name contains no ``-`` separator.
        ValueError: If the dimension field of a name is not an integer.
    """
    # Local import: the notebook's import cell only brings in `datetime` itself.
    from datetime import timedelta

    metadata_json = []
    for embedding_txt_filename in embedding_txt_filenames:
        embedding_path = f"geval_embeddings/geval-{embedding_txt_filename}.txt"
        file_size_bytes = stat(embedding_path).st_size
        vector_size = int(embedding_txt_filename.split('-')[-2])
        print(f"Vector file: {embedding_txt_filename}; Dimensions: {vector_size}")

        # Measure with a monotonic clock so that system clock adjustments
        # during a long evaluation cannot distort the recorded duration.
        started = time.perf_counter()
        evaluation_manager.evaluate(
            embedding_path,
            vector_size=vector_size,
            debugging_mode=False,
            tasks=[task_name],
            similarity_metric=similarity_metric,
            top_k=top_k,
        )
        elapsed_seconds = time.perf_counter() - started

        metadata_json.append({
            "embedding_file": embedding_txt_filename,
            "task_name": task_name,
            "file_size": file_size_bytes,
            "time_to_run": timedelta(seconds=elapsed_seconds),
        })
        # NOTE(review): the reason for this pause is not visible here —
        # presumably it keeps consecutive evaluation runs from colliding on
        # time-stamped output; confirm before removing. Not included in the
        # measured time.
        sleep(1)

    return metadata_json


In [7]:
# DocumentSimilarity: evaluate every embedding file with the function's default
# similarity metric, then save the per-file size/runtime metadata as CSV.
task_name = 'DocumentSimilarity'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)
metadata_df.to_csv(f'{task_name}_metadata.csv', index=False)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Document similarity finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Document similarity finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Document similarity finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Document similarity finished
Vector file:

In [8]:
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,DocumentSimilarity,88101211,00:00:13.020777
1,rdf2vec-cbow-oa-200-original,DocumentSimilarity,89030608,00:00:11.648480
2,rdf2vec-sg-200-original,DocumentSimilarity,74793299,00:00:11.738162
3,rdf2vec-sg-oa-200-original,DocumentSimilarity,87883088,00:00:11.704351
4,non-rdf2vec-ComplEx-200-original,DocumentSimilarity,88154366,00:00:11.810177
5,non-rdf2vec-DistMult-200-original,DocumentSimilarity,88088311,00:00:11.628429
6,non-rdf2vec-RESCAL-200-original,DocumentSimilarity,89677247,00:00:11.648436
7,non-rdf2vec-RotatE-200-original,DocumentSimilarity,90695298,00:00:11.664130
8,non-rdf2vec-TransE-L1-200-original,DocumentSimilarity,91713818,00:00:12.061363
9,non-rdf2vec-TransE-L2-200-original,DocumentSimilarity,87218400,00:00:11.721442


In [9]:
# EntityRelatedness: the only task in this notebook that overrides the default
# similarity metric (cosine here). Metadata is saved as CSV afterwards.
task_name = 'EntityRelatedness'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
    similarity_metric='cosine',
)
metadata_df = pd.DataFrame(data=metadata_json)
metadata_df.to_csv(f'{task_name}_metadata.csv', index=False)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: rdf2vec-cb

In [10]:
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,EntityRelatedness,88101211,00:00:01.744175
1,rdf2vec-cbow-oa-200-original,EntityRelatedness,89030608,00:00:01.567031
2,rdf2vec-sg-200-original,EntityRelatedness,74793299,00:00:01.366559
3,rdf2vec-sg-oa-200-original,EntityRelatedness,87883088,00:00:01.559937
4,non-rdf2vec-ComplEx-200-original,EntityRelatedness,88154366,00:00:01.624066
5,non-rdf2vec-DistMult-200-original,EntityRelatedness,88088311,00:00:01.596018
6,non-rdf2vec-RESCAL-200-original,EntityRelatedness,89677247,00:00:01.586189
7,non-rdf2vec-RotatE-200-original,EntityRelatedness,90695298,00:00:01.588823
8,non-rdf2vec-TransE-L1-200-original,EntityRelatedness,91713818,00:00:01.634538
9,non-rdf2vec-TransE-L2-200-original,EntityRelatedness,87218400,00:00:01.917518


In [11]:
# SemanticAnalogies: evaluate every embedding file, then save the per-file
# size/runtime metadata as CSV.
# NOTE(review): in the recorded run, rdf2vec-sg-200-original finished in under a
# second while the other files shown took about 21 s — verify that its result is
# complete rather than an early exit.
task_name = 'SemanticAnalogies'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)
metadata_df.to_csv(f'{task_name}_metadata.csv', index=False)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: rdf2vec-cb

In [12]:
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,SemanticAnalogies,88101211,00:00:21.333947
1,rdf2vec-cbow-oa-200-original,SemanticAnalogies,89030608,00:00:20.759903
2,rdf2vec-sg-200-original,SemanticAnalogies,74793299,00:00:00.822558
3,rdf2vec-sg-oa-200-original,SemanticAnalogies,87883088,00:00:20.808519
4,non-rdf2vec-ComplEx-200-original,SemanticAnalogies,88154366,00:00:21.903374
5,non-rdf2vec-DistMult-200-original,SemanticAnalogies,88088311,00:00:22.073671
6,non-rdf2vec-RESCAL-200-original,SemanticAnalogies,89677247,00:00:21.912656
7,non-rdf2vec-RotatE-200-original,SemanticAnalogies,90695298,00:00:21.702357
8,non-rdf2vec-TransE-L1-200-original,SemanticAnalogies,91713818,00:00:21.906336
9,non-rdf2vec-TransE-L2-200-original,SemanticAnalogies,87218400,00:00:22.055592


In [13]:
# Clustering: evaluate every embedding file, then save the per-file
# size/runtime metadata as CSV.
task_name = 'Clustering'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)
metadata_df.to_csv(f'{task_name}_metadata.csv', index=False)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Clustering finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Clustering finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Clustering finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Clustering finished
Vector file: rdf2vec-cbow-200-avgbin; Dimensions: 200
Clustering finished
Vector file: rdf2vec-cbow-oa-200-avgb

In [14]:
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,Clustering,88101211,00:01:08.023447
1,rdf2vec-cbow-oa-200-original,Clustering,89030608,00:00:39.010951
2,rdf2vec-sg-200-original,Clustering,74793299,00:00:25.117615
3,rdf2vec-sg-oa-200-original,Clustering,87883088,00:00:35.823996
4,non-rdf2vec-ComplEx-200-original,Clustering,88154366,00:00:30.530081
5,non-rdf2vec-DistMult-200-original,Clustering,88088311,00:00:30.015616
6,non-rdf2vec-RESCAL-200-original,Clustering,89677247,00:00:30.964916
7,non-rdf2vec-RotatE-200-original,Clustering,90695298,00:00:30.149859
8,non-rdf2vec-TransE-L1-200-original,Clustering,91713818,00:00:29.808124
9,non-rdf2vec-TransE-L2-200-original,Clustering,87218400,00:00:29.628249


In [15]:
# Regression: evaluate every embedding file, then save the per-file
# size/runtime metadata as CSV.
task_name = 'Regression'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)
metadata_df.to_csv(f'{task_name}_metadata.csv', index=False)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Regression finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Regression finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Regression finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Regression finished
Vector file: rdf2vec-cbow-200-avgbin; Dimensions: 200
Regression finished
Vector file: rdf2vec-cbow-oa-200-avgb

In [16]:
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,Regression,88101211,00:02:15.617759
1,rdf2vec-cbow-oa-200-original,Regression,89030608,00:02:24.144746
2,rdf2vec-sg-200-original,Regression,74793299,00:02:09.025446
3,rdf2vec-sg-oa-200-original,Regression,87883088,00:02:21.740381
4,non-rdf2vec-ComplEx-200-original,Regression,88154366,00:02:32.347417
5,non-rdf2vec-DistMult-200-original,Regression,88088311,00:02:34.835933
6,non-rdf2vec-RESCAL-200-original,Regression,89677247,00:02:46.442763
7,non-rdf2vec-RotatE-200-original,Regression,90695298,00:02:29.824746
8,non-rdf2vec-TransE-L1-200-original,Regression,91713818,00:02:23.887291
9,non-rdf2vec-TransE-L2-200-original,Regression,87218400,00:02:24.781230


In [17]:
# Classification: evaluate every embedding file, then save the per-file
# size/runtime metadata as CSV. This is the slowest task in the recorded run
# (roughly 13-16 minutes per file shown).
task_name = 'Classification'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)
metadata_df.to_csv(f'{task_name}_metadata.csv', index=False)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Classification finished
0:15:50
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Classification finished
0:15:54
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Classification finished
0:13:11
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Classification finished
0:15:32
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Classification finished
0:16:14
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Classification finished
0:16:07
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Classification finished
0:15:46
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Classification finished
0:16:13
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Classification finished
0:15:47
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Classification finished
0:14:47
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Classificat

In [18]:
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,Classification,88101211,00:15:51.201508
1,rdf2vec-cbow-oa-200-original,Classification,89030608,00:15:54.788892
2,rdf2vec-sg-200-original,Classification,74793299,00:13:12.096213
3,rdf2vec-sg-oa-200-original,Classification,87883088,00:15:32.784851
4,non-rdf2vec-ComplEx-200-original,Classification,88154366,00:16:15.257581
5,non-rdf2vec-DistMult-200-original,Classification,88088311,00:16:07.862739
6,non-rdf2vec-RESCAL-200-original,Classification,89677247,00:15:47.011247
7,non-rdf2vec-RotatE-200-original,Classification,90695298,00:16:14.271388
8,non-rdf2vec-TransE-L1-200-original,Classification,91713818,00:15:47.893422
9,non-rdf2vec-TransE-L2-200-original,Classification,87218400,00:14:48.120111
