# Run GEval experiments

This notebook runs the GEval Evaluation Framework with DBpedia gold standards on a list of previously computed embedding files (obtained with the notebook `get_vectors_pipeline.ipynb`). The results are analyzed in a separate notebook (`analyze_geval.ipynb`).

### Setup before running this notebook for the first time:

1. Clone the GEval repo (from Pellegrino et al.) to the folder `geval`. 

```
git clone https://github.com/mariaangelapellegrino/Evaluation-Framework.git geval
```

2. Move this notebook, the pickle file `geval_entities.pickle`, and the complete `embeddings` folder to `geval`.

Running all tasks for all 55 variants (11 embedding models × 5 variants each) may take up to 24 hours. Results will be saved in a newly created `comparison.csv` file.


In [1]:
import warnings

# Ignore every FutureWarning raised from here on, presumably to keep the long
# evaluation outputs below free of deprecation notices from third-party libraries.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
from datetime import datetime 
import time
from os import stat
import pandas as pd
from time import sleep
import pickle


# Load the pickled collection of GEval entities. Later in this notebook the first
# whitespace-separated token of every embedding row is looked up in it.
# NOTE(review): the concrete container type (list/set/dict) is not visible here — confirm.
# NOTE(security): pickle.load can execute arbitrary code; only load a
# `geval_entities.pickle` that comes from a trusted source.
with open('geval_entities.pickle', 'rb') as file:
    geval_entities = pickle.load(file)

In [3]:
# Create the output folder for the filtered embedding files.
# Done in Python rather than with the shell `mkdir` so that re-running the
# notebook (Restart & Run All) does not emit an error when the folder already
# exists, and so that the cell behaves the same on every platform.
import os

os.makedirs("geval_embeddings", exist_ok=True)

In [4]:
# Base embedding models: four RDF2vec flavours followed by seven non-RDF2vec models.
embedding_models = [
    'rdf2vec-cbow',
    'rdf2vec-cbow-oa',
    'rdf2vec-sg',
    'rdf2vec-sg-oa',
    'non-rdf2vec-ComplEx',
    'non-rdf2vec-DistMult',
    'non-rdf2vec-RESCAL',
    'non-rdf2vec-RotatE',
    'non-rdf2vec-TransE-L1',
    'non-rdf2vec-TransE-L2',
    'non-rdf2vec-TransR',
]

# Variant suffixes appended to every model name. The number in each suffix is
# later parsed as the vector size of the corresponding embedding file.
model_variants = [
    '-200-original',
    '-200-avgbin',
    '-128-autoencoded',
    '-256-autoencoded',
    '-512-autoencoded',
]

# Every model/variant combination, ordered variant-major: all models for the
# first variant, then all models for the second variant, and so on.
embedding_txt_filenames = [
    f"{model_name}{variant_suffix}"
    for variant_suffix in model_variants
    for model_name in embedding_models
]

# Reduce every embedding file to the rows whose entity (first whitespace-separated
# token) belongs to the GEval entity collection, writing the result to
# geval_embeddings/geval-<name>.txt.

# The membership test runs once per embedding row, so build a hash-based lookup
# once instead of querying the pickled collection directly for every line.
# NOTE(review): assumes geval_entities is an iterable of hashable entity names
# (list/set, or a dict keyed by entity) — confirm against the pickle's producer.
geval_entity_lookup = set(geval_entities)

for embedding_txt_filename in embedding_txt_filenames:
    source_path = f"embeddings/{embedding_txt_filename}.txt"
    target_path = f"geval_embeddings/geval-{embedding_txt_filename}.txt"

    rows_count = 0
    # Stream the source line by line; the unfiltered files are large and do not
    # need to be held in memory as a whole.
    with open(source_path, "r") as source_file, open(target_path, "w") as target_file:
        for line in source_file:
            # Only the first token is needed; blank lines yield no tokens and
            # are skipped instead of raising an IndexError.
            fields = line.split(maxsplit=1)
            if fields and fields[0] in geval_entity_lookup:
                target_file.write(line)
                rows_count += 1

    print(f"geval_embeddings/geval-{embedding_txt_filename}.txt : {rows_count} entities filtered")

geval_embeddings/geval-rdf2vec-cbow-200-original.txt : 21568 entities filtered
geval_embeddings/geval-rdf2vec-cbow-oa-200-original.txt : 21535 entities filtered
geval_embeddings/geval-rdf2vec-sg-200-original.txt : 18204 entities filtered
geval_embeddings/geval-rdf2vec-sg-oa-200-original.txt : 21535 entities filtered
geval_embeddings/geval-non-rdf2vec-ComplEx-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-DistMult-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-RESCAL-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-RotatE-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-TransE-L1-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-TransE-L2-200-original.txt : 21672 entities filtered
geval_embeddings/geval-non-rdf2vec-TransR-200-original.txt : 21672 entities filtered
geval_embeddings/geval-rdf2vec-cbow-200-avgbin.txt : 21568 entities filtered
g

In [5]:
# `evaluation_framework` is presumably the package shipped with the cloned GEval
# repository, which is why the setup notes ask for this notebook to be placed
# inside the `geval` folder — TODO confirm.
from evaluation_framework.manager import FrameworkManager

# Notebook-level manager instance; run_task_for_all_vectors reads it as a global.
evaluation_manager = FrameworkManager()

Start evaluation...


In [6]:
def run_task_for_all_vectors(task_name, embedding_txt_filenames, similarity_metric='manhattan', top_k=10):
    """Run one GEval task on every filtered embedding file and collect run metadata.

    For each name the file ``geval_embeddings/geval-<name>.txt`` is handed to the
    notebook-level ``evaluation_manager.evaluate``. The evaluation results
    themselves are not returned by this function; only bookkeeping about each
    run is collected.

    Args:
        task_name: Task identifier, forwarded as ``tasks=[task_name]``.
        embedding_txt_filenames: Iterable of embedding names without folder,
            ``geval-`` prefix or ``.txt`` extension. The second-to-last
            ``-``-separated token must be the integer vector size
            (e.g. ``rdf2vec-sg-200-original`` -> 200).
        similarity_metric: Forwarded unchanged to ``evaluate``.
        top_k: Forwarded unchanged to ``evaluate``.

    Returns:
        A list with one dict per embedding file, in input order, with the keys
        ``embedding_file``, ``task_name``, ``file_size`` (bytes) and
        ``time_to_run`` (``datetime.timedelta`` spent inside ``evaluate``).

    Raises:
        FileNotFoundError: If a filtered embedding file does not exist.
        ValueError / IndexError: If the vector size cannot be parsed from a name.
    """
    # Local import: the notebook's import cell only brings in `datetime.datetime`.
    from datetime import timedelta

    metadata_json = []
    for embedding_txt_filename in embedding_txt_filenames:
        vector_file_path = f"geval_embeddings/geval-{embedding_txt_filename}.txt"
        file_size_bytes = stat(vector_file_path).st_size
        vector_size = int(embedding_txt_filename.split('-')[-2])
        print(f"Vector file: {embedding_txt_filename}; Dimensions: {vector_size}")

        # Use a monotonic clock: a full run can last many hours, and wall-clock
        # time (datetime.now) may jump due to NTP or DST adjustments.
        started_at = time.perf_counter()
        evaluation_manager.evaluate(
            vector_file_path,
            vector_size=vector_size,
            debugging_mode=False,
            tasks=[task_name],
            similarity_metric=similarity_metric,
            top_k=top_k,
        )
        time_to_run = timedelta(seconds=time.perf_counter() - started_at)

        metadata_json.append({
            "embedding_file": embedding_txt_filename,
            "task_name": task_name,
            "file_size": file_size_bytes,
            "time_to_run": time_to_run,
        })
        # NOTE(review): the reason for this one-second pause between runs is not
        # evident from the notebook; kept unchanged (it is not part of the timing).
        sleep(1)

    return metadata_json


In [7]:
# Document similarity task, run with the function's default similarity metric and top_k.
task_name = 'DocumentSimilarity'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Document similarity finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Document similarity finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Document similarity finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Document similarity finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Document similarity finished
Vector file:

In [8]:
# Per-file size (bytes) and elapsed run time for the DocumentSimilarity run above.
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,DocumentSimilarity,88101211,00:00:27.523149
1,rdf2vec-cbow-oa-200-original,DocumentSimilarity,89030608,00:00:12.765897
2,rdf2vec-sg-200-original,DocumentSimilarity,74793299,00:00:15.068538
3,rdf2vec-sg-oa-200-original,DocumentSimilarity,87883088,00:00:15.838230
4,non-rdf2vec-ComplEx-200-original,DocumentSimilarity,88154366,00:00:15.304203
5,non-rdf2vec-DistMult-200-original,DocumentSimilarity,88088311,00:00:14.346491
6,non-rdf2vec-RESCAL-200-original,DocumentSimilarity,89677247,00:00:15.998940
7,non-rdf2vec-RotatE-200-original,DocumentSimilarity,90695298,00:00:16.275585
8,non-rdf2vec-TransE-L1-200-original,DocumentSimilarity,91713818,00:00:14.589539
9,non-rdf2vec-TransE-L2-200-original,DocumentSimilarity,87218400,00:00:14.387379


In [9]:
# Entity relatedness task; the only task in this notebook that overrides the
# default similarity metric, using cosine instead.
task_name = 'EntityRelatedness'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
    similarity_metric='cosine',
)
metadata_df = pd.DataFrame(data=metadata_json)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Entity Relatedness finished
Vector file: rdf2vec-cb

In [10]:
# Per-file size (bytes) and elapsed run time for the EntityRelatedness run above.
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,EntityRelatedness,88101211,00:00:02.006970
1,rdf2vec-cbow-oa-200-original,EntityRelatedness,89030608,00:00:01.845845
2,rdf2vec-sg-200-original,EntityRelatedness,74793299,00:00:02.054639
3,rdf2vec-sg-oa-200-original,EntityRelatedness,87883088,00:00:01.776899
4,non-rdf2vec-ComplEx-200-original,EntityRelatedness,88154366,00:00:01.810365
5,non-rdf2vec-DistMult-200-original,EntityRelatedness,88088311,00:00:02.325595
6,non-rdf2vec-RESCAL-200-original,EntityRelatedness,89677247,00:00:01.734417
7,non-rdf2vec-RotatE-200-original,EntityRelatedness,90695298,00:00:01.973014
8,non-rdf2vec-TransE-L1-200-original,EntityRelatedness,91713818,00:00:02.011721
9,non-rdf2vec-TransE-L2-200-original,EntityRelatedness,87218400,00:00:01.774146


In [11]:
# Semantic analogies task, run with the function's default similarity metric and top_k.
task_name = 'SemanticAnalogies'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Semantic Analogies finished
Vector file: rdf2vec-cb

In [12]:
# Per-file size (bytes) and elapsed run time for the SemanticAnalogies run above.
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,SemanticAnalogies,88101211,00:00:23.638591
1,rdf2vec-cbow-oa-200-original,SemanticAnalogies,89030608,00:00:22.914469
2,rdf2vec-sg-200-original,SemanticAnalogies,74793299,00:00:01.151807
3,rdf2vec-sg-oa-200-original,SemanticAnalogies,87883088,00:00:23.890708
4,non-rdf2vec-ComplEx-200-original,SemanticAnalogies,88154366,00:00:23.162130
5,non-rdf2vec-DistMult-200-original,SemanticAnalogies,88088311,00:00:24.192060
6,non-rdf2vec-RESCAL-200-original,SemanticAnalogies,89677247,00:00:23.089034
7,non-rdf2vec-RotatE-200-original,SemanticAnalogies,90695298,00:00:24.986332
8,non-rdf2vec-TransE-L1-200-original,SemanticAnalogies,91713818,00:00:24.174404
9,non-rdf2vec-TransE-L2-200-original,SemanticAnalogies,87218400,00:00:23.232815


In [13]:
# Clustering task, run with the function's default similarity metric and top_k.
task_name = 'Clustering'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Clustering finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Clustering finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Clustering finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Clustering finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Clustering finished
Vector file: rdf2vec-cbow-200-avgbin; Dimensions: 200
Clustering finished
Vector file: rdf2vec-cbow-oa-200-avgb

In [14]:
# Per-file size (bytes) and elapsed run time for the Clustering run above.
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,Clustering,88101211,00:01:18.502461
1,rdf2vec-cbow-oa-200-original,Clustering,89030608,00:00:44.059722
2,rdf2vec-sg-200-original,Clustering,74793299,00:00:27.683736
3,rdf2vec-sg-oa-200-original,Clustering,87883088,00:00:39.516318
4,non-rdf2vec-ComplEx-200-original,Clustering,88154366,00:00:32.459968
5,non-rdf2vec-DistMult-200-original,Clustering,88088311,00:00:35.592732
6,non-rdf2vec-RESCAL-200-original,Clustering,89677247,00:00:33.832913
7,non-rdf2vec-RotatE-200-original,Clustering,90695298,00:00:40.219248
8,non-rdf2vec-TransE-L1-200-original,Clustering,91713818,00:00:53.039863
9,non-rdf2vec-TransE-L2-200-original,Clustering,87218400,00:00:40.695957


In [15]:
# Regression task, run with the function's default similarity metric and top_k.
task_name = 'Regression'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Regression finished
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Regression finished
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Regression finished
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Regression finished
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Regression finished
Vector file: rdf2vec-cbow-200-avgbin; Dimensions: 200
Regression finished
Vector file: rdf2vec-cbow-oa-200-avgb

In [16]:
# Per-file size (bytes) and elapsed run time for the Regression run above.
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,Regression,88101211,00:02:21.194312
1,rdf2vec-cbow-oa-200-original,Regression,89030608,00:02:31.134330
2,rdf2vec-sg-200-original,Regression,74793299,00:02:25.145999
3,rdf2vec-sg-oa-200-original,Regression,87883088,00:02:39.107541
4,non-rdf2vec-ComplEx-200-original,Regression,88154366,00:02:48.427155
5,non-rdf2vec-DistMult-200-original,Regression,88088311,00:02:41.679470
6,non-rdf2vec-RESCAL-200-original,Regression,89677247,00:02:57.722374
7,non-rdf2vec-RotatE-200-original,Regression,90695298,00:02:45.466372
8,non-rdf2vec-TransE-L1-200-original,Regression,91713818,00:04:10.687646
9,non-rdf2vec-TransE-L2-200-original,Regression,87218400,00:02:36.573481


In [None]:
# Classification task, run with the function's default similarity metric and top_k.
# Judging by the recorded timings, this is by far the slowest task per file.
task_name = 'Classification'
metadata_json = run_task_for_all_vectors(
    task_name=task_name,
    embedding_txt_filenames=embedding_txt_filenames,
)
metadata_df = pd.DataFrame(data=metadata_json)

Vector file: rdf2vec-cbow-200-original; Dimensions: 200
Classification finished
0:16:21
Vector file: rdf2vec-cbow-oa-200-original; Dimensions: 200
Classification finished
0:16:35
Vector file: rdf2vec-sg-200-original; Dimensions: 200
Classification finished
0:14:00
Vector file: rdf2vec-sg-oa-200-original; Dimensions: 200
Classification finished
0:16:08
Vector file: non-rdf2vec-ComplEx-200-original; Dimensions: 200
Classification finished
0:17:21
Vector file: non-rdf2vec-DistMult-200-original; Dimensions: 200
Classification finished
0:17:18
Vector file: non-rdf2vec-RESCAL-200-original; Dimensions: 200
Classification finished
0:16:43
Vector file: non-rdf2vec-RotatE-200-original; Dimensions: 200
Classification finished
0:17:14
Vector file: non-rdf2vec-TransE-L1-200-original; Dimensions: 200
Classification finished
0:16:56
Vector file: non-rdf2vec-TransE-L2-200-original; Dimensions: 200
Classification finished
0:15:51
Vector file: non-rdf2vec-TransR-200-original; Dimensions: 200
Classificat

In [22]:
# Per-file size (bytes) and elapsed run time for the Classification run above.
metadata_df

Unnamed: 0,embedding_file,task_name,file_size,time_to_run
0,rdf2vec-cbow-200-original,Classification,88101211,00:16:22.440418
1,rdf2vec-cbow-oa-200-original,Classification,89030608,00:16:36.508233
2,rdf2vec-sg-200-original,Classification,74793299,00:14:01.079060
3,rdf2vec-sg-oa-200-original,Classification,87883088,00:16:09.781790
4,non-rdf2vec-ComplEx-200-original,Classification,88154366,00:17:22.456304
5,non-rdf2vec-DistMult-200-original,Classification,88088311,00:17:18.902092
6,non-rdf2vec-RESCAL-200-original,Classification,89677247,00:16:44.570921
7,non-rdf2vec-RotatE-200-original,Classification,90695298,00:17:15.122072
8,non-rdf2vec-TransE-L1-200-original,Classification,91713818,00:16:56.870397
9,non-rdf2vec-TransE-L2-200-original,Classification,87218400,00:15:52.286560
