In [1]:
import pandas as pd
from scipy.stats import binomtest

In [2]:
# Load the evaluation results; the file is parsed as space-delimited despite its .csv extension.
df = pd.read_csv("comparison.csv", sep=" ")
# Print column names, non-null counts and dtypes as a quick sanity check of the load.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10261 entries, 0 to 10260
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   test_name            10261 non-null  object 
 1   task_name            10261 non-null  object 
 2   gold_standard_file   10261 non-null  object 
 3   coverage             10261 non-null  float64
 4   model                10261 non-null  object 
 5   model_configuration  10261 non-null  object 
 6   metric               10261 non-null  object 
 7   score_value          10261 non-null  float64
dtypes: float64(2), object(6)
memory usage: 641.4+ KB


In [3]:
def shorten_test_name(test_name):
    """Extract the compact run identifier from a raw ``test_name``.

    The name is split on underscores, the second field is kept, and that
    field's first 17 characters are discarded.

    Args:
        test_name: raw test name string containing at least one underscore.

    Returns:
        The second underscore-separated field without its first 17 characters.

    Raises:
        IndexError: if ``test_name`` contains no underscore.
    """
    # presumably the length of a fixed prefix shared by all names -- TODO confirm
    PREFIX_LENGTH = 17

    fields = test_name.split('_')
    return fields[1][PREFIX_LENGTH:]

def get_embedding_type(test_name_short):
    """Build the display label of the embedding type from a shortened test name.

    The last two dash-separated fields are dropped. If what remains starts with
    ``non-rdf2vec-`` that marker is removed and the rest is returned unchanged.
    Otherwise the remainder is upper-cased and rewritten as a LaTeX label of
    the form ``RDF2vec$_{...}$``.

    Args:
        test_name_short: dash-separated identifier, e.g. the output of
            ``shorten_test_name``.

    Returns:
        The label string for the embedding type.
    """
    non_rdf2vec_marker = "non-rdf2vec-"

    # Everything except the two trailing fields identifies the embedding type.
    type_part = '-'.join(test_name_short.split('-')[:-2])

    if not type_part.startswith(non_rdf2vec_marker):
        # The variant suffix becomes a LaTeX subscript.
        return type_part.upper().replace("RDF2VEC-", "RDF2vec$_{") + "}$"

    return type_part.replace(non_rdf2vec_marker, "")

def get_embedding_variant(test_name_short):
    """Build the variant label from the two trailing fields of a shortened test name.

    The last dash-separated field has ``autoencoded`` abbreviated to ``auto``
    and is joined with the second-to-last field as ``<last>-<second-to-last>``.

    Args:
        test_name_short: dash-separated identifier with at least two fields.

    Returns:
        The variant label, e.g. ``auto-512`` or ``original-200``.

    Raises:
        IndexError: if ``test_name_short`` has fewer than two dash-separated fields.
    """
    fields = test_name_short.split('-')
    second_to_last = fields[-2]
    abbreviated_last = fields[-1].replace("autoencoded", "auto")

    return "-".join([abbreviated_last, second_to_last])

# Derive the helper columns: first the shortened name, then the two labels parsed from it.
df['test_name_short'] = df['test_name'].map(shorten_test_name)
df['embedding_type'] = df['test_name_short'].map(get_embedding_type)
df['embedding_variant'] = df['test_name_short'].map(get_embedding_variant)

# Show the distinct labels so parsing problems are visible at a glance.
print(
    df['embedding_type'].unique(),
    df['embedding_variant'].unique(),
    sep="\n",
)

['RDF2vec$_{CBOW}$' 'RDF2vec$_{CBOW-OA}$' 'RDF2vec$_{SG}$'
 'RDF2vec$_{SG-OA}$' 'ComplEx' 'DistMult' 'RESCAL' 'RotatE' 'TransE-L1'
 'TransE-L2' 'TransR']
['original-200' 'avgbin-200' 'auto-128' 'auto-256' 'auto-512']


In [4]:
# Per-dataset sizes keyed by `gold_standard_file` name.
# The p-value helpers in this notebook look a dataset up here and use the value
# as the number of trials `n` of the binomial test.
# presumably these are the entity counts of each gold standard -- TODO confirm the source.
dataset_lengths = { 
    "AAUP": 933,
    "Cities": 212,
    "Forbes": 1585,
    "MetacriticAlbums": 1592,
    "MetacriticMovies": 2000,
    "cities2000AndCountries_cluster": 4343,
    "citiesAndCountries_cluster": 11179,
    "citiesMoviesAlbumsCompaniesUni_cluster": 6322,
    "teams_cluster": 4206,
    "currency_entities": 58, 
    "city_state_entities": 94,
    "capital_country_entities": 46, 
    "all_capital_country_entities": 232,
}

## Classification

In [5]:
# Classification accuracy rows with at least 50% coverage.
df_clf = (
    df
    .loc[
        lambda rows: rows['task_name'].eq('Classification')
        & rows['metric'].eq('accuracy')
        & rows['coverage'].ge(0.5)
    ]
    .drop(columns=['test_name'])
)

# Rank the scores inside every (embedding run, dataset) group; rank 1 is the highest
# score and ties are broken by row order.
df_clf['rank'] = (
    df_clf
    .groupby(by=['test_name_short', 'gold_standard_file'])['score_value']
    .rank(method='first', ascending=False)
)

# Keep only the top-ranked row of each group, highest scores first within each dataset.
df_clf_bestscores = (
    df_clf
    .loc[df_clf['rank'].eq(1.0)]
    .drop(columns=['rank', 'task_name'])
    .sort_values(by=['gold_standard_file', 'score_value'], ascending=False)
)

df_clf_bestscores['gold_standard_file'].unique()

array(['MetacriticMovies', 'MetacriticAlbums', 'Forbes', 'Cities', 'AAUP'],
      dtype=object)

In [6]:
def calculate_pvalue_clf(gold_standard_file, score_value, embedding_type):
    """One-sided binomial p-value for a variant scoring below its original embedding.

    The variant's accuracy is converted to a success count over the dataset
    length and tested against the accuracy of the ``original-200`` row of the
    same dataset and embedding type in ``df_clf_bestscores``.

    Args:
        gold_standard_file: dataset name; key into ``dataset_lengths`` and
            filter for the baseline row.
        score_value: accuracy of the variant under test.
        embedding_type: embedding type whose original score is the
            null-hypothesis success probability.

    Returns:
        The p-value of ``binomtest`` with ``alternative='less'``.

    Raises:
        IndexError: if no ``original-200`` row exists for the dataset/type pair.
    """
    # NOTE(review): the trial count is the full dataset length and is not scaled
    # by `coverage` -- confirm this is intended.
    trials = dataset_lengths.get(gold_standard_file)
    observed_successes = round(trials * score_value)

    scores = df_clf_bestscores
    is_baseline = (
        (scores["gold_standard_file"] == gold_standard_file)
        & (scores["embedding_type"] == embedding_type)
        & (scores["embedding_variant"] == "original-200")
    )
    baseline_accuracy = scores.loc[is_baseline, "score_value"].iloc[0]

    outcome = binomtest(observed_successes, trials, baseline_accuracy, alternative='less')
    return outcome.pvalue
    
# Every non-original variant is tested against its original embedding;
# the original rows themselves get no p-value.
df_clf_bestscores["pvalue_worse_than_original"] = df_clf_bestscores.apply(
    lambda row: (
        None
        if row["embedding_variant"] == "original-200"
        else calculate_pvalue_clf(
            row["gold_standard_file"],
            row["score_value"],
            row["embedding_type"],
        )
    ),
    axis=1,
)

alpha = 0.05
# True when the one-sided test does not reject at level alpha; rows without a p-value compare as False.
df_clf_bestscores["not_worse_than_original"] = df_clf_bestscores["pvalue_worse_than_original"].ge(alpha)

In [7]:
# Caption of the summary table. The backslash is doubled so that "\alpha"
# reaches LaTeX intact.
caption="""Count of GEval classification datasets in which the best classifier 
of each binary embedding variant did not significantly underperform the original one in accuracy. 
The closer to 5, the less is the performance loss. $\\alpha=0.05$."""

# For each embedding type and variant, count the datasets whose best score was
# not flagged as significantly worse than the original. Summing the boolean
# flag yields that count; original rows are excluded since they have no test.
df_clf_sig_not_worst = pd.pivot_table(
    df_clf_bestscores[df_clf_bestscores["embedding_variant"]!="original-200"],
    values="not_worse_than_original", 
    index=["embedding_type"],
    columns=["embedding_variant"],
    aggfunc="sum",
)

# escape=False keeps the LaTeX markup contained in the embedding type labels.
print(df_clf_sig_not_worst.to_latex(
    escape=False,
    index_names=False,
    label="tab:clf-acc-significantly-not-worse",
    caption=caption,
    columns=[
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
))

# Average count per variant across embedding types.
print(df_clf_sig_not_worst.mean())

df_clf_sig_not_worst

\begin{table}
\centering
\caption{Count of GEval classification datasets in which the best classifier 
of each binary embedding variant did not significantly underperfom the original one in accuracy. 
The closer to 5, the less is the performance loss. $\alpha=0.05$.}
\label{tab:clf-acc-significantly-not-worse}
\begin{tabular}{lrrrr}
\toprule
{} &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
ComplEx             &           0 &         0 &         0 &         1 \\
DistMult            &           1 &         0 &         1 &         1 \\
RDF2vec$_{CBOW-OA}$ &           3 &         2 &         2 &         2 \\
RDF2vec$_{CBOW}$    &           4 &         1 &         4 &         2 \\
RDF2vec$_{SG-OA}$   &           1 &         4 &         3 &         4 \\
RDF2vec$_{SG}$      &           2 &         1 &         2 &         4 \\
RESCAL              &           2 &         3 &         3 &         2 \\
RotatE              &           1 &         0 &         1 &         2 \\
TransE-

embedding_variant,auto-128,auto-256,auto-512,avgbin-200
embedding_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ComplEx,0,0,1,0
DistMult,0,1,1,1
RDF2vec$_{CBOW-OA}$,2,2,2,3
RDF2vec$_{CBOW}$,1,4,2,4
RDF2vec$_{SG-OA}$,4,3,4,1
RDF2vec$_{SG}$,1,2,4,2
RESCAL,3,3,2,2
RotatE,0,1,2,1
TransE-L1,3,4,4,2
TransE-L2,1,1,3,1


In [8]:
def get_latex_table_clf_acc(gold_standard_file):
    """Render the best classification accuracies of one dataset as a LaTeX table.

    Rows are embedding types sorted by their ``original-200`` score (highest
    first); columns are the embedding variants in a fixed order. Values come
    from ``df_clf_bestscores``.

    Args:
        gold_standard_file: dataset name to filter ``df_clf_bestscores`` on.

    Returns:
        The LaTeX source of the table as a string.
    """
    # Labels use dashes instead of underscores.
    label_suffix = gold_standard_file.lower().replace("_", "-")
    # The caption is written into the LaTeX verbatim, so a raw "_" in the
    # dataset name would be an error outside math mode; escape it.
    caption_name = gold_standard_file.replace("_", r"\_")

    accuracy_by_variant = pd.pivot_table(
        df_clf_bestscores[df_clf_bestscores["gold_standard_file"]==gold_standard_file],
        values="score_value",
        index=["embedding_type"],
        columns=["embedding_variant"],
        aggfunc="max",
    ).sort_values(by="original-200", ascending=False)

    # escape=False keeps the LaTeX markup contained in the embedding type labels.
    return accuracy_by_variant.to_latex(
        float_format="%.3f",
        escape=False,
        index_names=False,
        label=f"tab:clf-acc-{label_suffix}",
        caption=f"Accuracy scores for best classifier of each embedding variant in dataset {caption_name}",
        columns=[
            "original-200",
            "avgbin-200",
            "auto-128",
            "auto-256",
            "auto-512",
        ]
    )

In [9]:
gold_standard_file = "MetacriticMovies"

# LaTeX table of the best accuracy per embedding type and variant.
print(get_latex_table_clf_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clf_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Accuracy scores for best classifier of each embedding variant in dataset MetacriticMovies}
\label{tab:clf-acc-metacriticmovies}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         0.754 &       0.712 &     0.695 &     0.709 &     0.729 \\
RDF2vec$_{SG}$      &         0.713 &       0.689 &     0.680 &     0.690 &     0.710 \\
RDF2vec$_{SG-OA}$   &         0.712 &       0.677 &     0.696 &     0.680 &     0.692 \\
TransR              &         0.710 &       0.671 &     0.675 &     0.670 &     0.688 \\
ComplEx             &         0.695 &       0.646 &     0.642 &     0.664 &     0.679 \\
RESCAL              &         0.683 &       0.664 &     0.678 &     0.672 &     0.654 \\
DistMult            &         0.673 &       0.613 &     0.637 &     0.650 &     0.648 \\
TransE-L1           &         0.640 &       0.610 &     0.574 &     0.582 &     0.602 \\
RDF2vec$_{CB

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
10177,MetacriticMovies,0.934,SVM,accuracy,0.72887,non-rdf2vec-TransE-L2-512-autoencoded,False
8557,MetacriticMovies,0.9345,SVM,accuracy,0.711716,non-rdf2vec-TransE-L2-200-avgbin,False
9827,MetacriticMovies,0.9265,SVM,accuracy,0.709502,rdf2vec-sg-512-autoencoded,True
9637,MetacriticMovies,0.934,SVM,accuracy,0.708736,non-rdf2vec-TransE-L2-256-autoencoded,False
8797,MetacriticMovies,0.9335,SVM,accuracy,0.695825,rdf2vec-sg-oa-128-autoencoded,True
9098,MetacriticMovies,0.934,SVM,accuracy,0.695497,non-rdf2vec-TransE-L2-128-autoencoded,False
9877,MetacriticMovies,0.9335,SVM,accuracy,0.692249,rdf2vec-sg-oa-512-autoencoded,False
9287,MetacriticMovies,0.9265,SVM,accuracy,0.689638,rdf2vec-sg-256-autoencoded,False
8206,MetacriticMovies,0.927,SVM,accuracy,0.689371,rdf2vec-sg-200-avgbin,False
10227,MetacriticMovies,0.934,SVM,accuracy,0.687536,non-rdf2vec-TransR-512-autoencoded,False


In [10]:
gold_standard_file = "MetacriticAlbums"

# LaTeX table of the best accuracy per embedding type and variant.
print(get_latex_table_clf_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clf_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Accuracy scores for best classifier of each embedding variant in dataset MetacriticAlbums}
\label{tab:clf-acc-metacriticalbums}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         0.660 &       0.635 &     0.634 &     0.630 &     0.646 \\
DistMult            &         0.630 &       0.538 &     0.571 &     0.588 &     0.584 \\
ComplEx             &         0.628 &       0.548 &     0.600 &     0.599 &     0.606 \\
TransE-L1           &         0.622 &       0.596 &     0.610 &     0.614 &     0.604 \\
RESCAL              &         0.620 &       0.561 &     0.578 &     0.588 &     0.586 \\
TransR              &         0.616 &       0.537 &     0.554 &     0.544 &     0.558 \\
RDF2vec$_{SG}$      &         0.582 &       0.583 &     0.582 &     0.583 &     0.582 \\
RDF2vec$_{SG-OA}$   &         0.581 &       0.580 &     0.575 &     0.582 &     0.578 \\
RotatE      

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
10187,MetacriticAlbums,0.9475,SVM,accuracy,0.645521,non-rdf2vec-TransE-L2-512-autoencoded,True
8567,MetacriticAlbums,0.948125,SVM,accuracy,0.634736,non-rdf2vec-TransE-L2-200-avgbin,False
9107,MetacriticAlbums,0.9475,SVM,accuracy,0.634421,non-rdf2vec-TransE-L2-128-autoencoded,False
9647,MetacriticAlbums,0.9475,SVM,accuracy,0.630344,non-rdf2vec-TransE-L2-256-autoencoded,False
9591,MetacriticAlbums,0.9475,NB,accuracy,0.613797,non-rdf2vec-TransE-L1-256-autoencoded,True
9051,MetacriticAlbums,0.9475,NB,accuracy,0.610102,non-rdf2vec-TransE-L1-128-autoencoded,True
9931,MetacriticAlbums,0.9475,NB,accuracy,0.606311,non-rdf2vec-ComplEx-512-autoencoded,False
10131,MetacriticAlbums,0.9475,NB,accuracy,0.604418,non-rdf2vec-TransE-L1-512-autoencoded,True
8856,MetacriticAlbums,0.9475,SVM,accuracy,0.599649,non-rdf2vec-ComplEx-128-autoencoded,False
9397,MetacriticAlbums,0.9475,SVM,accuracy,0.599402,non-rdf2vec-ComplEx-256-autoencoded,False


In [11]:
gold_standard_file = "Forbes"

# LaTeX table of the best accuracy per embedding type and variant.
print(get_latex_table_clf_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clf_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Accuracy scores for best classifier of each embedding variant in dataset Forbes}
\label{tab:clf-acc-forbes}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
RDF2vec$_{SG}$      &         0.625 &       0.599 &     0.592 &     0.614 &     0.629 \\
TransE-L2           &         0.603 &       0.593 &     0.587 &     0.584 &     0.592 \\
RDF2vec$_{SG-OA}$   &         0.595 &       0.570 &     0.602 &     0.585 &     0.613 \\
RESCAL              &         0.594 &       0.546 &     0.566 &     0.563 &     0.573 \\
RDF2vec$_{CBOW-OA}$ &         0.591 &       0.555 &     0.573 &     0.551 &     0.568 \\
ComplEx             &         0.569 &       0.541 &     0.536 &     0.538 &     0.545 \\
TransR              &         0.568 &       0.509 &     0.549 &     0.531 &     0.536 \\
RDF2vec$_{CBOW}$    &         0.565 &       0.551 &     0.541 &     0.562 &     0.554 \\
TransE-L1           &         0.

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
9857,Forbes,0.71041,SVM,accuracy,0.629326,rdf2vec-sg-512-autoencoded,True
9317,Forbes,0.71041,SVM,accuracy,0.613694,rdf2vec-sg-256-autoencoded,True
9907,Forbes,0.838486,SVM,accuracy,0.612719,rdf2vec-sg-oa-512-autoencoded,True
8827,Forbes,0.838486,SVM,accuracy,0.602035,rdf2vec-sg-oa-128-autoencoded,True
8237,Forbes,0.716719,SVM,accuracy,0.59935,rdf2vec-sg-200-avgbin,False
8587,Forbes,0.856151,SVM,accuracy,0.593231,non-rdf2vec-TransE-L2-200-avgbin,True
10206,Forbes,0.849842,SVM,accuracy,0.592337,non-rdf2vec-TransE-L2-512-autoencoded,True
8776,Forbes,0.71041,SVM,accuracy,0.59176,rdf2vec-sg-128-autoencoded,False
9127,Forbes,0.849842,SVM,accuracy,0.58714,non-rdf2vec-TransE-L2-128-autoencoded,True
9367,Forbes,0.838486,SVM,accuracy,0.585103,rdf2vec-sg-oa-256-autoencoded,True


In [12]:
gold_standard_file = "Cities"

# LaTeX table of the best accuracy per embedding type and variant.
print(get_latex_table_clf_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clf_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Accuracy scores for best classifier of each embedding variant in dataset Cities}
\label{tab:clf-acc-cities}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         0.806 &       0.751 &     0.747 &     0.757 &     0.760 \\
RDF2vec$_{SG-OA}$   &         0.785 &       0.717 &     0.754 &     0.759 &     0.757 \\
TransR              &         0.748 &       0.728 &     0.705 &     0.699 &     0.725 \\
RESCAL              &         0.745 &       0.732 &     0.699 &     0.707 &     0.740 \\
ComplEx             &         0.735 &       0.577 &     0.546 &     0.568 &     0.634 \\
RDF2vec$_{CBOW-OA}$ &         0.716 &       0.678 &     0.635 &     0.699 &     0.685 \\
RDF2vec$_{CBOW}$    &         0.703 &       0.686 &     0.585 &     0.668 &     0.641 \\
TransE-L1           &         0.679 &       0.611 &     0.654 &     0.631 &     0.635 \\
DistMult            &         0.

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
10167,Cities,0.933962,SVM,accuracy,0.759553,non-rdf2vec-TransE-L2-512-autoencoded,True
9328,Cities,0.933962,SVM,accuracy,0.759026,rdf2vec-sg-oa-256-autoencoded,True
9868,Cities,0.933962,SVM,accuracy,0.757395,rdf2vec-sg-oa-512-autoencoded,True
9627,Cities,0.933962,SVM,accuracy,0.757053,non-rdf2vec-TransE-L2-256-autoencoded,False
8782,Cities,0.933962,KNN,accuracy,0.754421,rdf2vec-sg-oa-128-autoencoded,True
8547,Cities,0.981132,SVM,accuracy,0.751238,non-rdf2vec-TransE-L2-200-avgbin,False
9089,Cities,0.933962,SVM,accuracy,0.747158,non-rdf2vec-TransE-L2-128-autoencoded,False
10018,Cities,0.933962,SVM,accuracy,0.740421,non-rdf2vec-RESCAL-512-autoencoded,True
8398,Cities,0.981132,SVM,accuracy,0.732286,non-rdf2vec-RESCAL-200-avgbin,True
8597,Cities,0.981132,SVM,accuracy,0.727881,non-rdf2vec-TransR-200-avgbin,True


In [13]:
gold_standard_file = "AAUP"

# LaTeX table of the best accuracy per embedding type and variant.
print(get_latex_table_clf_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clf_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Accuracy scores for best classifier of each embedding variant in dataset AAUP}
\label{tab:clf-acc-aaup}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
RDF2vec$_{SG-OA}$   &         0.705 &       0.666 &     0.650 &     0.664 &     0.685 \\
RDF2vec$_{CBOW-OA}$ &         0.688 &       0.638 &     0.612 &     0.637 &     0.641 \\
RDF2vec$_{SG}$      &         0.687 &       0.670 &     0.632 &     0.638 &     0.681 \\
TransE-L2           &         0.658 &       0.626 &     0.611 &     0.618 &     0.628 \\
RESCAL              &         0.643 &       0.639 &     0.630 &     0.631 &     0.627 \\
RDF2vec$_{CBOW}$    &         0.630 &       0.597 &     0.562 &     0.571 &     0.589 \\
TransE-L1           &         0.630 &       0.609 &     0.606 &     0.616 &     0.618 \\
TransR              &         0.629 &       0.582 &     0.597 &     0.589 &     0.594 \\
DistMult            &         0.628 

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
9897,AAUP,0.982292,SVM,accuracy,0.684823,rdf2vec-sg-oa-512-autoencoded,True
9848,AAUP,0.616667,SVM,accuracy,0.680915,rdf2vec-sg-512-autoencoded,True
8228,AAUP,0.616667,SVM,accuracy,0.669774,rdf2vec-sg-200-avgbin,True
8277,AAUP,0.982292,SVM,accuracy,0.666479,rdf2vec-sg-oa-200-avgbin,False
9357,AAUP,0.982292,SVM,accuracy,0.664151,rdf2vec-sg-oa-256-autoencoded,False
8817,AAUP,0.982292,SVM,accuracy,0.650371,rdf2vec-sg-oa-128-autoencoded,False
9807,AAUP,0.982292,SVM,accuracy,0.641265,rdf2vec-cbow-oa-512-autoencoded,False
8427,AAUP,0.985417,SVM,accuracy,0.639342,non-rdf2vec-RESCAL-200-avgbin,True
9309,AAUP,0.616667,SVM,accuracy,0.638359,rdf2vec-sg-256-autoencoded,False
8188,AAUP,0.982292,SVM,accuracy,0.638174,rdf2vec-cbow-oa-200-avgbin,False


## Clustering

In [14]:
# Clustering accuracy rows with at least 50% coverage.
df_clt = (
    df
    .loc[
        lambda rows: rows['task_name'].eq('Clustering')
        & rows['metric'].eq('clustering_accuracy')
        & rows['coverage'].ge(0.5)
    ]
    .drop(columns=['test_name'])
)

# Rank the scores inside every (embedding run, dataset) group; rank 1 is the highest
# score and ties are broken by row order.
df_clt['rank'] = (
    df_clt
    .groupby(by=['test_name_short', 'gold_standard_file'])['score_value']
    .rank(method='first', ascending=False)
)

# Keep only the top-ranked row of each group, highest scores first within each dataset.
df_clt_bestscores = (
    df_clt
    .loc[df_clt['rank'].eq(1.0)]
    .drop(columns=['rank', 'task_name'])
    .sort_values(by=['gold_standard_file', 'score_value'], ascending=False)
)

df_clt_bestscores['gold_standard_file'].unique()

array(['teams_cluster', 'citiesMoviesAlbumsCompaniesUni_cluster',
       'citiesAndCountries_cluster', 'cities2000AndCountries_cluster'],
      dtype=object)

In [15]:
def calculate_pvalue(gold_standard_file, score_value, embedding_type):
    """One-sided binomial p-value for a clustering variant scoring below its original embedding.

    The variant's clustering accuracy is converted to a success count over the
    dataset length and tested against the score of the ``original-200`` row of
    the same dataset and embedding type in ``df_clt_bestscores``.

    Args:
        gold_standard_file: dataset name; key into ``dataset_lengths`` and
            filter for the baseline row.
        score_value: clustering accuracy of the variant under test.
        embedding_type: embedding type whose original score is the
            null-hypothesis success probability.

    Returns:
        The p-value of ``binomtest`` with ``alternative='less'``.

    Raises:
        IndexError: if no ``original-200`` row exists for the dataset/type pair.
    """
    # NOTE(review): the trial count is the full dataset length and is not scaled
    # by `coverage` -- confirm this is intended.
    trials = dataset_lengths.get(gold_standard_file)
    observed_successes = round(trials * score_value)

    scores = df_clt_bestscores
    is_baseline = (
        (scores["gold_standard_file"] == gold_standard_file)
        & (scores["embedding_type"] == embedding_type)
        & (scores["embedding_variant"] == "original-200")
    )
    baseline_accuracy = scores.loc[is_baseline, "score_value"].iloc[0]

    outcome = binomtest(observed_successes, trials, baseline_accuracy, alternative='less')
    return outcome.pvalue
    
# Every non-original variant is tested against its original embedding;
# the original rows themselves get no p-value.
df_clt_bestscores["pvalue_worse_than_original"] = df_clt_bestscores.apply(
    lambda row: (
        None
        if row["embedding_variant"] == "original-200"
        else calculate_pvalue(
            row["gold_standard_file"],
            row["score_value"],
            row["embedding_type"],
        )
    ),
    axis=1,
)

alpha = 0.05
# True when the one-sided test does not reject at level alpha; rows without a p-value compare as False.
df_clt_bestscores["not_worse_than_original"] = df_clt_bestscores["pvalue_worse_than_original"].ge(alpha)

In [16]:
# Caption of the summary table. The backslash is doubled so that "\alpha"
# reaches LaTeX intact.
caption="""Count of GEval clustering datasets in which the best clusterer of each binary
embedding variant did not significantly underperform the original one in clustering accuracy. 
The closer to 4, the less is the performance loss. $\\alpha=0.05$."""

# For each embedding type and variant, count the datasets whose best score was
# not flagged as significantly worse than the original. Summing the boolean
# flag yields that count; original rows are excluded since they have no test.
df_clt_sig_not_worst = pd.pivot_table(
    df_clt_bestscores[df_clt_bestscores["embedding_variant"]!="original-200"],
    values="not_worse_than_original", 
    index=["embedding_type"],
    columns=["embedding_variant"],
    aggfunc="sum",
)

# escape=False keeps the LaTeX markup contained in the embedding type labels.
print(df_clt_sig_not_worst.to_latex(
    escape=False,
    index_names=False,
    label="tab:clt-acc-significantly-not-worse",
    caption=caption,
    columns=[
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
))

# Average count per variant across embedding types.
print(df_clt_sig_not_worst.mean())

df_clt_sig_not_worst

\begin{table}
\centering
\caption{Count of GEval clustering datasets in which the best clusterer of each binary
embedding variant did not significantly underperfom the original one in clustering accuracy. 
The closer to 4, the less is the performance loss. $\alpha=0.05$.}
\label{tab:clt-acc-significantly-not-worse}
\begin{tabular}{lrrrr}
\toprule
{} &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
ComplEx             &           2 &         1 &         1 &         2 \\
DistMult            &           1 &         1 &         2 &         3 \\
RDF2vec$_{CBOW-OA}$ &           4 &         4 &         4 &         4 \\
RDF2vec$_{CBOW}$    &           2 &         2 &         2 &         2 \\
RDF2vec$_{SG-OA}$   &           2 &         2 &         4 &         3 \\
RDF2vec$_{SG}$      &           2 &         3 &         4 &         4 \\
RESCAL              &           1 &         0 &         1 &         1 \\
RotatE              &           2 &         0 &         2 &         1 \\
Tr

embedding_variant,auto-128,auto-256,auto-512,avgbin-200
embedding_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ComplEx,1,1,2,2
DistMult,1,2,3,1
RDF2vec$_{CBOW-OA}$,4,4,4,4
RDF2vec$_{CBOW}$,2,2,2,2
RDF2vec$_{SG-OA}$,2,4,3,2
RDF2vec$_{SG}$,3,4,4,2
RESCAL,0,1,1,1
RotatE,0,2,1,2
TransE-L1,1,1,2,2
TransE-L2,1,3,1,3


In [17]:
def get_latex_table_clt_acc(gold_standard_file):
    """Render the best clustering accuracies of one dataset as a LaTeX table.

    Rows are embedding types sorted by their ``original-200`` score (highest
    first); columns are the embedding variants in a fixed order. Values come
    from ``df_clt_bestscores``.

    Args:
        gold_standard_file: dataset name to filter ``df_clt_bestscores`` on.

    Returns:
        The LaTeX source of the table as a string.
    """
    # Labels use dashes instead of underscores.
    label_suffix = gold_standard_file.lower().replace("_", "-")
    # The caption is written into the LaTeX verbatim, and the clustering dataset
    # names contain underscores (e.g. "teams_cluster"). A raw "_" outside math
    # mode is a LaTeX error, so escape it.
    caption_name = gold_standard_file.replace("_", r"\_")

    accuracy_by_variant = pd.pivot_table(
        df_clt_bestscores[df_clt_bestscores["gold_standard_file"]==gold_standard_file],
        values="score_value",
        index=["embedding_type"],
        columns=["embedding_variant"],
        aggfunc="max",
    ).sort_values(by="original-200", ascending=False)

    # escape=False keeps the LaTeX markup contained in the embedding type labels.
    return accuracy_by_variant.to_latex(
        float_format="%.3f",
        escape=False,
        index_names=False,
        label=f"tab:clt-acc-{label_suffix}",
        caption=f"Clustering accuracy scores for best clusterer of each embedding variant in dataset {caption_name}",
        columns=[
            "original-200",
            "avgbin-200",
            "auto-128",
            "auto-256",
            "auto-512",
        ]
    )

In [18]:
gold_standard_file = "teams_cluster"

# LaTeX table of the best clustering accuracy per embedding type and variant.
print(get_latex_table_clt_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clt_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Clustering accuracy scores for best clusterer of each embedding variant in dataset teams_cluster}
\label{tab:clt-acc-teams-cluster}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
RDF2vec$_{CBOW}$    &         0.942 &       0.610 &     0.830 &     0.860 &     0.754 \\
ComplEx             &         0.942 &       0.941 &     0.939 &     0.942 &     0.942 \\
DistMult            &         0.942 &       0.830 &     0.940 &     0.942 &     0.938 \\
RotatE              &         0.942 &       0.920 &     0.833 &     0.941 &     0.844 \\
TransE-L2           &         0.942 &       0.942 &     0.942 &     0.942 &     0.942 \\
TransR              &         0.942 &       0.941 &     0.937 &     0.940 &     0.941 \\
RDF2vec$_{CBOW-OA}$ &         0.941 &       0.941 &     0.940 &     0.941 &     0.941 \\
RDF2vec$_{SG-OA}$   &         0.941 &       0.941 &     0.941 &     0.941 &     0.941 \\
TransE-L

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
4833,teams_cluster,0.944841,Agglomerative clustering,clustering_accuracy,0.94175,non-rdf2vec-ComplEx-256-autoencoded,True
4945,teams_cluster,0.944841,Agglomerative clustering,clustering_accuracy,0.94175,non-rdf2vec-DistMult-256-autoencoded,True
4161,teams_cluster,0.944841,Agglomerative clustering,clustering_accuracy,0.941512,non-rdf2vec-TransE-L2-128-autoencoded,True
6625,teams_cluster,0.944841,Agglomerative clustering,clustering_accuracy,0.941512,non-rdf2vec-TransE-L2-512-autoencoded,True
2929,teams_cluster,0.944841,Agglomerative clustering,clustering_accuracy,0.941512,non-rdf2vec-TransE-L2-200-avgbin,True
4049,teams_cluster,0.944841,Agglomerative clustering,clustering_accuracy,0.941512,non-rdf2vec-TransE-L1-128-autoencoded,True
5393,teams_cluster,0.944841,Agglomerative clustering,clustering_accuracy,0.941512,non-rdf2vec-TransE-L2-256-autoencoded,True
6065,teams_cluster,0.944841,Agglomerative clustering,clustering_accuracy,0.941512,non-rdf2vec-ComplEx-512-autoencoded,True
4721,teams_cluster,0.943176,Agglomerative clustering,clustering_accuracy,0.941274,rdf2vec-sg-oa-256-autoencoded,True
3489,teams_cluster,0.943176,Agglomerative clustering,clustering_accuracy,0.941274,rdf2vec-sg-oa-128-autoencoded,True


In [19]:
gold_standard_file = "citiesMoviesAlbumsCompaniesUni_cluster"

# LaTeX table of the best clustering accuracy per embedding type and variant.
print(get_latex_table_clt_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clt_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Clustering accuracy scores for best clusterer of each embedding variant in dataset citiesMoviesAlbumsCompaniesUni_cluster}
\label{tab:clt-acc-citiesmoviesalbumscompaniesuni-cluster}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         0.906 &       0.907 &     0.898 &     0.900 &     0.898 \\
TransE-L1           &         0.901 &       0.885 &     0.887 &     0.884 &     0.888 \\
RESCAL              &         0.894 &       0.871 &     0.863 &     0.867 &     0.852 \\
DistMult            &         0.861 &       0.862 &     0.848 &     0.877 &     0.881 \\
ComplEx             &         0.859 &       0.860 &     0.736 &     0.808 &     0.853 \\
RDF2vec$_{SG-OA}$   &         0.854 &       0.819 &     0.740 &     0.891 &     0.811 \\
RotatE              &         0.762 &       0.718 &     0.685 &     0.577 &     0.647 \\
RDF2vec$_{SG}$      &         0.749 &       0.7

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
2908,citiesMoviesAlbumsCompaniesUni_cluster,0.928268,Ward hierarchical clustering,clustering_accuracy,0.906874,non-rdf2vec-TransE-L2-200-avgbin,True
5372,citiesMoviesAlbumsCompaniesUni_cluster,0.924335,Ward hierarchical clustering,clustering_accuracy,0.900267,non-rdf2vec-TransE-L2-256-autoencoded,True
4126,citiesMoviesAlbumsCompaniesUni_cluster,0.924335,KMeans,clustering_accuracy,0.898065,non-rdf2vec-TransE-L2-128-autoencoded,False
6604,citiesMoviesAlbumsCompaniesUni_cluster,0.924335,Ward hierarchical clustering,clustering_accuracy,0.897593,non-rdf2vec-TransE-L2-512-autoencoded,False
4686,citiesMoviesAlbumsCompaniesUni_cluster,0.919773,KMeans,clustering_accuracy,0.891458,rdf2vec-sg-oa-256-autoencoded,True
6478,citiesMoviesAlbumsCompaniesUni_cluster,0.924335,KMeans,clustering_accuracy,0.88784,non-rdf2vec-TransE-L1-512-autoencoded,False
4014,citiesMoviesAlbumsCompaniesUni_cluster,0.924335,KMeans,clustering_accuracy,0.886582,non-rdf2vec-TransE-L1-128-autoencoded,False
2782,citiesMoviesAlbumsCompaniesUni_cluster,0.928268,KMeans,clustering_accuracy,0.884537,non-rdf2vec-TransE-L1-200-avgbin,False
5246,citiesMoviesAlbumsCompaniesUni_cluster,0.924335,KMeans,clustering_accuracy,0.884065,non-rdf2vec-TransE-L1-256-autoencoded,False
6142,citiesMoviesAlbumsCompaniesUni_cluster,0.924335,KMeans,clustering_accuracy,0.880919,non-rdf2vec-DistMult-512-autoencoded,True


In [20]:
gold_standard_file = "citiesAndCountries_cluster"

# LaTeX table of the best clustering accuracy per embedding type and variant.
print(get_latex_table_clt_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clt_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Clustering accuracy scores for best clusterer of each embedding variant in dataset citiesAndCountries_cluster}
\label{tab:clt-acc-citiesandcountries-cluster}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         0.939 &       0.936 &     0.925 &     0.930 &     0.931 \\
TransE-L1           &         0.930 &       0.925 &     0.919 &     0.921 &     0.894 \\
RESCAL              &         0.928 &       0.867 &     0.786 &     0.854 &     0.885 \\
TransR              &         0.917 &       0.907 &     0.878 &     0.905 &     0.909 \\
ComplEx             &         0.909 &       0.874 &     0.787 &     0.787 &     0.787 \\
DistMult            &         0.896 &       0.783 &     0.787 &     0.785 &     0.784 \\
RotatE              &         0.787 &       0.782 &     0.780 &     0.781 &     0.782 \\
RDF2vec$_{CBOW-OA}$ &         0.785 &       0.935 &     0.948 &     0.9

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
4406,citiesAndCountries_cluster,0.972277,KMeans,clustering_accuracy,0.952066,rdf2vec-cbow-oa-256-autoencoded,True
3174,citiesAndCountries_cluster,0.972277,KMeans,clustering_accuracy,0.948041,rdf2vec-cbow-oa-128-autoencoded,True
2852,citiesAndCountries_cluster,0.976212,Ward hierarchical clustering,clustering_accuracy,0.935521,non-rdf2vec-TransE-L2-200-avgbin,True
1942,citiesAndCountries_cluster,0.972277,KMeans,clustering_accuracy,0.934538,rdf2vec-cbow-oa-200-avgbin,True
6534,citiesAndCountries_cluster,0.976212,KMeans,clustering_accuracy,0.93096,non-rdf2vec-TransE-L2-512-autoencoded,False
5316,citiesAndCountries_cluster,0.976212,Ward hierarchical clustering,clustering_accuracy,0.930424,non-rdf2vec-TransE-L2-256-autoencoded,False
4070,citiesAndCountries_cluster,0.976212,KMeans,clustering_accuracy,0.924879,non-rdf2vec-TransE-L2-128-autoencoded,False
2726,citiesAndCountries_cluster,0.976212,KMeans,clustering_accuracy,0.924522,non-rdf2vec-TransE-L1-200-avgbin,False
5190,citiesAndCountries_cluster,0.976212,KMeans,clustering_accuracy,0.920944,non-rdf2vec-TransE-L1-256-autoencoded,False
3972,citiesAndCountries_cluster,0.976212,Ward hierarchical clustering,clustering_accuracy,0.919335,non-rdf2vec-TransE-L1-128-autoencoded,False


In [21]:
gold_standard_file = "cities2000AndCountries_cluster"

# LaTeX table of the best clustering accuracy per embedding type and variant.
print(get_latex_table_clt_acc(gold_standard_file))

# Best-score rows of the non-original variants for this dataset.
(
    df_clt_bestscores
    .loc[
        lambda scores: scores["gold_standard_file"].eq(gold_standard_file)
        & scores["embedding_variant"].ne("original-200")
    ]
    .drop(columns=[
        "model_configuration",
        "embedding_type",
        "embedding_variant",
        "pvalue_worse_than_original",
    ])
)

\begin{table}
\centering
\caption{Clustering accuracy scores for best clusterer of each embedding variant in dataset cities2000AndCountries_cluster}
\label{tab:clt-acc-cities2000andcountries-cluster}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         0.940 &       0.894 &     0.932 &     0.934 &     0.934 \\
RESCAL              &         0.933 &       0.928 &     0.921 &     0.930 &     0.908 \\
TransE-L1           &         0.932 &       0.927 &     0.919 &     0.925 &     0.929 \\
TransR              &         0.921 &       0.916 &     0.915 &     0.898 &     0.904 \\
RDF2vec$_{SG-OA}$   &         0.900 &       0.922 &     0.728 &     0.916 &     0.921 \\
ComplEx             &         0.897 &       0.861 &     0.835 &     0.851 &     0.863 \\
RDF2vec$_{CBOW-OA}$ &         0.894 &       0.928 &     0.937 &     0.929 &     0.938 \\
DistMult            &         0.868 &       0.841 &     0.832 &

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
5666,cities2000AndCountries_cluster,0.960175,KMeans,clustering_accuracy,0.937615,rdf2vec-cbow-oa-512-autoencoded,True
3202,cities2000AndCountries_cluster,0.960175,KMeans,clustering_accuracy,0.936924,rdf2vec-cbow-oa-128-autoencoded,True
5330,cities2000AndCountries_cluster,0.964779,KMeans,clustering_accuracy,0.934392,non-rdf2vec-TransE-L2-256-autoencoded,True
6562,cities2000AndCountries_cluster,0.964779,KMeans,clustering_accuracy,0.933932,non-rdf2vec-TransE-L2-512-autoencoded,False
4098,cities2000AndCountries_cluster,0.964779,KMeans,clustering_accuracy,0.93209,non-rdf2vec-TransE-L2-128-autoencoded,False
4994,cities2000AndCountries_cluster,0.964779,KMeans,clustering_accuracy,0.930479,non-rdf2vec-RESCAL-256-autoencoded,True
6450,cities2000AndCountries_cluster,0.964779,KMeans,clustering_accuracy,0.929328,non-rdf2vec-TransE-L1-512-autoencoded,True
4434,cities2000AndCountries_cluster,0.960175,KMeans,clustering_accuracy,0.928867,rdf2vec-cbow-oa-256-autoencoded,True
1970,cities2000AndCountries_cluster,0.960175,KMeans,clustering_accuracy,0.928407,rdf2vec-cbow-oa-200-avgbin,True
2530,cities2000AndCountries_cluster,0.964779,KMeans,clustering_accuracy,0.928177,non-rdf2vec-RESCAL-200-avgbin,True


## Regression

In [22]:
reg_dataset_folder = "evaluation_framework/Regression/data"
reg_gold_standard_files = ['AAUP', 'Cities', 'Forbes', 'MetacriticAlbums', 'MetacriticMovies']
reg_gold_standard_stats = []

# Summarise the "rating" column of every regression gold-standard file.
for reg_gold_standard_file in reg_gold_standard_files:
    stats = pd.read_csv(
        f"{reg_dataset_folder}/{reg_gold_standard_file}.tsv", sep='\t'
    )["rating"].describe().to_dict()
    # Add two spread measures derived from describe(), then tag the record
    # with its dataset name so it can be looked up later.
    stats.update({
        "iqr": stats["75%"] - stats["25%"],
        "range": stats["max"] - stats["min"],
        "gold_standard_file": reg_gold_standard_file,
    })
    reg_gold_standard_stats.append(stats)

pd.DataFrame(reg_gold_standard_stats)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,iqr,range,gold_standard_file
0,960.0,421.596875,94.256483,232.0,350.0,409.0,478.0,866.0,128.0,634.0,AAUP
1,212.0,76.021226,22.316514,23.0,57.875,77.25,99.0,106.0,41.125,83.0,Cities
2,1585.0,21.913754,36.01701,0.0,6.2,10.8,21.8,416.6,15.6,416.6,Forbes
3,1600.0,69.873125,13.951566,15.0,58.0,71.0,82.0,97.0,24.0,82.0,MetacriticAlbums
4,2000.0,51.223,22.938774,1.0,32.0,50.0,71.0,100.0,39.0,99.0,MetacriticMovies


In [23]:
# Dataset name -> range (max - min) of its "rating" column.
reg_gold_standard_ranges = {
    entry["gold_standard_file"]: entry["range"]
    for entry in reg_gold_standard_stats
}

reg_gold_standard_ranges

{'AAUP': 634.0,
 'Cities': 83.0,
 'Forbes': 416.6,
 'MetacriticAlbums': 82.0,
 'MetacriticMovies': 99.0}

In [24]:
# Regression runs scored by RMSE, restricted to a coverage of at least 0.5.
df_reg = df.loc[
    lambda frame: frame['task_name'].eq('Regression')
    & frame['metric'].eq('root_mean_squared_error')
    & frame['coverage'].ge(0.5)
].drop(columns=['test_name'])

# Rank the runs inside every (embedding, dataset) group: the lowest RMSE gets
# rank 1 and ties are broken by row order (method='first').
df_reg['rank'] = (
    df_reg
    .groupby(by=['test_name_short', 'gold_standard_file'])['score_value']
    .rank(method='first', ascending=True)
)

# Keep the rank-1 run of each group, ordered by dataset and then by RMSE.
df_reg_bestscores = (
    df_reg
    .loc[df_reg['rank'].eq(1.0)]
    .drop(columns=['rank', 'task_name'])
    .sort_values(by=['gold_standard_file', 'score_value'], ascending=True)
)

df_reg_bestscores['gold_standard_file'].unique()

array(['AAUP', 'Cities', 'Forbes', 'MetacriticAlbums', 'MetacriticMovies'],
      dtype=object)

In [25]:
# Normalised RMSE: each RMSE divided by the rating range of its own dataset.
df_reg_bestscores['nrmse'] = df_reg_bestscores['score_value'].div(
    df_reg_bestscores['gold_standard_file'].map(reg_gold_standard_ranges)
)

df_reg_bestscores['nrmse'].describe()

count    270.000000
mean       0.168645
std        0.055821
min        0.079649
25%        0.116285
50%        0.178505
75%        0.217955
max        0.273475
Name: nrmse, dtype: float64

In [26]:
def calculate_pvalue_reg(gold_standard_file, nrmse, embedding_type, lengths=None, bestscores=None):
    """One-sided binomial p-value that a variant's NRMSE is worse than the original's.

    ``round(n * (1 - nrmse))`` is taken as the number of successes in ``n``
    trials and tested (``alternative='less'``) against the success probability
    ``1 - nrmse_original``, where ``nrmse_original`` is the NRMSE of the
    "original-200" variant of the same embedding type on the same dataset.

    NOTE(review): this treats ``1 - NRMSE`` as a success proportion; confirm
    that this is the intended significance test for a regression error.

    Parameters
    ----------
    gold_standard_file : str
        Dataset name; key into ``lengths`` and value matched against the
        "gold_standard_file" column.
    nrmse : float
        Normalised RMSE of the variant under test.
    embedding_type : str
        Value matched against the "embedding_type" column.
    lengths : mapping, optional
        Dataset name -> number of trials ``n``. Defaults to the notebook-level
        ``dataset_lengths``.
    bestscores : pandas.DataFrame, optional
        Frame with "gold_standard_file", "embedding_type", "embedding_variant"
        and "nrmse" columns. Defaults to the notebook-level
        ``df_reg_bestscores``.

    Returns
    -------
    float
        The p-value reported by ``scipy.stats.binomtest``.

    Raises
    ------
    KeyError
        If ``lengths`` has no entry for ``gold_standard_file``.
    IndexError
        If ``bestscores`` has no "original-200" row for this dataset and
        embedding type.
    """
    # Fall back to the notebook globals so existing three-argument calls keep
    # working unchanged.
    if lengths is None:
        lengths = dataset_lengths
    if bestscores is None:
        bestscores = df_reg_bestscores

    # Fail with a clear message instead of letting a missing length surface
    # later as "NoneType * float".
    if gold_standard_file not in lengths:
        raise KeyError(f"no dataset length known for {gold_standard_file!r}")
    n = lengths[gold_standard_file]
    successes = round(n * (1 - nrmse))

    baseline = bestscores.loc[
        (bestscores["gold_standard_file"] == gold_standard_file)
        & (bestscores["embedding_type"] == embedding_type)
        & (bestscores["embedding_variant"] == "original-200"),
        "nrmse",
    ]
    if baseline.empty:
        raise IndexError(
            f"no 'original-200' baseline for embedding {embedding_type!r} "
            f"in dataset {gold_standard_file!r}"
        )
    nrmse_original = baseline.iloc[0]

    return binomtest(successes, n, 1 - nrmse_original, alternative='less').pvalue
    
# "original-200" rows are the baseline and get no p-value; every other variant
# is tested against the baseline of its own embedding type and dataset.
df_reg_bestscores["pvalue_worse_than_original"] = df_reg_bestscores.apply(
    lambda row: (
        None
        if row.embedding_variant == "original-200"
        else calculate_pvalue_reg(row.gold_standard_file, row.nrmse, row.embedding_type)
    ),
    axis=1,
)

# A variant is flagged "not worse" when its one-sided p-value reaches alpha.
alpha = 0.05
df_reg_bestscores["not_worse_than_original"] = (
    df_reg_bestscores["pvalue_worse_than_original"].ge(alpha)
)

In [27]:
# Caption of the summary table. Fixed: "underperfom" -> "underperform", and
# "classifier" -> "regressor" since this table summarises the regression task.
caption="""Count of GEval regression datasets in which the best regressor 
of each binary embedding variant did not significantly underperform the original one in normalized RMSE. 
The closer to 5, the less is the performance loss. $\\alpha=0.05$."""

# For every embedding type (rows) and non-original variant (columns), count the
# datasets flagged "not worse"; summing booleans counts the True entries.
df_reg_sig_not_worst = pd.pivot_table(
    df_reg_bestscores[df_reg_bestscores["embedding_variant"]!="original-200"],
    values="not_worse_than_original",
    index=["embedding_type"],
    columns=["embedding_variant"],
    aggfunc=lambda x: sum(x),
)

# escape=False keeps the "$_{...}$" markup of the embedding names intact.
print(df_reg_sig_not_worst.to_latex(
    escape=False,
    index_names=False,
    label="tab:reg-nrmse-significantly-not-worse",
    caption=caption,
    columns=[
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
))

# Column-wise mean: average count per variant across embedding types.
print(df_reg_sig_not_worst.mean())

df_reg_sig_not_worst

\begin{table}
\centering
\caption{Count of GEval regression datasets in which the best classifier 
of each binary embedding variant did not significantly underperfom the original one in normalized RMSE. 
The closer to 5, the less is the performance loss. $\alpha=0.05$.}
\label{tab:reg-nrmse-significantly-not-worse}
\begin{tabular}{lrrrr}
\toprule
{} &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
ComplEx             &           2 &         3 &         3 &         2 \\
DistMult            &           1 &         5 &         2 &         3 \\
RDF2vec$_{CBOW-OA}$ &           4 &         4 &         4 &         4 \\
RDF2vec$_{CBOW}$    &           5 &         5 &         4 &         5 \\
RDF2vec$_{SG-OA}$   &           5 &         4 &         4 &         4 \\
RDF2vec$_{SG}$      &           4 &         4 &         3 &         3 \\
RESCAL              &           5 &         5 &         4 &         4 \\
RotatE              &           5 &         5 &         5 &         3 \\
Tr

embedding_variant,auto-128,auto-256,auto-512,avgbin-200
embedding_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ComplEx,3,3,2,2
DistMult,5,2,3,1
RDF2vec$_{CBOW-OA}$,4,4,4,4
RDF2vec$_{CBOW}$,5,4,5,5
RDF2vec$_{SG-OA}$,4,4,4,5
RDF2vec$_{SG}$,4,3,3,4
RESCAL,5,4,4,5
RotatE,5,5,3,5
TransE-L1,5,5,2,5
TransE-L2,4,4,4,5


In [28]:
def get_latex_table_reg_rmse(gold_standard_file):
    """Build the LaTeX RMSE table for one regression dataset.

    Rows are embedding types and columns are embedding variants; each cell
    holds the lowest RMSE found in ``df_reg_bestscores`` for that pair. Rows
    are ordered by the RMSE of the "original-200" variant, lowest first.
    """
    variant_columns = [
        "original-200",
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
    # Label suffix: lower-cased dataset name with underscores turned into dashes.
    label_suffix = gold_standard_file.lower().replace("_", "-")

    dataset_scores = df_reg_bestscores[
        df_reg_bestscores["gold_standard_file"] == gold_standard_file
    ]
    rmse_table = dataset_scores.pivot_table(
        values="score_value",
        index=["embedding_type"],
        columns=["embedding_variant"],
        aggfunc="min",
    )
    rmse_table = rmse_table.sort_values(by="original-200", ascending=True)

    return rmse_table.to_latex(
        float_format="%.2f",
        escape=False,
        index_names=False,
        label=f"tab:reg-rmse-{label_suffix}",
        caption=f"RMSE scores for best regressor of each embedding variant in dataset {gold_standard_file}",
        columns=variant_columns,
    )

In [29]:
gold_standard_file = "AAUP"

# Print the LaTeX RMSE table for this dataset.
print(get_latex_table_reg_rmse(gold_standard_file))

# Show this dataset's best-score rows for every variant except the
# "original-200" baseline, hiding the configuration, embedding and p-value
# columns.
df_reg_bestscores.loc[
    lambda frame: frame["gold_standard_file"].eq(gold_standard_file)
    & frame["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{RMSE scores for best regressor of each embedding variant in dataset AAUP}
\label{tab:reg-rmse-aaup}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
RDF2vec$_{SG-OA}$   &         65.15 &       73.55 &     78.80 &     79.07 &     74.90 \\
TransE-L2           &         65.41 &       72.99 &     77.87 &     73.69 &     70.03 \\
RDF2vec$_{SG}$      &         65.64 &       61.41 &     64.05 &     66.33 &     65.62 \\
RDF2vec$_{CBOW-OA}$ &         67.45 &       79.40 &     79.72 &     80.61 &     78.42 \\
RESCAL              &         69.36 &       78.41 &     75.18 &     75.81 &     79.27 \\
DistMult            &         73.82 &       86.43 &     83.43 &     87.46 &     92.06 \\
ComplEx             &         76.76 &       88.78 &     85.23 &     86.93 &     91.80 \\
RDF2vec$_{CBOW}$    &         78.37 &       87.96 &     89.26 &     91.16 &     87.59 \\
TransE-L1           &         83.19 &   

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,nrmse,not_worse_than_original
6950,AAUP,0.616667,KNN,root_mean_squared_error,61.413465,rdf2vec-sg-200-avgbin,0.096867,True
7112,AAUP,0.616667,KNN,root_mean_squared_error,64.049246,rdf2vec-sg-128-autoencoded,0.101024,True
7436,AAUP,0.616667,KNN,root_mean_squared_error,65.615241,rdf2vec-sg-512-autoencoded,0.103494,True
7274,AAUP,0.616667,KNN,root_mean_squared_error,66.327004,rdf2vec-sg-256-autoencoded,0.104617,True
7541,AAUP,0.985417,KNN,root_mean_squared_error,70.034968,non-rdf2vec-TransE-L2-512-autoencoded,0.110465,True
7055,AAUP,0.985417,KNN,root_mean_squared_error,72.994073,non-rdf2vec-TransE-L2-200-avgbin,0.115133,True
6964,AAUP,0.982292,LR,root_mean_squared_error,73.549021,rdf2vec-sg-oa-200-avgbin,0.116008,True
7379,AAUP,0.985417,KNN,root_mean_squared_error,73.693062,non-rdf2vec-TransE-L2-256-autoencoded,0.116235,True
7451,AAUP,0.982292,KNN,root_mean_squared_error,74.898583,rdf2vec-sg-oa-512-autoencoded,0.118137,True
7171,AAUP,0.985417,LR,root_mean_squared_error,75.182409,non-rdf2vec-RESCAL-128-autoencoded,0.118584,True


In [30]:
gold_standard_file = "Cities"

# Print the LaTeX RMSE table for this dataset.
print(get_latex_table_reg_rmse(gold_standard_file))

# Show this dataset's best-score rows for every variant except the
# "original-200" baseline, hiding the configuration, embedding and p-value
# columns.
df_reg_bestscores.loc[
    lambda frame: frame["gold_standard_file"].eq(gold_standard_file)
    & frame["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{RMSE scores for best regressor of each embedding variant in dataset Cities}
\label{tab:reg-rmse-cities}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         12.75 &       12.46 &     14.90 &     13.82 &     12.33 \\
RDF2vec$_{SG-OA}$   &         13.38 &       15.31 &     13.32 &     13.20 &     12.89 \\
TransR              &         13.60 &       14.47 &     14.49 &     14.79 &     15.02 \\
ComplEx             &         15.85 &       20.80 &     22.56 &     19.07 &     18.96 \\
RESCAL              &         16.72 &       14.81 &     16.72 &     16.44 &     15.87 \\
TransE-L1           &         17.12 &       19.17 &     20.21 &     19.52 &     18.42 \\
DistMult            &         18.16 &       22.24 &     21.63 &     19.88 &     19.21 \\
RDF2vec$_{CBOW}$    &         19.64 &       19.15 &     19.02 &     18.24 &     18.69 \\
RDF2vec$_{CBOW-OA}$ &         19.96 

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,nrmse,not_worse_than_original
7532,Cities,0.933962,KNN,root_mean_squared_error,12.331837,non-rdf2vec-TransE-L2-512-autoencoded,0.148576,True
7046,Cities,0.981132,KNN,root_mean_squared_error,12.45817,non-rdf2vec-TransE-L2-200-avgbin,0.150098,True
7442,Cities,0.933962,KNN,root_mean_squared_error,12.894065,rdf2vec-sg-oa-512-autoencoded,0.15535,True
7280,Cities,0.933962,KNN,root_mean_squared_error,13.199637,rdf2vec-sg-oa-256-autoencoded,0.159032,True
7118,Cities,0.933962,KNN,root_mean_squared_error,13.322632,rdf2vec-sg-oa-128-autoencoded,0.160514,True
7370,Cities,0.933962,KNN,root_mean_squared_error,13.822084,non-rdf2vec-TransE-L2-256-autoencoded,0.166531,True
7061,Cities,0.981132,KNN,root_mean_squared_error,14.469911,non-rdf2vec-TransR-200-avgbin,0.174336,True
7223,Cities,0.933962,KNN,root_mean_squared_error,14.491846,non-rdf2vec-TransR-128-autoencoded,0.174601,True
7385,Cities,0.933962,KNN,root_mean_squared_error,14.787061,non-rdf2vec-TransR-256-autoencoded,0.178157,True
7001,Cities,0.981132,KNN,root_mean_squared_error,14.805206,non-rdf2vec-RESCAL-200-avgbin,0.178376,True


In [31]:
gold_standard_file = "Forbes"

# Print the LaTeX RMSE table for this dataset.
print(get_latex_table_reg_rmse(gold_standard_file))

# Show this dataset's best-score rows for every variant except the
# "original-200" baseline, hiding the configuration, embedding and p-value
# columns.
df_reg_bestscores.loc[
    lambda frame: frame["gold_standard_file"].eq(gold_standard_file)
    & frame["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{RMSE scores for best regressor of each embedding variant in dataset Forbes}
\label{tab:reg-rmse-forbes}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
RDF2vec$_{SG}$      &         33.97 &       34.83 &     34.73 &     34.02 &     33.18 \\
ComplEx             &         36.26 &       37.91 &     37.12 &     40.73 &     38.89 \\
RESCAL              &         36.84 &       38.37 &     36.64 &     37.59 &     37.63 \\
TransE-L2           &         36.95 &       36.95 &     36.80 &     38.05 &     44.80 \\
RDF2vec$_{SG-OA}$   &         37.06 &       37.83 &     37.07 &     38.24 &     37.92 \\
DistMult            &         37.31 &       39.81 &     37.40 &     39.07 &     39.01 \\
RDF2vec$_{CBOW-OA}$ &         37.81 &       37.85 &     37.08 &     39.63 &     39.34 \\
TransE-L1           &         37.93 &       38.03 &     36.72 &     39.17 &     44.34 \\
TransR              &         38.93 

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,nrmse,not_worse_than_original
7439,Forbes,0.71041,KNN,root_mean_squared_error,33.181928,rdf2vec-sg-512-autoencoded,0.079649,True
7277,Forbes,0.71041,KNN,root_mean_squared_error,34.023913,rdf2vec-sg-256-autoencoded,0.08167,True
7115,Forbes,0.71041,KNN,root_mean_squared_error,34.727899,rdf2vec-sg-128-autoencoded,0.08336,True
6952,Forbes,0.716719,LR,root_mean_squared_error,34.829952,rdf2vec-sg-200-avgbin,0.083605,True
7174,Forbes,0.849842,LR,root_mean_squared_error,36.638688,non-rdf2vec-RESCAL-128-autoencoded,0.087947,True
7204,Forbes,0.849842,LR,root_mean_squared_error,36.716011,non-rdf2vec-TransE-L1-128-autoencoded,0.088133,True
7219,Forbes,0.849842,LR,root_mean_squared_error,36.801178,non-rdf2vec-TransE-L2-128-autoencoded,0.088337,True
7057,Forbes,0.856151,LR,root_mean_squared_error,36.951116,non-rdf2vec-TransE-L2-200-avgbin,0.088697,True
7129,Forbes,0.838486,LR,root_mean_squared_error,37.069613,rdf2vec-sg-oa-128-autoencoded,0.088981,True
7102,Forbes,0.838486,LR,root_mean_squared_error,37.077869,rdf2vec-cbow-oa-128-autoencoded,0.089001,True


In [32]:
gold_standard_file = "MetacriticMovies"

# Print the LaTeX RMSE table for this dataset.
print(get_latex_table_reg_rmse(gold_standard_file))

# Show this dataset's best-score rows for every variant except the
# "original-200" baseline, hiding the configuration, embedding and p-value
# columns.
df_reg_bestscores.loc[
    lambda frame: frame["gold_standard_file"].eq(gold_standard_file)
    & frame["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{RMSE scores for best regressor of each embedding variant in dataset MetacriticMovies}
\label{tab:reg-rmse-metacriticmovies}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         19.88 &       20.89 &     21.01 &     21.57 &     21.31 \\
RDF2vec$_{SG}$      &         20.51 &       21.13 &     21.07 &     22.01 &     23.04 \\
RDF2vec$_{SG-OA}$   &         20.57 &       21.60 &     21.17 &     21.80 &     22.38 \\
TransR              &         20.80 &       21.66 &     21.51 &     21.87 &     23.12 \\
ComplEx             &         21.16 &       23.02 &     22.19 &     23.44 &     23.03 \\
DistMult            &         21.40 &       25.08 &     22.05 &     24.74 &     23.77 \\
RESCAL              &         21.67 &       22.23 &     21.58 &     24.22 &     24.36 \\
TransE-L1           &         22.99 &       23.16 &     22.99 &     23.91 &     25.31 \\
RDF2vec$_{CBOW-O

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,nrmse,not_worse_than_original
7048,MetacriticMovies,0.9345,LR,root_mean_squared_error,20.887112,non-rdf2vec-TransE-L2-200-avgbin,0.210981,True
7210,MetacriticMovies,0.934,LR,root_mean_squared_error,21.00652,non-rdf2vec-TransE-L2-128-autoencoded,0.212187,True
7105,MetacriticMovies,0.9265,LR,root_mean_squared_error,21.074339,rdf2vec-sg-128-autoencoded,0.212872,True
6943,MetacriticMovies,0.927,LR,root_mean_squared_error,21.13447,rdf2vec-sg-200-avgbin,0.213479,True
7120,MetacriticMovies,0.9335,LR,root_mean_squared_error,21.167528,rdf2vec-sg-oa-128-autoencoded,0.213813,True
7535,MetacriticMovies,0.934,KNN,root_mean_squared_error,21.306179,non-rdf2vec-TransE-L2-512-autoencoded,0.215214,True
7225,MetacriticMovies,0.934,LR,root_mean_squared_error,21.511273,non-rdf2vec-TransR-128-autoencoded,0.217286,True
7372,MetacriticMovies,0.934,LR,root_mean_squared_error,21.566693,non-rdf2vec-TransE-L2-256-autoencoded,0.217845,False
7165,MetacriticMovies,0.934,LR,root_mean_squared_error,21.581208,non-rdf2vec-RESCAL-128-autoencoded,0.217992,True
6958,MetacriticMovies,0.9335,LR,root_mean_squared_error,21.604017,rdf2vec-sg-oa-200-avgbin,0.218222,True


In [33]:
gold_standard_file = "MetacriticAlbums"

# Print the LaTeX RMSE table for this dataset.
print(get_latex_table_reg_rmse(gold_standard_file))

# Show this dataset's best-score rows for every variant except the
# "original-200" baseline, hiding the configuration, embedding and p-value
# columns.
df_reg_bestscores.loc[
    lambda frame: frame["gold_standard_file"].eq(gold_standard_file)
    & frame["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{RMSE scores for best regressor of each embedding variant in dataset MetacriticAlbums}
\label{tab:reg-rmse-metacriticalbums}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         13.88 &       14.34 &     14.14 &     14.83 &     15.11 \\
DistMult            &         14.29 &       16.09 &     14.58 &     15.99 &     15.63 \\
ComplEx             &         14.41 &       14.75 &     16.11 &     15.95 &     15.85 \\
TransR              &         14.66 &       14.70 &     14.47 &     15.21 &     15.81 \\
RESCAL              &         14.75 &       14.83 &     15.79 &     15.04 &     15.75 \\
TransE-L1           &         14.83 &       14.79 &     14.35 &     15.08 &     15.97 \\
RotatE              &         15.03 &       15.04 &     14.54 &     15.53 &     15.87 \\
RDF2vec$_{SG}$      &         15.61 &       15.83 &     14.39 &     15.74 &     15.77 \\
RDF2vec$_{CBOW}$

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,nrmse,not_worse_than_original
7213,MetacriticAlbums,0.9475,LR,root_mean_squared_error,14.140001,non-rdf2vec-TransE-L2-128-autoencoded,0.172439,True
7051,MetacriticAlbums,0.948125,LR,root_mean_squared_error,14.338973,non-rdf2vec-TransE-L2-200-avgbin,0.174866,True
7198,MetacriticAlbums,0.9475,LR,root_mean_squared_error,14.351834,non-rdf2vec-TransE-L1-128-autoencoded,0.175022,True
7123,MetacriticAlbums,0.943125,LR,root_mean_squared_error,14.367842,rdf2vec-sg-oa-128-autoencoded,0.175218,True
7108,MetacriticAlbums,0.94125,LR,root_mean_squared_error,14.392115,rdf2vec-sg-128-autoencoded,0.175514,True
7228,MetacriticAlbums,0.9475,LR,root_mean_squared_error,14.471843,non-rdf2vec-TransR-128-autoencoded,0.176486,True
7183,MetacriticAlbums,0.9475,LR,root_mean_squared_error,14.5426,non-rdf2vec-RotatE-128-autoencoded,0.177349,True
7153,MetacriticAlbums,0.9475,LR,root_mean_squared_error,14.582136,non-rdf2vec-DistMult-128-autoencoded,0.177831,True
7096,MetacriticAlbums,0.943125,LR,root_mean_squared_error,14.63442,rdf2vec-cbow-oa-128-autoencoded,0.178469,True
7081,MetacriticAlbums,0.943125,LR,root_mean_squared_error,14.640437,rdf2vec-cbow-128-autoencoded,0.178542,True


## Document Similarity

In [34]:
# Document-similarity runs scored by harmonic mean, restricted to a coverage
# of at least 0.5.
df_docsim = df.loc[
    lambda frame: frame['task_name'].eq('DocumentSimilarity')
    & frame['metric'].eq('harmonic_mean')
    & frame['coverage'].ge(0.5)
].drop(columns=['test_name'])

# Rank the runs inside every (embedding, dataset) group: the highest score
# gets rank 1 and ties are broken by row order (method='first').
df_docsim['rank'] = (
    df_docsim
    .groupby(by=['test_name_short', 'gold_standard_file'])['score_value']
    .rank(method='first', ascending=False)
)

# Keep the rank-1 run of each group, highest scores first.
df_docsim_bestscores = (
    df_docsim
    .loc[df_docsim['rank'].eq(1.0)]
    .drop(columns=['rank', 'task_name'])
    .sort_values(by=['gold_standard_file', 'score_value'], ascending=False)
)

df_docsim_bestscores['gold_standard_file'].unique()

array(['LP50'], dtype=object)

In [35]:
# Display the rank-1 document-similarity row of every embedding variant.
df_docsim_bestscores

Unnamed: 0,gold_standard_file,coverage,model,model_configuration,metric,score_value,test_name_short,embedding_type,embedding_variant
128,LP50,0.945141,with_weights,-,harmonic_mean,0.510978,non-rdf2vec-TransR-200-avgbin,TransR,avgbin-200
62,LP50,0.945141,with_weights,-,harmonic_mean,0.505859,non-rdf2vec-TransR-200-original,TransR,original-200
104,LP50,0.945141,with_weights,-,harmonic_mean,0.484123,non-rdf2vec-RESCAL-200-avgbin,RESCAL,avgbin-200
326,LP50,0.945141,with_weights,-,harmonic_mean,0.481271,non-rdf2vec-TransR-512-autoencoded,TransR,auto-512
260,LP50,0.945141,with_weights,-,harmonic_mean,0.46638,non-rdf2vec-TransR-256-autoencoded,TransR,auto-256
224,LP50,0.945141,with_weights,-,harmonic_mean,0.463452,non-rdf2vec-ComplEx-256-autoencoded,ComplEx,auto-256
308,LP50,0.945141,with_weights,-,harmonic_mean,0.458669,non-rdf2vec-RotatE-512-autoencoded,RotatE,auto-512
44,LP50,0.945141,with_weights,-,harmonic_mean,0.457817,non-rdf2vec-RotatE-200-original,RotatE,original-200
236,LP50,0.945141,with_weights,-,harmonic_mean,0.45557,non-rdf2vec-RESCAL-256-autoencoded,RESCAL,auto-256
152,LP50,0.940439,with_weights,-,harmonic_mean,0.455101,rdf2vec-sg-oa-128-autoencoded,RDF2vec$_{SG-OA}$,auto-128


In [36]:
def calculate_pvalue_docsim(score_value, embedding_type, n=1225, gold_standard_file="LP50", bestscores=None):
    """One-sided binomial p-value that a variant scores worse than the original.

    ``round(n * score_value)`` is taken as the number of successes in ``n``
    trials and tested (``alternative='less'``) against the score of the
    "original-200" variant of the same embedding type, used as the success
    probability.

    Parameters
    ----------
    score_value : float
        Score of the variant under test.
    embedding_type : str
        Value matched against the "embedding_type" column.
    n : int, optional
        Number of trials. The default 1225 is presumably the number of
        document pairs in LP50 (50 * 49 / 2) -- TODO confirm.
    gold_standard_file : str, optional
        Dataset whose baseline row is looked up.
    bestscores : pandas.DataFrame, optional
        Frame with "gold_standard_file", "embedding_type", "embedding_variant"
        and "score_value" columns. Defaults to the notebook-level
        ``df_docsim_bestscores``.

    Returns
    -------
    float
        The p-value reported by ``scipy.stats.binomtest``.

    Raises
    ------
    IndexError
        If ``bestscores`` has no "original-200" row for this dataset and
        embedding type.
    """
    # Fall back to the notebook global so existing two-argument calls keep
    # working unchanged.
    if bestscores is None:
        bestscores = df_docsim_bestscores

    successes = round(n * score_value)

    baseline = bestscores.loc[
        (bestscores["gold_standard_file"] == gold_standard_file)
        & (bestscores["embedding_type"] == embedding_type)
        & (bestscores["embedding_variant"] == "original-200"),
        "score_value",
    ]
    if baseline.empty:
        raise IndexError(
            f"no 'original-200' baseline for embedding {embedding_type!r} "
            f"in dataset {gold_standard_file!r}"
        )
    p = baseline.iloc[0]

    return binomtest(successes, n, p, alternative='less').pvalue
    
# "original-200" rows are the baseline and get no p-value; every other variant
# is tested against the baseline of its own embedding type.
df_docsim_bestscores["pvalue_worse_than_original"] = df_docsim_bestscores.apply(
    lambda row: (
        None
        if row.embedding_variant == "original-200"
        else calculate_pvalue_docsim(row.score_value, row.embedding_type)
    ),
    axis=1,
)

# A variant is flagged "not worse" when its one-sided p-value reaches alpha.
alpha = 0.05
df_docsim_bestscores["not_worse_than_original"] = (
    df_docsim_bestscores["pvalue_worse_than_original"].ge(alpha)
)

In [37]:
# Caption of the summary table. Fixed typo: "underperfom" -> "underperform".
caption="""Binary embedding variants that did not significantly underperform the original one in harmonic mean, 
for the task of Document Similarity in the GEval dataset LP50. $\\alpha=0.05$."""

# One cell per embedding type (rows) and non-original variant (columns),
# aggregating the "not worse" flags of that pair.
df_docsim_sig_not_worst = pd.pivot_table(
    df_docsim_bestscores[df_docsim_bestscores["embedding_variant"]!="original-200"],
    values="not_worse_than_original",
    index=["embedding_type"],
    columns=["embedding_variant"],
    aggfunc=lambda x: sum(x),
)

# escape=False keeps the "$_{...}$" markup of the embedding names intact.
print(df_docsim_sig_not_worst.to_latex(
    escape=False,
    index_names=False,
    label="tab:docsim-harmonic_mean-significantly-not-worse",
    caption=caption,
    columns=[
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
))

# Column-wise mean: share of embedding types flagged "not worse" per variant.
print(df_docsim_sig_not_worst.mean())

df_docsim_sig_not_worst

\begin{table}
\centering
\caption{Binary embedding variants that did not significantly underperfom the original one in harmonic mean, 
for the task of Document Similarity in the GEval dataset LP50. $\alpha=0.05$.}
\label{tab:docsim-harmonic_mean-significantly-not-worse}
\begin{tabular}{lllll}
\toprule
{} &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
ComplEx             &        True &     False &      True &     False \\
DistMult            &       False &     False &     False &     False \\
RDF2vec$_{CBOW-OA}$ &        True &      True &      True &      True \\
RDF2vec$_{CBOW}$    &       False &      True &      True &      True \\
RDF2vec$_{SG-OA}$   &        True &      True &      True &     False \\
RESCAL              &        True &      True &      True &     False \\
RotatE              &        True &     False &      True &      True \\
TransE-L1           &        True &     False &      True &      True \\
TransE-L2           &        True &     False & 

embedding_variant,auto-128,auto-256,auto-512,avgbin-200
embedding_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ComplEx,False,True,False,True
DistMult,False,False,False,False
RDF2vec$_{CBOW-OA}$,True,True,True,True
RDF2vec$_{CBOW}$,True,True,True,False
RDF2vec$_{SG-OA}$,True,True,False,True
RESCAL,True,True,False,True
RotatE,False,True,True,True
TransE-L1,False,True,True,True
TransE-L2,False,True,True,True
TransR,False,False,False,True


In [38]:
def get_latex_table_docsim_harm_mean(gold_standard_file):
    """Build the LaTeX harmonic-mean table for one document-similarity dataset.

    Rows are embedding types and columns are embedding variants; each cell
    holds the highest score found in ``df_docsim_bestscores`` for that pair.
    Rows are ordered by the score of the "original-200" variant, highest first.
    """
    variant_columns = [
        "original-200",
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
    # Label suffix: lower-cased dataset name with underscores turned into dashes.
    label_suffix = gold_standard_file.lower().replace("_", "-")

    dataset_scores = df_docsim_bestscores[
        df_docsim_bestscores["gold_standard_file"] == gold_standard_file
    ]
    score_table = dataset_scores.pivot_table(
        values="score_value",
        index=["embedding_type"],
        columns=["embedding_variant"],
        aggfunc="max",
    )
    score_table = score_table.sort_values(by="original-200", ascending=False)

    return score_table.to_latex(
        float_format="%.3f",
        escape=False,
        index_names=False,
        label=f"tab:docsim-harm-mean-{label_suffix}",
        caption=f"Harmonic Mean scores for best model of each embedding variant in dataset {gold_standard_file}",
        columns=variant_columns,
    )

In [39]:
gold_standard_file = "LP50"

# Print the LaTeX harmonic-mean table for this dataset.
print(get_latex_table_docsim_harm_mean(gold_standard_file))

# Show this dataset's best-score rows for every variant except the
# "original-200" baseline, hiding the configuration, embedding and p-value
# columns.
df_docsim_bestscores.loc[
    lambda frame: frame["gold_standard_file"].eq(gold_standard_file)
    & frame["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{Harmonic Mean scores for best model of each embedding variant in dataset LP50}
\label{tab:docsim-harm-mean-lp50}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransR              &         0.506 &       0.511 &     0.439 &     0.466 &     0.481 \\
RotatE              &         0.458 &       0.441 &     0.397 &     0.450 &     0.459 \\
TransE-L2           &         0.447 &       0.443 &     0.402 &     0.435 &     0.430 \\
DistMult            &         0.445 &       0.380 &     0.402 &     0.401 &     0.408 \\
ComplEx             &         0.442 &       0.442 &     0.417 &     0.463 &     0.397 \\
TransE-L1           &         0.435 &       0.418 &     0.407 &     0.421 &     0.439 \\
RESCAL              &         0.408 &       0.484 &     0.427 &     0.456 &     0.266 \\
RDF2vec$_{CBOW}$    &         0.352 &       0.316 &     0.358 &     0.357 &     0.356 \\
RDF2vec$_{SG-OA}$   &      

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
128,LP50,0.945141,with_weights,harmonic_mean,0.510978,non-rdf2vec-TransR-200-avgbin,True
104,LP50,0.945141,with_weights,harmonic_mean,0.484123,non-rdf2vec-RESCAL-200-avgbin,True
326,LP50,0.945141,with_weights,harmonic_mean,0.481271,non-rdf2vec-TransR-512-autoencoded,False
260,LP50,0.945141,with_weights,harmonic_mean,0.46638,non-rdf2vec-TransR-256-autoencoded,False
224,LP50,0.945141,with_weights,harmonic_mean,0.463452,non-rdf2vec-ComplEx-256-autoencoded,True
308,LP50,0.945141,with_weights,harmonic_mean,0.458669,non-rdf2vec-RotatE-512-autoencoded,True
236,LP50,0.945141,with_weights,harmonic_mean,0.45557,non-rdf2vec-RESCAL-256-autoencoded,True
152,LP50,0.940439,with_weights,harmonic_mean,0.455101,rdf2vec-sg-oa-128-autoencoded,True
242,LP50,0.945141,with_weights,harmonic_mean,0.450077,non-rdf2vec-RotatE-256-autoencoded,True
122,LP50,0.945141,with_weights,harmonic_mean,0.443065,non-rdf2vec-TransE-L2-200-avgbin,True


## Entity Relatedness

In [40]:
# Entity-relatedness runs with a coverage of at least 0.5. The metric is pinned
# to Kendall's tau explicitly, as the other task sections do for their metric,
# so the ranking below can never compare scores of different metrics.
df_entrel = df[
    (df['task_name']=='EntityRelatedness')
    & (df['metric']=='kendalltau_correlation')
    & (df['coverage']>=0.5)
].drop(columns=['test_name'])

# Rank the runs inside every (embedding, dataset) group: the highest score
# gets rank 1 and ties are broken by row order (method='first').
df_entrel['rank'] = df_entrel.groupby(by=['test_name_short', 'gold_standard_file'])['score_value'].rank(
    method='first',
    ascending=False,
)

# Keep the rank-1 run of each group, highest scores first.
df_entrel_bestscores = df_entrel[df_entrel['rank']==1.0].drop(columns=['rank', 'task_name']).sort_values(
    by=['gold_standard_file', 'score_value'], ascending=False
)

df_entrel_bestscores['gold_standard_file'].unique()

array(['KORE'], dtype=object)

In [41]:
# Display the rank-1 entity-relatedness row of every embedding variant.
df_entrel_bestscores

Unnamed: 0,gold_standard_file,coverage,model,model_configuration,metric,score_value,test_name_short,embedding_type,embedding_variant
333,KORE,1.0,-,-,kendalltau_correlation,0.419048,rdf2vec-sg-oa-200-original,RDF2vec$_{SG-OA}$,original-200
377,KORE,1.0,-,-,kendalltau_correlation,0.401504,rdf2vec-sg-oa-512-autoencoded,RDF2vec$_{SG-OA}$,auto-512
366,KORE,1.0,-,-,kendalltau_correlation,0.376441,rdf2vec-sg-oa-256-autoencoded,RDF2vec$_{SG-OA}$,auto-256
344,KORE,1.0,-,-,kendalltau_correlation,0.34787,rdf2vec-sg-oa-200-avgbin,RDF2vec$_{SG-OA}$,avgbin-200
355,KORE,1.0,-,-,kendalltau_correlation,0.34386,rdf2vec-sg-oa-128-autoencoded,RDF2vec$_{SG-OA}$,auto-128
374,KORE,1.0,-,-,kendalltau_correlation,0.313283,rdf2vec-cbow-512-autoencoded,RDF2vec$_{CBOW}$,auto-512
341,KORE,1.0,-,-,kendalltau_correlation,0.310777,rdf2vec-cbow-200-avgbin,RDF2vec$_{CBOW}$,avgbin-200
352,KORE,1.0,-,-,kendalltau_correlation,0.297744,rdf2vec-cbow-128-autoencoded,RDF2vec$_{CBOW}$,auto-128
363,KORE,1.0,-,-,kendalltau_correlation,0.296241,rdf2vec-cbow-256-autoencoded,RDF2vec$_{CBOW}$,auto-256
330,KORE,1.0,-,-,kendalltau_correlation,0.289724,rdf2vec-cbow-200-original,RDF2vec$_{CBOW}$,original-200


In [42]:
def calculate_pvalue_entrel(score_value, embedding_type, n=441):
    """One-sided binomial test: is a variant's KORE score below the original's?

    The score is treated as a success proportion: ``round(n * score_value)``
    successes out of ``n`` trials are tested against the score of the
    ``original-200`` variant of the same embedding type, with the alternative
    hypothesis that the true proportion is *less* than that baseline.

    Parameters
    ----------
    score_value : float
        Score of the variant under test.
    embedding_type : str
        Value of the ``embedding_type`` column used to look up the baseline row
        in the module-level ``df_entrel_bestscores`` frame.
    n : int, default 441
        Number of trials used for the test. 441 is the value that used to be
        hard-coded here (presumably the size of the KORE evaluation set -- TODO confirm).

    Returns
    -------
    float
        p-value of ``binomtest(..., alternative='less')``.

    Raises
    ------
    IndexError
        If ``df_entrel_bestscores`` has no ``original-200`` KORE row for
        ``embedding_type``.
    """
    baseline_scores = df_entrel_bestscores.loc[
        (df_entrel_bestscores["gold_standard_file"] == "KORE")
        & (df_entrel_bestscores["embedding_type"] == embedding_type)
        & (df_entrel_bestscores["embedding_variant"] == "original-200"),
        "score_value",
    ]
    if baseline_scores.empty:
        # Same exception type `.iloc[0]` would raise, but with an actionable message.
        raise IndexError(
            f"no 'original-200' KORE baseline found for embedding type {embedding_type!r}"
        )
    p = baseline_scores.iloc[0]

    # NOTE(review): binomtest requires 0 <= successes <= n and 0 <= p <= 1, so a
    # negative correlation score (variant or baseline) makes it raise ValueError.
    successes = round(n * score_value)

    return binomtest(successes, n, p, alternative='less').pvalue
    
# Significance level for the "not worse than original" decision.
alpha = 0.05

# p-value of every binarized variant against the original of the same embedding
# type; the original rows themselves get no p-value.
df_entrel_bestscores["pvalue_worse_than_original"] = df_entrel_bestscores.apply(
    lambda row: (
        None
        if row.embedding_variant == "original-200"
        else calculate_pvalue_entrel(row.score_value, row.embedding_type)
    ),
    axis=1,
)

# True when the one-sided test does not reject at `alpha`; rows with a missing
# p-value compare as False.
df_entrel_bestscores["not_worse_than_original"] = (
    df_entrel_bestscores["pvalue_worse_than_original"].ge(alpha)
)

In [43]:
# Caption of the LaTeX table; `\\alpha` is escaped so the emitted LaTeX reads `\alpha`.
caption="""Binary embedding variants that did not significantly underperform the original one in Kendall's Tau correlation, 
for the task of Entity Relatedness in the GEval dataset KORE.  $\\alpha=0.05$."""

# One row per embedding type, one column per binarized variant; each cell sums the
# `not_worse_than_original` flags of that (type, variant) pair.
df_entrel_sig_not_worst = pd.pivot_table(
    df_entrel_bestscores[df_entrel_bestscores["embedding_variant"]!="original-200"],
    values="not_worse_than_original", 
    index=["embedding_type"],
    columns=["embedding_variant"],
    aggfunc=lambda x: sum(x),
)

# `columns=` fixes the column order of the printed LaTeX table.
print(df_entrel_sig_not_worst.to_latex(
    escape=False,
    index_names=False,
    label="tab:entrel-tau-significantly-not-worse",
    caption=caption,
    columns=[
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
))

# Column-wise mean: per variant, the share of embedding types flagged "not worse".
print(df_entrel_sig_not_worst.mean())

df_entrel_sig_not_worst

\begin{table}
\centering
\caption{Binary embedding variants that did not significantly underperfom the original one in Kendall's Tau correlation, 
for the task of Entity Relatedness in the GEval dataset KORE.  $\alpha=0.05$.}
\label{tab:entrel-tau-significantly-not-worse}
\begin{tabular}{lllll}
\toprule
{} &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
ComplEx             &        True &      True &      True &      True \\
DistMult            &        True &      True &      True &      True \\
RDF2vec$_{CBOW-OA}$ &        True &      True &      True &      True \\
RDF2vec$_{CBOW}$    &        True &      True &      True &      True \\
RDF2vec$_{SG-OA}$   &       False &     False &     False &      True \\
RDF2vec$_{SG}$      &        True &      True &      True &      True \\
RESCAL              &        True &      True &     False &      True \\
RotatE              &        True &      True &      True &      True \\
TransE-L1           &        True &      True 

embedding_variant,auto-128,auto-256,auto-512,avgbin-200
embedding_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ComplEx,True,True,True,True
DistMult,True,True,True,True
RDF2vec$_{CBOW-OA}$,True,True,True,True
RDF2vec$_{CBOW}$,True,True,True,True
RDF2vec$_{SG-OA}$,False,False,True,False
RDF2vec$_{SG}$,True,True,True,True
RESCAL,True,False,True,True
RotatE,True,True,True,True
TransE-L1,True,True,True,True
TransE-L2,True,True,True,True


In [44]:
def get_latex_table_entrel_tau(gold_standard_file):
    """Render the best Entity Relatedness scores of one dataset as a LaTeX table.

    Rows of the module-level ``df_entrel_bestscores`` frame belonging to
    ``gold_standard_file`` are pivoted into an embedding-type x embedding-variant
    grid (maximum ``score_value`` per cell), sorted by the ``original-200``
    column in descending order, and returned as a LaTeX string.

    Parameters
    ----------
    gold_standard_file : str
        Dataset name; also used (lower-cased, ``_`` replaced by ``-``) in the table label.

    Returns
    -------
    str
        LaTeX source of the table.
    """
    variant_columns = [
        "original-200",
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
    label_slug = str.lower(gold_standard_file).replace("_", "-")

    dataset_rows = df_entrel_bestscores[
        df_entrel_bestscores["gold_standard_file"] == gold_standard_file
    ]
    score_grid = pd.pivot_table(
        dataset_rows,
        values="score_value",
        index=["embedding_type"],
        columns=["embedding_variant"],
        aggfunc="max",
    )
    ranked_grid = score_grid.sort_values(by="original-200", ascending=False)

    return ranked_grid.to_latex(
        float_format="%.3f",
        escape=False,
        index_names=False,
        label=f"tab:entrel-tau-{label_slug}",
        caption=f"Kendall's Tau correlation scores for each embedding variant in dataset {gold_standard_file}",
        columns=variant_columns,
    )

In [45]:
gold_standard_file = "KORE"

# Emit the LaTeX score table for this dataset.
print(get_latex_table_entrel_tau(gold_standard_file))

# Binarized variants only (the original-200 baseline is excluded), without the
# columns that are not needed for inspection.
df_entrel_bestscores.loc[
    df_entrel_bestscores["gold_standard_file"].eq(gold_standard_file)
    & df_entrel_bestscores["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{Kendall's Tau correlation scores for each embedding variant in dataset KORE}
\label{tab:entrel-tau-kore}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
RDF2vec$_{SG-OA}$   &         0.419 &       0.348 &     0.344 &     0.376 &     0.402 \\
RDF2vec$_{CBOW}$    &         0.290 &       0.311 &     0.298 &     0.296 &     0.313 \\
ComplEx             &         0.234 &       0.218 &     0.211 &     0.204 &     0.217 \\
TransR              &         0.227 &       0.213 &     0.222 &     0.241 &     0.269 \\
RDF2vec$_{SG}$      &         0.189 &       0.182 &     0.180 &     0.186 &     0.187 \\
RESCAL              &         0.188 &       0.183 &     0.168 &     0.139 &     0.229 \\
DistMult            &         0.130 &       0.138 &     0.188 &     0.155 &     0.206 \\
RDF2vec$_{CBOW-OA}$ &         0.121 &       0.128 &     0.128 &     0.134 &     0.113 \\
TransE-L1           &         0.095

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
377,KORE,1.0,-,kendalltau_correlation,0.401504,rdf2vec-sg-oa-512-autoencoded,True
366,KORE,1.0,-,kendalltau_correlation,0.376441,rdf2vec-sg-oa-256-autoencoded,False
344,KORE,1.0,-,kendalltau_correlation,0.34787,rdf2vec-sg-oa-200-avgbin,False
355,KORE,1.0,-,kendalltau_correlation,0.34386,rdf2vec-sg-oa-128-autoencoded,False
374,KORE,1.0,-,kendalltau_correlation,0.313283,rdf2vec-cbow-512-autoencoded,True
341,KORE,1.0,-,kendalltau_correlation,0.310777,rdf2vec-cbow-200-avgbin,True
352,KORE,1.0,-,kendalltau_correlation,0.297744,rdf2vec-cbow-128-autoencoded,True
363,KORE,1.0,-,kendalltau_correlation,0.296241,rdf2vec-cbow-256-autoencoded,True
384,KORE,1.0,-,kendalltau_correlation,0.269173,non-rdf2vec-TransR-512-autoencoded,True
373,KORE,1.0,-,kendalltau_correlation,0.240602,non-rdf2vec-TransR-256-autoencoded,True


## Semantic Analogies

In [46]:
# Semantic Analogies results with at least 50% coverage; the long `test_name`
# column is dropped (the derived `test_name_short` column is kept).
df_semana = df.loc[
    df['task_name'].eq('SemanticAnalogies') & df['coverage'].ge(0.5)
].drop(columns=['test_name'])

# Rank the scores inside every (embedding, dataset) group, best score first.
# `method='first'` resolves ties by row order, so exactly one row per group has rank 1.
df_semana['rank'] = (
    df_semana
    .groupby(['test_name_short', 'gold_standard_file'])['score_value']
    .rank(method='first', ascending=False)
)

# Keep the top-ranked row of each group, ordered by dataset and score (both descending).
df_semana_bestscores = (
    df_semana.loc[df_semana['rank'].eq(1.0)]
    .drop(columns=['rank', 'task_name'])
    .sort_values(by=['gold_standard_file', 'score_value'], ascending=False)
)

df_semana_bestscores['gold_standard_file'].unique()

array(['currency_entities', 'city_state_entities',
       'capital_country_entities', 'all_capital_country_entities'],
      dtype=object)

In [47]:
def calculate_pvalue_semana(gold_standard_file, score_value, embedding_type):
    """One-sided binomial test: is a variant's score below the original's on a dataset?

    The score is treated as a success proportion: ``round(n * score_value)``
    successes out of ``n`` trials, where ``n`` is looked up in the module-level
    ``dataset_lengths`` mapping. The null proportion is the score of the
    ``original-200`` variant of the same embedding type on the same dataset,
    and the alternative hypothesis is that the true proportion is *less*.

    Parameters
    ----------
    gold_standard_file : str
        Dataset name; key into ``dataset_lengths`` and filter value for the
        ``gold_standard_file`` column of ``df_semana_bestscores``.
    score_value : float
        Score of the variant under test.
    embedding_type : str
        Value of the ``embedding_type`` column used to look up the baseline row.

    Returns
    -------
    float
        p-value of ``binomtest(..., alternative='less')``.

    Raises
    ------
    KeyError
        If ``dataset_lengths`` has no entry for ``gold_standard_file``.
    IndexError
        If ``df_semana_bestscores`` has no ``original-200`` row for this
        dataset and embedding type.
    """
    n = dataset_lengths.get(gold_standard_file)
    if n is None:
        # Previously this surfaced later as an opaque "NoneType * float" TypeError.
        raise KeyError(f"no entry in dataset_lengths for dataset {gold_standard_file!r}")

    baseline_scores = df_semana_bestscores.loc[
        (df_semana_bestscores["gold_standard_file"] == gold_standard_file)
        & (df_semana_bestscores["embedding_type"] == embedding_type)
        & (df_semana_bestscores["embedding_variant"] == "original-200"),
        "score_value",
    ]
    if baseline_scores.empty:
        # Same exception type `.iloc[0]` would raise, but with an actionable message.
        raise IndexError(
            f"no 'original-200' baseline found for embedding type {embedding_type!r} "
            f"in dataset {gold_standard_file!r}"
        )
    p = baseline_scores.iloc[0]

    successes = round(n * score_value)

    return binomtest(successes, n, p, alternative='less').pvalue
    
# Significance level for the "not worse than original" decision.
alpha = 0.05

# p-value of every binarized variant against the original of the same embedding
# type on the same dataset; the original rows themselves get no p-value.
df_semana_bestscores["pvalue_worse_than_original"] = df_semana_bestscores.apply(
    lambda row: (
        None
        if row.embedding_variant == "original-200"
        else calculate_pvalue_semana(
            row.gold_standard_file,
            row.score_value,
            row.embedding_type,
        )
    ),
    axis=1,
)

# True when the one-sided test does not reject at `alpha`; rows with a missing
# p-value compare as False.
df_semana_bestscores["not_worse_than_original"] = (
    df_semana_bestscores["pvalue_worse_than_original"].ge(alpha)
)

In [48]:
# Caption of the LaTeX table; `\\alpha` is escaped so the emitted LaTeX reads `\alpha`.
caption="""Count of GEval Semantic Analogies datasets in which the best model 
of each binary embedding variant did not significantly underperform the original one in precision@10. 
The closer to 4, the less is the performance loss. $\\alpha=0.05$."""

# One row per embedding type, one column per binarized variant; each cell counts the
# datasets whose `not_worse_than_original` flag is True for that (type, variant) pair.
df_semana_sig_not_worst = pd.pivot_table(
    df_semana_bestscores[df_semana_bestscores["embedding_variant"]!="original-200"],
    values="not_worse_than_original", 
    index=["embedding_type"],
    columns=["embedding_variant"],
    aggfunc=lambda x: sum(x),
)

# `columns=` fixes the column order of the printed LaTeX table.
print(df_semana_sig_not_worst.to_latex(
    escape=False,
    index_names=False,
    label="tab:semana-p-at-10-significantly-not-worse",
    caption=caption,
    columns=[
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
))

# Column-wise mean: per variant, the average dataset count over embedding types.
print(df_semana_sig_not_worst.mean())

df_semana_sig_not_worst

\begin{table}
\centering
\caption{Count of GEval Semantic Analogies datasets in which the best model 
of each binary embedding variant did not significantly underperfom the original one in precision@10. 
The closer to 4, the less is the performance loss. $\alpha=0.05$.}
\label{tab:semana-p-at-10-significantly-not-worse}
\begin{tabular}{lrrrr}
\toprule
{} &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
ComplEx             &           0 &         0 &         0 &         0 \\
DistMult            &           1 &         1 &         1 &         1 \\
RDF2vec$_{CBOW-OA}$ &           0 &         0 &         0 &         0 \\
RDF2vec$_{CBOW}$    &           0 &         0 &         0 &         0 \\
RDF2vec$_{SG-OA}$   &           1 &         0 &         0 &         1 \\
RESCAL              &           3 &         2 &         2 &         2 \\
RotatE              &           1 &         1 &         1 &         2 \\
TransE-L1           &           0 &         0 &         0 &         1 

embedding_variant,auto-128,auto-256,auto-512,avgbin-200
embedding_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ComplEx,0,0,0,0
DistMult,1,1,1,1
RDF2vec$_{CBOW-OA}$,0,0,0,0
RDF2vec$_{CBOW}$,0,0,0,0
RDF2vec$_{SG-OA}$,0,0,1,1
RESCAL,2,2,2,3
RotatE,1,1,2,1
TransE-L1,0,0,1,0
TransE-L2,0,0,1,1
TransR,0,0,2,1


In [49]:
def get_latex_table_semana(gold_standard_file):
    """Render the best Semantic Analogies scores of one dataset as a LaTeX table.

    Rows of the module-level ``df_semana_bestscores`` frame belonging to
    ``gold_standard_file`` are pivoted into an embedding-type x embedding-variant
    grid (maximum ``score_value`` per cell), sorted by the ``original-200``
    column in descending order, and returned as a LaTeX string.

    Parameters
    ----------
    gold_standard_file : str
        Dataset name; also used (lower-cased, ``_`` replaced by ``-``) in the table label.

    Returns
    -------
    str
        LaTeX source of the table.
    """
    variant_columns = [
        "original-200",
        "avgbin-200",
        "auto-128",
        "auto-256",
        "auto-512",
    ]
    label_slug = str.lower(gold_standard_file).replace("_", "-")

    dataset_rows = df_semana_bestscores[
        df_semana_bestscores["gold_standard_file"] == gold_standard_file
    ]
    score_grid = pd.pivot_table(
        dataset_rows,
        values="score_value",
        index=["embedding_type"],
        columns=["embedding_variant"],
        aggfunc="max",
    )
    ranked_grid = score_grid.sort_values(by="original-200", ascending=False)

    return ranked_grid.to_latex(
        float_format="%.3f",
        escape=False,
        index_names=False,
        label=f"tab:semana-p-at-10-{label_slug}",
        caption=f"Precision at 10 scores for best model of each embedding variant in dataset {gold_standard_file}",
        columns=variant_columns,
    )

In [50]:
gold_standard_file = "currency_entities"

# Emit the LaTeX score table for this dataset.
print(get_latex_table_semana(gold_standard_file))

# Binarized variants only (the original-200 baseline is excluded), without the
# columns that are not needed for inspection.
df_semana_bestscores.loc[
    df_semana_bestscores["gold_standard_file"].eq(gold_standard_file)
    & df_semana_bestscores["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{Precision at 10 scores for best model of each embedding variant in dataset currency_entities}
\label{tab:semana-p-at-10-currency-entities}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         0.743 &       0.642 &     0.439 &     0.589 &     0.632 \\
RDF2vec$_{SG-OA}$   &         0.637 &       0.544 &     0.441 &     0.448 &     0.533 \\
RDF2vec$_{CBOW-OA}$ &         0.582 &       0.374 &     0.195 &     0.378 &     0.322 \\
TransE-L1           &         0.504 &       0.114 &     0.073 &     0.105 &     0.273 \\
RDF2vec$_{CBOW}$    &         0.488 &       0.338 &     0.160 &     0.287 &     0.343 \\
TransR              &         0.308 &       0.231 &     0.148 &     0.173 &     0.246 \\
ComplEx             &         0.101 &       0.017 &     0.004 &     0.023 &     0.017 \\
RESCAL              &         0.079 &       0.111 &     0.056 &     0.040 &     0.035 \\
D

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
459,currency_entities,0.933025,-,accuracy,0.642327,non-rdf2vec-TransE-L2-200-avgbin,True
579,currency_entities,0.86836,-,accuracy,0.631649,non-rdf2vec-TransE-L2-512-autoencoded,True
539,currency_entities,0.86836,-,accuracy,0.589096,non-rdf2vec-TransE-L2-256-autoencoded,False
435,currency_entities,0.86836,-,accuracy,0.543883,rdf2vec-sg-oa-200-avgbin,True
555,currency_entities,0.86836,-,accuracy,0.533245,rdf2vec-sg-oa-512-autoencoded,True
515,currency_entities,0.86836,-,accuracy,0.448138,rdf2vec-sg-oa-256-autoencoded,False
475,currency_entities,0.86836,-,accuracy,0.441489,rdf2vec-sg-oa-128-autoencoded,False
499,currency_entities,0.86836,-,accuracy,0.43883,non-rdf2vec-TransE-L2-128-autoencoded,False
511,currency_entities,0.86836,-,accuracy,0.37766,rdf2vec-cbow-oa-256-autoencoded,False
431,currency_entities,0.86836,-,accuracy,0.37367,rdf2vec-cbow-oa-200-avgbin,False


In [51]:
gold_standard_file = "city_state_entities"

# Emit the LaTeX score table for this dataset.
print(get_latex_table_semana(gold_standard_file))

# Binarized variants only (the original-200 baseline is excluded), without the
# columns that are not needed for inspection.
df_semana_bestscores.loc[
    df_semana_bestscores["gold_standard_file"].eq(gold_standard_file)
    & df_semana_bestscores["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{Precision at 10 scores for best model of each embedding variant in dataset city_state_entities}
\label{tab:semana-p-at-10-city-state-entities}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
RDF2vec$_{SG-OA}$   &         0.738 &       0.603 &     0.502 &     0.455 &     0.512 \\
TransR              &         0.620 &       0.419 &     0.383 &     0.432 &     0.520 \\
TransE-L1           &         0.610 &       0.432 &     0.373 &     0.471 &     0.541 \\
RDF2vec$_{CBOW}$    &         0.596 &       0.485 &     0.246 &     0.385 &     0.474 \\
TransE-L2           &         0.590 &       0.480 &     0.281 &     0.354 &     0.387 \\
ComplEx             &         0.578 &       0.315 &     0.252 &     0.388 &     0.437 \\
RDF2vec$_{CBOW-OA}$ &         0.568 &       0.369 &     0.277 &     0.391 &     0.275 \\
DistMult            &         0.541 &       0.278 &     0.251 &     0.355 &     0.448 

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
436,city_state_entities,0.888529,-,accuracy,0.602646,rdf2vec-sg-oa-200-avgbin,False
576,city_state_entities,0.943251,-,accuracy,0.54104,non-rdf2vec-TransE-L1-512-autoencoded,True
584,city_state_entities,0.943251,-,accuracy,0.519553,non-rdf2vec-TransR-512-autoencoded,False
556,city_state_entities,0.888529,-,accuracy,0.512318,rdf2vec-sg-oa-512-autoencoded,False
476,city_state_entities,0.888529,-,accuracy,0.502281,rdf2vec-sg-oa-128-autoencoded,False
428,city_state_entities,0.888529,-,accuracy,0.484945,rdf2vec-cbow-200-avgbin,False
460,city_state_entities,0.943251,-,accuracy,0.480447,non-rdf2vec-TransE-L2-200-avgbin,False
548,city_state_entities,0.888529,-,accuracy,0.474453,rdf2vec-cbow-512-autoencoded,False
536,city_state_entities,0.943251,-,accuracy,0.471422,non-rdf2vec-TransE-L1-256-autoencoded,False
516,city_state_entities,0.888529,-,accuracy,0.454836,rdf2vec-sg-oa-256-autoencoded,False


In [52]:
gold_standard_file = "capital_country_entities"

# Emit the LaTeX score table for this dataset.
print(get_latex_table_semana(gold_standard_file))

# Binarized variants only (the original-200 baseline is excluded), without the
# columns that are not needed for inspection.
df_semana_bestscores.loc[
    df_semana_bestscores["gold_standard_file"].eq(gold_standard_file)
    & df_semana_bestscores["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{Precision at 10 scores for best model of each embedding variant in dataset capital_country_entities}
\label{tab:semana-p-at-10-capital-country-entities}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransE-L2           &         1.000 &       0.957 &     0.775 &     0.949 &     0.982 \\
TransR              &         1.000 &       0.915 &     0.889 &     0.933 &     0.962 \\
ComplEx             &         0.994 &       0.725 &     0.470 &     0.800 &     0.927 \\
TransE-L1           &         0.994 &       0.943 &     0.698 &     0.812 &     0.939 \\
DistMult            &         0.988 &       0.775 &     0.545 &     0.836 &     0.927 \\
RDF2vec$_{SG-OA}$   &         0.986 &       0.854 &     0.749 &     0.609 &     0.804 \\
RDF2vec$_{CBOW}$    &         0.909 &       0.690 &     0.346 &     0.542 &     0.727 \\
RDF2vec$_{CBOW-OA}$ &         0.907 &       0.500 &     0.342 &     0.692 & 

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
577,capital_country_entities,1.0,-,accuracy,0.982213,non-rdf2vec-TransE-L2-512-autoencoded,False
581,capital_country_entities,1.0,-,accuracy,0.962451,non-rdf2vec-TransR-512-autoencoded,False
457,capital_country_entities,1.0,-,accuracy,0.956522,non-rdf2vec-TransE-L2-200-avgbin,False
537,capital_country_entities,1.0,-,accuracy,0.948617,non-rdf2vec-TransE-L2-256-autoencoded,False
453,capital_country_entities,1.0,-,accuracy,0.942688,non-rdf2vec-TransE-L1-200-avgbin,False
573,capital_country_entities,1.0,-,accuracy,0.938735,non-rdf2vec-TransE-L1-512-autoencoded,False
541,capital_country_entities,1.0,-,accuracy,0.932806,non-rdf2vec-TransR-256-autoencoded,False
557,capital_country_entities,1.0,-,accuracy,0.926877,non-rdf2vec-ComplEx-512-autoencoded,False
561,capital_country_entities,1.0,-,accuracy,0.926877,non-rdf2vec-DistMult-512-autoencoded,False
461,capital_country_entities,1.0,-,accuracy,0.91502,non-rdf2vec-TransR-200-avgbin,False


In [53]:
gold_standard_file = "all_capital_country_entities"

# Emit the LaTeX score table for this dataset.
print(get_latex_table_semana(gold_standard_file))

# Binarized variants only (the original-200 baseline is excluded), without the
# columns that are not needed for inspection.
df_semana_bestscores.loc[
    df_semana_bestscores["gold_standard_file"].eq(gold_standard_file)
    & df_semana_bestscores["embedding_variant"].ne("original-200")
].drop(columns=[
    "model_configuration",
    "embedding_type",
    "embedding_variant",
    "pvalue_worse_than_original",
])

\begin{table}
\centering
\caption{Precision at 10 scores for best model of each embedding variant in dataset all_capital_country_entities}
\label{tab:semana-p-at-10-all-capital-country-entities}
\begin{tabular}{lrrrrr}
\toprule
{} &  original-200 &  avgbin-200 &  auto-128 &  auto-256 &  auto-512 \\
\midrule
TransR              &         0.971 &       0.914 &     0.806 &     0.925 &     0.968 \\
TransE-L2           &         0.968 &       0.919 &     0.603 &     0.800 &     0.931 \\
ComplEx             &         0.968 &       0.782 &     0.586 &     0.858 &     0.940 \\
TransE-L1           &         0.961 &       0.863 &     0.641 &     0.753 &     0.907 \\
DistMult            &         0.960 &       0.740 &     0.578 &     0.877 &     0.929 \\
RDF2vec$_{SG-OA}$   &         0.944 &       0.844 &     0.788 &     0.683 &     0.809 \\
RDF2vec$_{CBOW-OA}$ &         0.852 &       0.471 &     0.267 &     0.561 &     0.266 \\
RotatE              &         0.808 &       0.518 &     0.289 &     

Unnamed: 0,gold_standard_file,coverage,model,metric,score_value,test_name_short,not_worse_than_original
582,all_capital_country_entities,0.982759,-,accuracy,0.968061,non-rdf2vec-TransR-512-autoencoded,True
558,all_capital_country_entities,0.982759,-,accuracy,0.940396,non-rdf2vec-ComplEx-512-autoencoded,False
578,all_capital_country_entities,0.982759,-,accuracy,0.930724,non-rdf2vec-TransE-L2-512-autoencoded,False
562,all_capital_country_entities,0.982759,-,accuracy,0.929375,non-rdf2vec-DistMult-512-autoencoded,False
542,all_capital_country_entities,0.982759,-,accuracy,0.924876,non-rdf2vec-TransR-256-autoencoded,False
458,all_capital_country_entities,0.982759,-,accuracy,0.918803,non-rdf2vec-TransE-L2-200-avgbin,False
462,all_capital_country_entities,0.982759,-,accuracy,0.91408,non-rdf2vec-TransR-200-avgbin,False
574,all_capital_country_entities,0.982759,-,accuracy,0.906883,non-rdf2vec-TransE-L1-512-autoencoded,False
522,all_capital_country_entities,0.982759,-,accuracy,0.876518,non-rdf2vec-DistMult-256-autoencoded,False
454,all_capital_country_entities,0.982759,-,accuracy,0.863248,non-rdf2vec-TransE-L1-200-avgbin,False
