<a href="https://colab.research.google.com/github/vitor-faria/kgembeddings-binarization/blob/main/notebooks/get_original_vectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Download original vectors

In [1]:
# Download the classic RDF2vec CBOW (200-dim) DBpedia vectors and store them
# locally as rdf2vec-cbow-200.txt. The recorded wget log reports a ~19 GB file,
# so this cell takes several minutes and needs that much free disk space.
!wget https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/classic-rdf2vec-cbow-200/vectors.txt -O rdf2vec-cbow-200.txt

--2023-12-07 16:46:19--  https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/classic-rdf2vec-cbow-200/vectors.txt
Resolving data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)... 134.155.95.56
Connecting to data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)|134.155.95.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19999281123 (19G) [text/plain]
Saving to: ‘rdf2vec-cbow-200.txt’


2023-12-07 17:01:08 (21.5 MB/s) - ‘rdf2vec-cbow-200.txt’ saved [19999281123/19999281123]



## Load vectors as Gensim models

In [2]:
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

In [4]:
# Parse the downloaded text file into a gensim KeyedVectors object.
# no_header=True tells gensim the file starts directly with the first vector
# instead of a "<vector count> <dimensions>" declaration line.
word_vectors = KeyedVectors.load_word2vec_format('rdf2vec-cbow-200.txt', no_header=True)

## Retrieve GEval and DLCC entities to filter

In [6]:
# Clone the project repository into ./binarizer; the entity pickles read in
# the following cell are taken from binarizer/resources/entities/.
!git clone https://github.com/vitor-faria/kgembeddings-binarization.git binarizer

Cloning into 'binarizer'...
remote: Enumerating objects: 72, done.[K
remote: Counting objects: 100% (72/72), done.[K
remote: Compressing objects: 100% (63/63), done.[K
remote: Total 72 (delta 19), reused 0 (delta 0), pack-reused 0[K
Receiving objects: 100% (72/72), 10.70 MiB | 6.65 MiB/s, done.
Resolving deltas: 100% (19/19), done.


In [7]:
import pickle

# NOTE(review): pickle.load executes whatever the pickle describes, so these
# files are only safe to read because they come from the project's own repo.

# Entities used by the DLCC benchmark.
with open('binarizer/resources/entities/dlcc_entities.pickle', 'rb') as dlcc_handle:
    dlcc_entities = pickle.load(dlcc_handle)

# Entities used by the GEval benchmark.
with open('binarizer/resources/entities/geval_entities.pickle', 'rb') as geval_handle:
    geval_entities = pickle.load(geval_handle)

# Every entity that appears in at least one of the two collections; the export
# cells keep only vectors whose token is a member of this collection.
all_entities = dlcc_entities.union(geval_entities)

## Binarize using naïve approach (mean)

In [5]:
import numpy as np

# Per-dimension mean over all loaded vectors (axis 0 runs over the entities);
# it is the threshold used by the naive binarization further down.
avg_embeddings = word_vectors.vectors.mean(axis=0)
print(avg_embeddings.shape[0])
avg_embeddings

200


array([ 2.66459398e-02, -4.05602306e-02,  6.76070713e-03,  5.85619882e-02,
       -1.87901273e-01,  1.01783574e-01, -8.62669200e-02,  1.86304413e-02,
        1.09569766e-01,  3.90231051e-02,  2.62588505e-02, -4.17908281e-02,
        6.78481460e-02,  3.98457982e-02,  7.19418079e-02,  8.03387687e-02,
        3.32491170e-03,  9.18856487e-02, -2.27381196e-02, -9.93786678e-02,
        1.18481264e-01, -1.56398028e-01, -5.99309616e-02, -9.92688984e-02,
        2.32945699e-02,  2.85747480e-02,  3.04487050e-02,  9.09564346e-02,
       -5.15962169e-02,  4.55833077e-02, -2.70870719e-02, -5.22377342e-02,
        1.47228166e-01,  3.77177224e-02, -1.09447509e-01,  2.67860424e-02,
        1.48335854e-02,  3.29882801e-02,  3.74631099e-02,  2.37042550e-02,
       -3.55878025e-02,  6.52843043e-02, -2.24691462e-02,  1.09397732e-01,
        1.62295718e-02, -1.07029431e-01, -5.39515950e-02,  1.11807272e-01,
        1.71718776e-01, -8.45847577e-02,  6.49282262e-02,  2.50496678e-02,
       -1.29109919e-01, -

## Write file and save in Google Drive

In [8]:
# Export the original (real-valued) vectors of the GEval/DLCC entities, one
# "<token> <v_1> ... <v_n> " line per entity.
# The encoding is pinned to UTF-8 so that any non-ASCII characters in entity
# names are written identically regardless of the platform's default locale.
with open('rdf2vec-cbow-200-original.txt', "w", encoding='utf-8') as file:
    rows_count = 0
    for i, token in enumerate(word_vectors.index_to_key):
        if token.startswith('dbr:'):
            # Expand only the leading prefix (count=1); an unbounded replace()
            # would also rewrite any later 'dbr:' inside the entity name.
            token = token.replace('dbr:', 'http://dbpedia.org/resource/', 1)
        if token in all_entities:
            vector_string = ' '.join([str(x) for x in word_vectors.vectors[i].tolist()])
            # The trailing space before the newline is kept on purpose so the
            # file format stays identical to previously exported files.
            file.write(f'{token} {vector_string} \n')
            rows_count += 1

# Number of entities written (displayed as the cell output).
rows_count

393595

In [10]:
# Copy the exported original vectors to the thesis folder on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this notebook, so confirm before running on a fresh runtime.
!cp rdf2vec-cbow-200-original.txt "/content/drive/MyDrive/Master/Mannheim/Master-Thesis/Embeddings"

In [9]:
# Naive binarization: a component becomes True when it is greater than or
# equal to the mean of its dimension (avg_embeddings), False otherwise.
bin_model_vectors = np.greater_equal(word_vectors.vectors, avg_embeddings)

# Export the binarized vectors of the GEval/DLCC entities, one
# "<token> <b_1> ... <b_n> " line per entity with each b_i written as 0 or 1.
# The encoding is pinned to UTF-8 so that any non-ASCII characters in entity
# names are written identically regardless of the platform's default locale.
with open('rdf2vec-cbow-200-avgbin.txt', "w", encoding='utf-8') as file:
    rows_count = 0
    for i, token in enumerate(word_vectors.index_to_key):
        if token.startswith('dbr:'):
            # Expand only the leading prefix (count=1); an unbounded replace()
            # would also rewrite any later 'dbr:' inside the entity name.
            token = token.replace('dbr:', 'http://dbpedia.org/resource/', 1)
        if token in all_entities:
            # Multiplying the boolean row by 1 turns True/False into 1/0.
            bin_vector_string = ' '.join([str(x) for x in (bin_model_vectors[i]*1).tolist()])
            file.write(f'{token} {bin_vector_string} \n')
            rows_count += 1

# Number of entities written (displayed as the cell output).
rows_count

393595

In [11]:
# Copy the exported binarized vectors to the thesis folder on Google Drive.
# NOTE(review): assumes Drive is already mounted at /content/drive; no mount
# call is visible in this notebook, so confirm before running on a fresh runtime.
!cp rdf2vec-cbow-200-avgbin.txt "/content/drive/MyDrive/Master/Mannheim/Master-Thesis/Embeddings"

## Delete variables and temp files

In [12]:
# Drop the references to the loaded model, the per-dimension means and the
# boolean matrix so the kernel can reclaim their memory.
del word_vectors, avg_embeddings, bin_model_vectors

In [14]:
!rm rdf2vec-cbow-200.txt
!rm rdf2vec-cbow-200-original.txt
!rm rdf2vec-cbow-200-avgbin.txt

## Generic code to be used for all source vectors

In [22]:
name = 'rdf2vec-cbow-oa-200'

if not os.path.isfile(f'{name}.txt'):
    !wget {source} -O {name}.txt

try:
    word_vectors = KeyedVectors.load_word2vec_format(
        f'{name}.txt',
        no_header=True,
    )
except:
    word_vectors = KeyedVectors.load_word2vec_format(
        f'{name}.txt',
        no_header=False,
        unicode_errors='ignore',
    )

print('number of entities:', len(word_vectors.index_to_key))
avg_embeddings = np.mean(word_vectors.vectors, axis=0)
print('number of dimensions:', len(avg_embeddings))

bin_model_vectors = np.greater_equal(word_vectors.vectors, avg_embeddings)

with open(f'{name}-avgbin.txt', "w") as file:
    rows_count = 0
    for i in range(len(word_vectors.index_to_key)):
        token = str(word_vectors.index_to_key[i])
        if token.startswith('dbr:'):
            token = token.replace('dbr:', 'http://dbpedia.org/resource/')
        if token in all_entities:
            bin_vector_string = ' '.join([str(x) for x in (bin_model_vectors[i]*1).tolist()])
            file.write(f'{token} {bin_vector_string} \n')
            rows_count += 1

print(f'{name}-avgbin.txt has {rows_count} entities')

!cp {name}-avgbin.txt "/content/drive/MyDrive/Master/Mannheim/Master-Thesis/Embeddings"
del bin_model_vectors
!rm {name}-avgbin.txt

with open(f'{name}-original.txt', "w") as file:
    rows_count = 0
    for i in range(len(word_vectors.index_to_key)):
        token = str(word_vectors.index_to_key[i])
        if token.startswith('dbr:'):
            token = token.replace('dbr:', 'http://dbpedia.org/resource/')
        if token in all_entities:
            vector_string = ' '.join([str(x) for x in word_vectors.vectors[i].tolist()])
            file.write(f'{token} {vector_string} \n')
            rows_count += 1

print(f'{name}-original.txt has {rows_count} entities')
!cp {name}-original.txt "/content/drive/MyDrive/Master/Mannheim/Master-Thesis/Embeddings"
!rm {name}.txt
del word_vectors
del avg_embeddings
!rm {name}-original.txt

rdf2vec-cbow-oa-200-avgbin.txt has 370369 entities
rdf2vec-cbow-oa-200-original.txt has 370369 entities


## For loop to repeat for all embeddings

In [23]:
# Maps the local base name of each embedding (used for "<name>.txt",
# "<name>-avgbin.txt" and "<name>-original.txt") to the URL its vectors are
# downloaded from.
# NOTE(review): the first two entries are commented out, presumably because
# those embeddings were already processed by the cells above — confirm before
# re-enabling them.
embeddings_source_files = {
    # "rdf2vec-cbow-200": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/classic-rdf2vec-cbow-200/vectors.txt",
    # "rdf2vec-cbow-oa-200": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/classic-rdf2vec-cbow-oa-200/cwindow200_classic.txt",
    "rdf2vec-sg-200": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/classic-rdf2vec-sg-200/vectors.txt",
    "rdf2vec-sg-oa-200": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/classic-rdf2vec-sg-oa-200/sgpos200_classic.txt",
    "non-rdf2vec-ComplEx": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_dbpedia_ComplEx.txt",
    "non-rdf2vec-DistMult": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_dbpedia_DistMult.txt",
    "non-rdf2vec-RESCAL": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_dbpedia_RESCAL.txt",
    "non-rdf2vec-RotatE": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_dbpedia_RotatE.txt",
    "non-rdf2vec-TransE-L1": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_dbpedia_TransE-L1.txt",
    "non-rdf2vec-TransE-L2": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_dbpedia_TransE-L2.txt",
    "non-rdf2vec-TransR": "https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_dbpedia_TransR.txt",
}

In [24]:
import os.path


for name, source in embeddings_source_files.items():
    if not os.path.isfile(f'{name}.txt'):
        !wget {source} -O {name}.txt

    try:
        word_vectors = KeyedVectors.load_word2vec_format(
            f'{name}.txt',
            no_header=True,
        )
    except:
        word_vectors = KeyedVectors.load_word2vec_format(
            f'{name}.txt',
            no_header=False,
            unicode_errors='ignore',
        )

    print('number of entities:', len(word_vectors.index_to_key))
    avg_embeddings = np.mean(word_vectors.vectors, axis=0)
    print('number of dimensions:', len(avg_embeddings))

    bin_model_vectors = np.greater_equal(word_vectors.vectors, avg_embeddings)

    with open(f'{name}-avgbin.txt', "w") as file:
        rows_count = 0
        for i in range(len(word_vectors.index_to_key)):
            token = str(word_vectors.index_to_key[i])
            if token.startswith('dbr:'):
                token = token.replace('dbr:', 'http://dbpedia.org/resource/')
            if token in all_entities:
                bin_vector_string = ' '.join([str(x) for x in (bin_model_vectors[i]*1).tolist()])
                file.write(f'{token} {bin_vector_string} \n')
                rows_count += 1

    print(f'{name}-avgbin.txt has {rows_count} entities')

    !cp {name}-avgbin.txt "/content/drive/MyDrive/Master/Mannheim/Master-Thesis/Embeddings"
    del bin_model_vectors
    !rm {name}-avgbin.txt

    with open(f'{name}-original.txt', "w") as file:
        rows_count = 0
        for i in range(len(word_vectors.index_to_key)):
            token = str(word_vectors.index_to_key[i])
            if token.startswith('dbr:'):
                token = token.replace('dbr:', 'http://dbpedia.org/resource/')
            if token in all_entities:
                vector_string = ' '.join([str(x) for x in word_vectors.vectors[i].tolist()])
                file.write(f'{token} {vector_string} \n')
                rows_count += 1

    print(f'{name}-original.txt has {rows_count} entities')
    !cp {name}-original.txt "/content/drive/MyDrive/Master/Mannheim/Master-Thesis/Embeddings"
    !rm {name}.txt
    !rm {name}-original.txt
    del word_vectors
    del avg_embeddings


--2023-12-07 19:05:09--  https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/classic-rdf2vec-sg-200/vectors.txt
Resolving data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)... 134.155.95.56
Connecting to data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)|134.155.95.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19551833216 (18G) [text/plain]
Saving to: ‘rdf2vec-sg-200.txt’


2023-12-07 19:19:26 (21.8 MB/s) - ‘rdf2vec-sg-200.txt’ saved [19551833216/19551833216]

number of entities: 7954946
number of dimensions: 200
rdf2vec-sg-200-avgbin.txt has 380581 entities
rdf2vec-sg-200-original.txt has 380581 entities
--2023-12-07 19:46:16--  https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/classic-rdf2vec-sg-oa-200/sgpos200_classic.txt
Resolving data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)... 134.155.95.56
Connecting to data.dws.informatik.uni-mannheim.de (



number of entities: 8145384
number of dimensions: 200
rdf2vec-sg-oa-200-avgbin.txt has 370369 entities
rdf2vec-sg-oa-200-original.txt has 370369 entities
--2023-12-07 20:24:03--  https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_dbpedia_ComplEx.txt
Resolving data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)... 134.155.95.56
Connecting to data.dws.informatik.uni-mannheim.de (data.dws.informatik.uni-mannheim.de)|134.155.95.56|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19847092817 (18G) [text/plain]
Saving to: ‘non-rdf2vec-ComplEx.txt’


2023-12-07 20:38:32 (21.8 MB/s) - ‘non-rdf2vec-ComplEx.txt’ saved [19847092817/19847092817]

number of entities: 8499982
number of dimensions: 200
non-rdf2vec-ComplEx-avgbin.txt has 393699 entities
non-rdf2vec-ComplEx-original.txt has 393699 entities
--2023-12-07 21:06:42--  https://data.dws.informatik.uni-mannheim.de/kgvec2go/dbpedia/2021-09/non-rdf2vec/vectors_