In [None]:
import torch
import pandas as pd
import numpy as np
import requests
from functions import *
import gensim
import gensim.downloader as api
from pykeen.datasets import FB15k237, WN18RR

In [None]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

### 0 Load and prepare dataset

In [None]:
# Benchmark whose entity/relation ids drive the rest of the notebook.
# Both WN18RR and FB15k237 are imported above; pick the one matching the
# section (2 or 3) you intend to run.
dataset = WN18RR()  # or FB15k237()
dataset

In [None]:
# Keep the dataset's entity-to-id and relation-to-id mappings in their own names
ent_to_id = dataset.entity_to_id
rel_to_id = dataset.relation_to_id

# Keys of each mapping, in the mapping's own iteration order
ent_id_list = list(ent_to_id)
rel_id_list = list(rel_to_id)

### 1 Load pretrained Word2Vec

In [None]:
# Load the pretrained 'word2vec-google-news-300' model through gensim's downloader
# (imported above as `api`), then keep its index_to_key attribute as the vocabulary.
w2v_vectors = api.load('word2vec-google-news-300')
w2v_vocab = w2v_vectors.index_to_key

### 2 FB15k-237

#### 2.1 Load entity and relation definitions

In [None]:
# Entity id -> short label (tab-separated, no header row in the file)
df_entity2text = pd.read_csv(
    'data/fb15k237/fb15k237_entity2text.txt',
    delimiter="\t",
    header=None,
    names=["id", "entity"],
)

# Entity id -> long description, same layout as the label file
df_entity2textlong = pd.read_csv(
    'data/fb15k237/fb15k237_entity2textlong.txt',
    delimiter="\t",
    header=None,
    names=["id", "entity"],
)

# Relation id -> textual definition
df_rel2text = pd.read_csv(
    'data/fb15k237/fb15k237_relation2text.txt',
    delimiter="\t",
    header=None,
    names=["id", "definition"],
)

#### 2.2 Preprocessing

In [None]:
# Extra entity labels; the file's own header row is replaced by the names given here.
extra_ent = pd.read_csv(
    'fb15k237_extralabels_ent.csv',
    delimiter=",",
    header=0,
    names=["id", "entity"],
)

In [None]:
# Where `extra_ent` has a label for an id, use it; ids without one keep their current label.
df_entity2text['entity'] = (
    df_entity2text['id']
    .map(extra_ent.set_index('id')['entity'])
    .fillna(df_entity2text['entity'])
)

# Clean the labels with the project helper, then split them on single spaces.
df_entity2text["entity_cleaned"] = remove_characters(df_entity2text["entity"])
df_entity2text["tokenized_entities"] = df_entity2text["entity_cleaned"].str.split(' ')

df_entity2text.head(3)

In [None]:
# Replace every literal dot in the relation definition with a space.
# regex=False is spelled out because "." is a regex metacharacter and the default
# value of `regex` differs between pandas versions; the literal replacement is
# what is intended here.
df_rel2text["cleaned_relation"] = df_rel2text["definition"].str.replace('.', ' ', regex=False)

# Split on single spaces (consecutive spaces therefore yield empty tokens).
df_rel2text["tokenized_relations"] = df_rel2text["cleaned_relation"].str.split(' ')

df_rel2text.head(3)

In [None]:
# Pull the token columns out of both frames for the embedding step below.
tok_entities, tok_relations = (
    df_entity2text["tokenized_entities"],
    df_rel2text["tokenized_relations"],
)

#### 2.3 Generate Word2Vec embeddings

In [None]:
# Convert the token Series to plain Python lists in row order.
# Series.tolist() is positional, so it does not depend on the index being
# exactly 0..n-1 (which the previous per-label lookup `series[i]` required)
# and avoids one Series lookup per element.
entities = tok_entities.tolist()
relations = tok_relations.tolist()

In [None]:
# Look up Word2Vec vectors for the tokenized relations and entities.
# generate_w2v_embeddings is not defined in this notebook (presumably provided by the
# star import from `functions`); each call returns a pair.
# NOTE(review): the second element looks like the inputs that had no vocabulary
# match — confirm against the helper's implementation.
rel_embeddings, exclude_rel = generate_w2v_embeddings(relations, w2v_vocab, w2v_vectors)
ent_embeddings, exclude_ent = generate_w2v_embeddings(entities, w2v_vocab, w2v_vectors)

##### Compute average Word2Vec embeddings for relations and entities

In [None]:
# Reduce the embeddings of each relation / entity with avg_w2v_embeddings
# (external helper; presumably averages the token vectors per item — TODO confirm).
avg_rel = avg_w2v_embeddings(rel_embeddings)
avg_ent = avg_w2v_embeddings(ent_embeddings)

##### Sort embeddings to match order of entities and relations of pykeen datasets

In [None]:
# Reorder the averaged embeddings with embedding_mapping (external helper), passing the
# text frame, its id column, the dataset's id list and the averaged embeddings.
# NOTE(review): rel_id_list / ent_id_list are taken from `dataset`, which is set to
# WN18RR() near the top of the notebook; for this FB15k-237 section the ids will
# presumably only line up when `dataset` is FB15k237() — confirm before running.
rel_embeddings_sorted = embedding_mapping(df_rel2text, df_rel2text["id"], rel_id_list, avg_rel)
ent_embeddings_sorted = embedding_mapping(df_entity2text, df_entity2text["id"], ent_id_list, avg_ent)

### 3 WN18RR

#### 3.1 Load entity and relation definitions

In [None]:
# WN18RR entity file: tab-separated, no header row, columns are id and definition text.
df_entity2text = pd.read_csv(
    '00_data/wn18rr/wn18rr_entity2text.txt',
    delimiter="\t",
    header=None,
    names=["id", "definition"],
)

# Everything before the first comma becomes the label, the remainder the description.
df_entity2text[["entity", "description"]] = (
    df_entity2text["definition"].str.split(',', n=1, expand=True)
)

# Ids as strings, left-padded with zeros to 8 characters.
df_entity2text["id"] = df_entity2text["id"].astype(str).str.rjust(8, '0')

df_entity2text.head(3)

In [None]:
# Turn underscores into spaces and drop leading whitespace in every relation name.
# Slice assignment updates the existing list object in place.
rel_id_list[:] = [name.replace("_", " ").lstrip() for name in rel_id_list]
rel_id_list[:3]

#### 3.2 Preprocessing

In [None]:
# Load WN18RR extra labels for entities that do not have a match with pretrained Word2Vec embeddings 
wn18rr_extra_ent = pd.read_csv('wn18rr_extralabels_ent.csv', delimiter=",", header = 0, names=["id", "entity"])
wn18rr_extra_ent.id = wn18rr_extra_ent.id.astype(str)
wn18rr_extra_ent["id"] = wn18rr_extra_ent["id"].str.rjust(8, '0')
wn18rr_extra_ent[:2]

In [None]:
# Use the extra label wherever `wn18rr_extra_ent` has one for the id; keep the existing label otherwise.
df_entity2text['entity'] = (
    df_entity2text['id']
    .map(wn18rr_extra_ent.set_index('id')['entity'])
    .fillna(df_entity2text['entity'])
)

In [None]:
# Clean the entity labels with remove_characters (external helper, not defined in this
# notebook), then split the cleaned text on single spaces into token lists.
df_entity2text["entity_cleaned"]=remove_characters(df_entity2text["entity"])
df_entity2text["tokenized_entities"] = df_entity2text["entity_cleaned"].str.split(' ')

In [None]:
# Token lists of the WN18RR entities; rebinds the name also used in the FB15k-237 section.
tok_entities = df_entity2text["tokenized_entities"]

#### 3.3 Generate Word2Vec embeddings

In [None]:
# Entity tokens as a plain list in row order. Series.tolist() is positional, so it
# does not rely on the index being exactly 0..n-1 as the per-label lookup did.
entities = tok_entities.tolist()

# Relation tokens come straight from the cleaned relation names, split on whitespace;
# the result is therefore in the same order as rel_id_list.
relations = [name.split() for name in rel_id_list]

In [None]:
# Hand-written token overrides for individual relations, addressed by position.
# `relations` is built from rel_id_list, so these indices depend on the order in which
# the dataset lists its relations.
# NOTE(review): presumably these swap WordNet terms that Word2Vec does not know for
# words it does — confirm, and re-check the indices if the relation order ever changes.
relations[3] = ['superordinate']
# Only the second token of relation 4 is replaced; its first token is kept.
relations[4][1] = 'superordinate'
relations[5] = ['member', "part", "something"] 
relations[9]= [ 'set', 'synonym', 'domain', 'topic', 'of']

In [None]:
# Word2Vec lookup for the WN18RR entity tokens via the external helper; returns a pair.
# NOTE(review): second element is presumably the entities without a vocabulary match — confirm.
ent_embeddings, ent_excluded = generate_w2v_embeddings(entities, w2v_vocab, w2v_vectors)

In [None]:
# Word2Vec lookup for the WN18RR relation tokens (after the manual overrides); returns a pair.
# NOTE(review): second element is presumably the relations without a vocabulary match — confirm.
rel_embeddings, rel_excluded = generate_w2v_embeddings(relations, w2v_vocab, w2v_vectors)

In [None]:
# Reduce the WN18RR relation / entity embeddings with avg_w2v_embeddings
# (external helper; presumably averages the token vectors per item — TODO confirm).
avg_rel = avg_w2v_embeddings(rel_embeddings)
avg_ent = avg_w2v_embeddings(ent_embeddings)

In [None]:
# The WN18RR section never loads a relation text file: the relation tokens are derived
# directly from rel_id_list. The previous code passed `df_rel2text`, which only exists
# if the FB15k-237 section was run and then holds FB15k-237 relation ids. Build the
# relation id frame from rel_id_list instead, so the ids passed to embedding_mapping
# belong to this dataset and the cell works on a fresh kernel.
# NOTE(review): assumes embedding_mapping only needs the frame's row order and the id
# column — confirm against its implementation in `functions`.
wn18rr_rel2text = pd.DataFrame({"id": rel_id_list})
rel_embeddings_sorted = embedding_mapping(wn18rr_rel2text, wn18rr_rel2text["id"], rel_id_list, avg_rel)

# Entities: reorder the averaged embeddings using the WN18RR entity frame and the dataset's entity ids.
ent_embeddings_sorted = embedding_mapping(df_entity2text, df_entity2text["id"], ent_id_list, avg_ent)