In [1]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import requests
from functions import *
import gensim
import nltk
import gensim.downloader as api
from pykeen.datasets import FB15k237, WN18RR

In [2]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

## 0 Load and prepare dataset

In [27]:
# Select the benchmark knowledge graph used by the rest of the notebook.
# Swap in WN18RR() before running the WN18RR cells (section 2), which take
# their relation names from this dataset object.
dataset = FB15k237()  #or WN18RR()
dataset

You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out


FB15k237(num_entities=14505, num_relations=237, create_inverse_triples=False)

In [28]:
# Keep pykeen's entity-to-id and relation-to-id dictionaries at hand
ent_to_id, rel_to_id = dataset.entity_to_id, dataset.relation_to_id

# Dictionary keys as plain lists, in the dictionaries' iteration order
ent_id_list = [*dataset.entity_to_id]
rel_id_list = [*dataset.relation_to_id]

### 1 FB15k-237

#### Load entity and relation definitions

In [29]:
def _read_tsv(path, columns):
    """Read a header-less, tab-separated text file into a DataFrame with the given column names."""
    return pd.read_csv(path, delimiter="\t", header=None, names=columns)


# Entity texts: short labels and long descriptions
df_entity2text = _read_tsv('data/fb15k237/fb15k237_entity2text.txt', ["id", "entity"])
df_entity2textlong = _read_tsv('data/fb15k237/fb15k237_entity2textlong.txt', ["id", "entity"])

# Relation texts
df_rel2text = _read_tsv('data/fb15k237/fb15k237_relation2text.txt', ["id", "definition"])

In [32]:
# Preview the relation table (columns: id, definition)
df_rel2text

Unnamed: 0,id,definition
0,/soccer/football_team/current_roster./soccer/f...,soccer football team current roster. soccer fo...
1,/music/artist/origin,music artist origin
2,/ice_hockey/hockey_team/current_roster./sports...,ice hockey hockey team current roster. sports ...
3,/food/food/nutrients./food/nutrition_fact/nutr...,food food nutrients. food nutrition fact nutrient
4,/film/actor/film./film/performance/film,film actor film. film performance film
...,...,...
232,/base/biblioness/bibs_location/country,base biblioness bibs location country
233,/user/ktrueman/default_domain/international_or...,user ktrueman default domain international org...
234,/music/performance_role/track_performances./mu...,music performance role track performances. mus...
235,/olympics/olympic_games/medals_awarded./olympi...,olympics olympic games medals awarded. olympic...


#### 1.2 Preprocessing of dataset for BERT Embeddings

In [36]:
# Texts handed to the language model. Change the selected DataFrame / column
# here to embed a different text field for entities or relations.
ent_list = df_entity2text["entity"].tolist()
rel_list = df_rel2text["definition"].tolist()

### 2 WN18RR

#### Load entity and relation definitions
→ Separate the entity name from the entity description into 2 different columns 
→ Turn "id" into an 8-digit string by left-padding it with zeros. This is necessary to match the ids coming from PyKEEN's triples factory  

In [5]:
# Read the WN18RR entity file: one id and one "label, description" text per row
df_entity2text = pd.read_csv(
    'data/wn18rr/wn18rr_entity2text.txt',
    sep="\t",
    header=None,
    names=["id", "definition"],
)

# Split at the first comma only: the part before it is the label, the rest the description
df_entity2text[["entity", "description"]] = (
    df_entity2text["definition"].str.split(',', n=1, expand=True)
)

# Convert ids to strings and left-pad them with zeros to 8 characters
df_entity2text["id"] = df_entity2text["id"].astype(str).str.rjust(8, '0')

df_entity2text.head(3)

Unnamed: 0,id,definition,entity,description
0,14854262,"stool, solid excretory product evacuated from ...",stool,solid excretory product evacuated from the bo...
1,590383,"chieftainship, the position of chieftain",chieftainship,the position of chieftain
2,8769179,"saxony, an area in Germany around the upper El...",saxony,an area in Germany around the upper Elbe rive...


In [6]:
# Make the relation names readable, updating the list in place:
# underscores become spaces, then any leading whitespace is dropped.
rel_id_list[:] = [name.replace("_", " ").lstrip() for name in rel_id_list]
rel_id_list[:3]

['also see', 'derivationally related form', 'has part']

#### 2.1 Specify input for BERT model
→ entities: select one of the options based on the desired BERT embedding strategy: labels only vs. labels & descriptions
→ relations: are already saved as a list

In [7]:
# Entity input for BERT: labels only
ent_list = df_entity2text["entity"].to_list()  # to produce BERT embeddings for entity labels only
# Alternative input: entity labels and descriptions.
# NOTE(review): no visible cell creates a "tokenized_entities" column — confirm before enabling.
#tok_entities = df_entity2text["tokenized_entities"].to_list() # to produce BERT embedding for entity labels and descriptions
# Relation input for BERT: the relation names taken from the dataset (cleaned up in the cell above)
rel_list = rel_id_list

### 2. Generate BERT embeddings 

##### 2.1 Set parameters

In [9]:
# Output location for the per-batch entity embeddings. Both names are passed to
# batch_embedd(...) further down, so they have to be defined before that cell runs
# (leaving them commented out raises a NameError on a fresh kernel).
# The active values match the directory and file names that the averaging section
# loads from (e.g. '01_bert_4lastlayers_wn18rr_ent.pt').
# NOTE(review): presumably batch_embedd prepends a batch number to `filename` —
# confirm in the helper module.
output_path = '03_nlm_embeddings/bert_wn18rr/all_hiddenlayers/'
filename = '_bert_4lastlayers_wn18rr_ent.pt'

# Example values for an FB15k-237 run:
#output_path = '03_nlm_embeddings/bert_fb15k237/4LL_extended/'
#filename = '_bert4LL_extended_fb15k237_ent.pt'

# Load the BERT tokenizer and model for the selected device
tokenizer, model = load_bert(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


##### Tokenize entities and relations

In [39]:
# Tokenize the entity texts; the result is indexed by "input_ids" and
# "attention_mask" when the dataloader is built below.
tokenized_ent = tokenize(tokenizer, ent_list)

In [10]:
# Inspect the tokenizer output (input_ids, token_type_ids, attention_mask)
tokenized_ent

{'input_ids': tensor([[  101, 14708,   102,  ...,     0,     0,     0],
        [  101, 26625,  9650,  ...,     0,     0,     0],
        [  101, 13019,   102,  ...,     0,     0,     0],
        ...,
        [  101,  7166,  2239,  ...,     0,     0,     0],
        [  101,  3562, 20934,  ...,     0,     0,     0],
        [  101,  3502,   102,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

##### Load entities in batches and feed to BERT LM

In [41]:
# Wrap the token ids and attention masks in a customdata object so they can be batched
cd = customdata(tokenized_ent["input_ids"], tokenized_ent["attention_mask"])
# Batches of 50 samples; shuffle = False keeps the samples in their original order
# NOTE(review): the later embedding-to-entity mapping presumably relies on that order — confirm.
dataloader = torch.utils.data.DataLoader(cd, batch_size = 50, shuffle = False, num_workers = 0)

In [12]:
# Use function to generate BERT embeddings for entities, batch by batch.
# `output_path` and `filename` must be defined before this cell runs (see the
# parameter cell in "Set parameters").
# NOTE(review): presumably writes one file per batch to `output_path` — confirm in the helper module.
batch_embedd(dataloader, model, output_path, filename, device)



Memory Usage in Iteration: 1 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 2 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 3 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 4 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 5 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 6 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 7 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 8 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 9 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 10 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 11 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 12 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 13 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 14 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 15

##### Generate word embeddings for relations

In [30]:
# Tokenize the relation texts with the same tokenizer as the entities
tokenized_rel = tokenize(tokenizer, rel_list)

In [42]:
# Relations are embedded in a single call (no batching / dataloader)
output = generate_bert_embeddings(tokenized_rel, model)
# The result only stays in memory unless the line below is enabled with a real file name.
# NOTE(review): the "Relations" averaging cell loads a saved relation file from disk — confirm it is written here.
#torch.save(output, "nlm_embeddings/bert___")

##### 2.2 Compute average BERT embeddings 

##### Entities

In [14]:
# Directory holding the saved embedding files that are loaded in the next cell
path = "03_nlm_embeddings/bert_wn18rr/all_hiddenlayers"

In [15]:
# Load entity embeddings: these are saved in batches in separate files → output shape (nr_batches, 4, nr_samples_batch, nr_tokens, bert_dimension)
# NOTE(review): the directory listing printed by this cell also contains '.ipynb_checkpoints' and the
# relation file '00_bert_4lastlayers_wn18rr_rel.pt' — confirm that load_ent_embeddings skips them.
ent_embedd_raw = load_ent_embeddings(path, device)

# Concatenate embeddings to remove batch separation → output shape (4, nr_samples, nr_tokens, bert_dimension) 
concat_ent_embedd = concat_ent_embeddings(ent_embedd_raw)

# Average the 4 last hidden layers for CLS token → output shape (nr_samples, bert_dimension)
avg_ent_embedd = avg_bert4LL_embeddings(concat_ent_embedd)

# Sort entity embeddings to match the dataset sequence of pykeen
# (ent_id_list holds the keys of dataset.entity_to_id; df_entity2text['id'] holds the ids from the text file)
sorted_ent_embedd = embedding_mapping(df_entity2text, df_entity2text['id'], ent_id_list, avg_ent_embedd)

['.ipynb_checkpoints', '00_bert_4lastlayers_wn18rr_rel.pt', '01_bert_4lastlayers_wn18rr_ent.pt', '02_bert_4lastlayers_wn18rr_ent.pt', '03_bert_4lastlayers_wn18rr_ent.pt', '04_bert_4lastlayers_wn18rr_ent.pt', '05_bert_4lastlayers_wn18rr_ent.pt', '06_bert_4lastlayers_wn18rr_ent.pt', '07_bert_4lastlayers_wn18rr_ent.pt', '08_bert_4lastlayers_wn18rr_ent.pt', '09_bert_4lastlayers_wn18rr_ent.pt', '10_bert_4lastlayers_wn18rr_ent.pt', '11_bert_4lastlayers_wn18rr_ent.pt', '12_bert_4lastlayers_wn18rr_ent.pt', '13_bert_4lastlayers_wn18rr_ent.pt', '14_bert_4lastlayers_wn18rr_ent.pt', '15_bert_4lastlayers_wn18rr_ent.pt', '16_bert_4lastlayers_wn18rr_ent.pt', '17_bert_4lastlayers_wn18rr_ent.pt']


In [None]:
# Save BERT entity embeddings e.g. "fb15k237_bert_ent_embeddings_sorted.pt"
#torch.save(sorted_ent_embedd, "03_nlm_embeddings/bert____")

In [16]:
# Sanity check: one row per entity, one column per BERT dimension
sorted_ent_embedd.shape

torch.Size([40559, 768])

##### Relations

In [20]:
# Load relation embeddings onto the selected device
rel_embedd_raw = torch.load("03_nlm_embeddings/bert_wn18rr/all_hiddenlayers/00_bert_4lastlayers_wn18rr_rel.pt",  map_location = torch.device(device))

# Average the 4 last hidden layers for CLS token → output shape (nr_samples, bert_dimension)
avg_rel_embedd = avg_bert4LL_embeddings(rel_embedd_raw)

# For WN18RR, relations are already sorted as they are extracted directly from pykeen's datasets
# For FB15k-237, embedding mapping needs to be done
# NOTE(review): df_rel2text is only loaded in the FB15k-237 section, while the file loaded above is the
# WN18RR relation file — confirm this mapping step is intended for a WN18RR run.
sorted_rel_embedd = embedding_mapping(df_rel2text, df_rel2text['id'], rel_id_list, avg_rel_embedd)

In [25]:
# Sanity check: one row per relation, one column per BERT dimension
sorted_rel_embedd.shape

torch.Size([11, 768])

In [55]:
# Save BERT relation embeddings e.g. "fb15k237_bert_rel_embeddings_sorted.pt"
#torch.save(avg_rel_embedd, "03_nlm_embeddings/bert____")