In [2]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import requests
from functions import *

In [3]:
import pykeen

In [4]:
use_cuda = torch.cuda.is_available()
use_cuda

True

#### Dataset exploring

##### FB15k-237

In [5]:
from pykeen.datasets import FB15k237

In [30]:
data = FB15k237()
data

You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out


FB15k237(num_entities=14505, num_relations=237, create_inverse_triples=False)

In [7]:
# Store the entity-to-id and relation-to-id relationship in separate dictionaries
ent_to_id = data.entity_to_id
rel_to_id = data.relation_to_id

In [11]:
df_entity2text = pd.read_csv('data/fb15k237_entity2text.txt', delimiter="\t", header = None, names=["id", "entity"])
df_entity2text[:5]

Unnamed: 0,id,entity
0,/m/06rf7,Schleswig-Holstein
1,/m/0c94fn,Gary Rydstrom
2,/m/016ywr,Jeremy Irons
3,/m/01yjl,Chicago Cubs
4,/m/02hrh1q,Actor-GB


In [16]:
df_entity2text.shape

(14951, 2)

#### Generate BERT embeddings 

In [10]:
# Load the BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)
model = model.to("cuda:0")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
df_entity2text = pd.read_csv('data/wn18rr_entity2text.txt', delimiter="\t", header = None, names=["id", "definition"])
df_entity2text[["entity", "description"]] = df_entity2text["definition"].str.split(',', n=1, expand=True)
df_entity2text[:5]

In [12]:
# Tokenize the KG entities
input_to_tokens = tokenizer(df_entity2text["entity"].to_list(), padding=True, return_tensors='pt')['input_ids']
input_to_tokens.device

device(type='cpu')

In [19]:
input_to_tokens

tensor([[  101, 21173,  1011,  ...,     0,     0,     0],
        [  101,  5639, 29431,  ...,     0,     0,     0],
        [  101,  7441,  3707,  ...,     0,     0,     0],
        ...,
        [  101,  5696,  7842,  ...,     0,     0,     0],
        [  101, 13081,  2118,  ...,     0,     0,     0],
        [  101, 12300,   102,  ...,     0,     0,     0]])

In [23]:
tokenizer.convert_ids_to_tokens(input_to_tokens[3])

['[CLS]',
 'chicago',
 'cubs',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [24]:
cd = customdata(input_to_tokens)
dataloader = torch.utils.data.DataLoader(cd, batch_size = 2500, shuffle = False, num_workers = 0)

#### Generate BERT embeddings for entities in batches 

In [25]:
iter = 1
embeddings = []
for data in dataloader:
    data = data.to("cuda:0")
    output = generate_bert_embeddings(data,model)
    output = torch.stack(output, 0)# output is originally tuple → stack BERT hidden layers (13) tensors into 1 tensor    

    if iter < 10:
        batch_nr = "0" + str(iter)
    else:
        batch_nr = str(iter)
    torch.save(output, "nlm_embeddings/bert_fb15k237/" + batch_nr + "_bert_4lastlayers_fb15k237_ent.pt")
    #embeddings.append(output)
    
    
    
    #Additional Info when using cuda
    print('Memory Usage in Iteration:', iter,
          '| Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB |', 
          'Cached: ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')
    
    iter = iter + 1
    del data
    del output
    torch.cuda.empty_cache()



Memory Usage in Iteration: 1 | Allocated: 29.1 GB | Cached:  29.7 GB
Memory Usage in Iteration: 2 | Allocated: 29.1 GB | Cached:  29.7 GB
Memory Usage in Iteration: 3 | Allocated: 29.1 GB | Cached:  29.7 GB
Memory Usage in Iteration: 4 | Allocated: 29.1 GB | Cached:  29.7 GB
Memory Usage in Iteration: 5 | Allocated: 29.1 GB | Cached:  29.7 GB
Memory Usage in Iteration: 6 | Allocated: 28.6 GB | Cached:  29.0 GB


In [28]:
torch.cuda.empty_cache()

In [27]:
print('| Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB |', 
          'Cached: ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

| Allocated: 0.4 GB | Cached:  0.5 GB


In [None]:
#torch.save(embeddings,'/nlm_embeddings/bert_wn18rr/bert_4lastlayers_wn18rr_ent.pt')

#### Generate BERT embeddings for relations

In [31]:
#from pykeen.datasets import WN18RR
#dataset = WN18RR()
rel_to_list = list(data.relation_to_id)
len(rel_to_list)

237

In [32]:
rel_to_list

['/american_football/football_team/current_roster./sports/sports_team_roster/position',
 '/award/award_category/category_of',
 '/award/award_category/disciplines_or_subjects',
 '/award/award_category/nominees./award/award_nomination/nominated_for',
 '/award/award_category/winners./award/award_honor/award_winner',
 '/award/award_category/winners./award/award_honor/ceremony',
 '/award/award_ceremony/awards_presented./award/award_honor/award_winner',
 '/award/award_ceremony/awards_presented./award/award_honor/honored_for',
 '/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for',
 '/award/award_nominee/award_nominations./award/award_nomination/award',
 '/award/award_nominee/award_nominations./award/award_nomination/award_nominee',
 '/award/award_nominee/award_nominations./award/award_nomination/nominated_for',
 '/award/award_winner/awards_won./award/award_honor/award_winner',
 '/award/award_winning_work/awards_won./award/award_honor/award',
 '/award/award_win

##### Clean for tokenization

In [None]:
for i in range(len(rel_to_list)):
    rel_to_list[i] = rel_to_list[i].replace("_", " ")
    rel_to_list[i] = rel_to_list[i].lstrip()

In [None]:
# Tokenize the KG relations
rel_to_tokens = tokenizer(rel_to_list, padding=True, return_tensors='pt')['input_ids']
rel_to_tokens.device

In [None]:
rel_to_tokens = rel_to_tokens.to("cuda:0")
output = generate_bert_embeddings(rel_to_tokens,model)

In [None]:
torch.save(output, "nlm_embeddings/bert_wn18rr/00_bert_4lastlayers_wn18rr_rel.pt")