In [1]:
import torch
from transformers import BertTokenizer, BertModel
import pandas as pd
import numpy as np
import requests
from functions import *
import gensim
import nltk
import gensim.downloader as api
from pykeen.datasets import FB15k237, WN18RR

In [2]:
device="cuda"

## 0 Load and prepare dataset

In [5]:
dataset = FB15k237()  #or WN18RR()
dataset

You're trying to map triples with 30 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 28 from 20466 triples were filtered out


FB15k237(num_entities=14505, num_relations=237, create_inverse_triples=False)

In [6]:
# Store the entity-to-id and relation-to-id relationship in separate dictionaries
ent_to_id = dataset.entity_to_id
rel_to_id = dataset.relation_to_id

ent_id_list = list(dataset.entity_to_id.keys())
rel_id_list = list(dataset.relation_to_id.keys())

### 1 FB15k-237

#### Load entity and relation definitions

In [4]:
# Load entity to text file for labels and descriptions 
df_entity2text = pd.read_csv('data/fb15k237/fb15k237_entity2text.txt', delimiter="\t", header = None, names=["id", "entity"])
df_entity2textlong = pd.read_csv('data/fb15k237/fb15k237_entity2textlong.txt', delimiter="\t", header = None, names=["id", "entity"])

# Load relation to text file 
df_rel2text = pd.read_csv('data/fb15k237/fb15k237_relation2text.txt', delimiter="\t", header = None, names=["id", "definition"])

#### 1.1 Preprocessing of dataset for WORD2VEC Embeddings

##### 1.1.1 Entities

In [44]:
char1 = [',', "'", ':', ';', "!"]
char2 = ['-', "/"]
def remove_characters(strings, char1, char2):
    result = []
    for string in strings:
        for char in char1:
            string = string.replace(char, '')
        for char in char2:
            string = string.replace(char, ' ')
        result.append(string)
    return result

In [39]:
extra_ent = pd.read_csv('fb15k237_extralabels_ent.csv', delimiter=",", header = 0, names=["id", "entity"])

In [79]:
df_entity2text['entity'] = df_entity2text['id'].map(extra_ent.set_index('id')['entity']).fillna(df_entity2text['entity'])
df_entity2text["entity_cleaned"]=remove_characters(df_entity2text["entity"], char1, char2)
df_entity2text["segmented_entities"] = df_entity2text["entity_cleaned"].str.split(' ')
df_entity2text[40:50]

Unnamed: 0,id,entity,entity_cleaned,segmented_entities
40,/m/023p18,Iowa Writers' Workshop,Iowa Writers Workshop,"[Iowa, Writers, Workshop]"
41,/m/02qwg,Eric Clapton,Eric Clapton,"[Eric, Clapton]"
42,/m/02slt7,French premium pay television channel,French premium pay television channel,"[French, premium, pay, television, channel]"
43,/m/0vg8x,Bloomfield Hills,Bloomfield Hills,"[Bloomfield, Hills]"
44,/m/04pg29,Marc Cherry,Marc Cherry,"[Marc, Cherry]"
45,/m/05hz6_,Rio Ave F.C.,Rio Ave F.C.,"[Rio, Ave, F.C.]"
46,/m/054_mz,Lawrence Bender,Lawrence Bender,"[Lawrence, Bender]"
47,/m/048tgl,Josh Freese,Josh Freese,"[Josh, Freese]"
48,/m/013t2y,Wakefield,Wakefield,[Wakefield]
49,/m/07jbh,Table tennis,Table tennis,"[Table, tennis]"


In [None]:
......

In [81]:
entities = df_entity2text["entity"]  #specify here which column of the df should be taken as an input for the BERT model 
seg_entities = df_entity2text["segmented_entities"]

##### 1.1.2 Relations

In [73]:
df_rel2text["cleaned_relation"] = df_rel2text["definition"].str.replace('.', ' ')
df_rel2text["segmented_relation"] = df_rel2text["cleaned_relation"].str.split(' ')

  df_rel2text["cleaned_relation"] = df_rel2text["definition"].str.replace('.', ' ')


In [76]:
df_rel2text[:5]

Unnamed: 0,id,definition,cleaned_relation,segmented_relation
0,/soccer/football_team/current_roster./soccer/f...,soccer football team current roster. soccer fo...,soccer football team current roster soccer fo...,"[soccer, football, team, current, roster, , so..."
1,/music/artist/origin,music artist origin,music artist origin,"[music, artist, origin]"
2,/ice_hockey/hockey_team/current_roster./sports...,ice hockey hockey team current roster. sports ...,ice hockey hockey team current roster sports ...,"[ice, hockey, hockey, team, current, roster, ,..."
3,/food/food/nutrients./food/nutrition_fact/nutr...,food food nutrients. food nutrition fact nutrient,food food nutrients food nutrition fact nutrient,"[food, food, nutrients, , food, nutrition, fac..."
4,/film/actor/film./film/performance/film,film actor film. film performance film,film actor film film performance film,"[film, actor, film, , film, performance, film]"


In [83]:
entities = df_entity2text["entity"]  #specify here which column of the df should be taken as an input for the BERT model 
seg_entities = df_entity2text["segmented_entities"]
relations = df_rel2text["definition"]
seg_relations = df_rel2text["segmented_relation"]

#### Preprocessing of datafraset for BERT Embeddings

##### 1.2.1 Entities

##### 1.2.2 Relations

###### Load entity2text file from data

In [9]:
df_entity2text = pd.read_csv('data/fb15k237/fb15k237_entity2text.txt', delimiter="\t", header = None, names=["id", "entity"])

###### Load entities that are not in word2vec from separate file

In [49]:
extra_ent = pd.read_csv('missing_w2v_entities.csv', delimiter=",", header = 0, names=["id", "entity"])

In [50]:
extra_ent

Unnamed: 0,id,entity
42,/m/02slt7,French premium pay television channel
51,/m/0q01m,amino acid
323,/m/02qm5j,number of musicians important precursors of pu...
758,/m/016g2_,bulbous flowering plants
1080,/m/01w20rx,Christian recording artist
1288,/m/0bdd_,largest most populous of the sixteen Polish pr...
1334,/m/03gt7s,fast tempo sub-genre hardcore punk
1499,/m/027g6p7,natural carotenoid pigment
1507,/m/0b13yt,cornerback sixth defensive back on defense
1666,/m/05zn92p,American digital broadcast television network


In [51]:
df_entity2text['entity'] = df_entity2text['id'].map(extra_ent.set_index('id')['entity']).fillna(df_entity2text['entity'])

In [52]:
df_entity2text["entity_cleaned"]=remove_characters(df_entity2text["entity"], char1, char2)
df_entity2text["segmented_entities"] = df_entity2text["entity_cleaned"].str.split(' ')
df_entity2text[40:50]

Unnamed: 0,id,entity,entity_cleaned,segmented_entities
40,/m/023p18,Iowa Writers' Workshop,Iowa Writers Workshop,"[Iowa, Writers, Workshop]"
41,/m/02qwg,Eric Clapton,Eric Clapton,"[Eric, Clapton]"
42,/m/02slt7,French premium pay television channel,French premium pay television channel,"[French, premium, pay, television, channel]"
43,/m/0vg8x,Bloomfield Hills,Bloomfield Hills,"[Bloomfield, Hills]"
44,/m/04pg29,Marc Cherry,Marc Cherry,"[Marc, Cherry]"
45,/m/05hz6_,Rio Ave F.C.,Rio Ave F.C.,"[Rio, Ave, F.C.]"
46,/m/054_mz,Lawrence Bender,Lawrence Bender,"[Lawrence, Bender]"
47,/m/048tgl,Josh Freese,Josh Freese,"[Josh, Freese]"
48,/m/013t2y,Wakefield,Wakefield,[Wakefield]
49,/m/07jbh,Table tennis,Table tennis,"[Table, tennis]"


###### Hide code

In [39]:
df_entity2textlong = pd.read_csv('data/fb15k237/fb15k237_entity2textlong.txt', delimiter="\t", header = None, names=["id", "entity"])

In [10]:
missing_values = df_entity2textlong[~df_entity2textlong['id'].isin(ent_id_list)]

# Print the missing values
print(missing_values)

              id                                             entity
19      /m/0lf_w  To be hanged, drawn and quartered was from 135...
77      /m/03lht  The House of Habsburg, also spelled Hapsburg, ...
105    /m/02hjn4  A single-player video game is a video game whe...
118    /m/01rk30  Doctor Ivo \"Eggman\" Robotnik, is a video gam...
126    /m/03dpl4  Over-the-air programming refers to various met...
...          ...                                                ...
14771   /m/0q4mn  Pop art is an art movement that emerged in the...
14782   /m/0dbtv  A flood is an overflow of water that submerges...
14807  /m/012h0y  Women's rights are the rights and entitlements...
14834   /m/018s4  BSD licenses are a family of permissive free s...
14877  /m/01jk9n  Pornographic films or sex films are films that...

[424 rows x 2 columns]


In [12]:
values_not_present = [value for value in ent_id_list if value not in df_entity2textlong['id'].values]

In [40]:
values_not_present

['/m/015zql',
 '/m/01dvms',
 '/m/01dy7j',
 '/m/01fkv0',
 '/m/01xsbh',
 '/m/01xzb6',
 '/m/0288crq',
 '/m/029cpw',
 '/m/02jxk',
 '/m/03m3nzf',
 '/m/04_1l0v',
 '/m/05ry0p',
 '/m/05xf75',
 '/m/05zvq6g',
 '/m/061zc_',
 '/m/07_bv_',
 '/m/07djnx',
 '/m/07t_l23',
 '/m/0854hr',
 '/m/08chdb',
 '/m/09ly2r6',
 '/m/0cfywh',
 '/m/0h005',
 '/m/0kvrb',
 '/m/0m6x4']

In [15]:
embeddings_descript = torch.load('03_nlm_embeddings/bert_fb15k237/fb15k237_avgbert4LL_extended_ent.pt', map_location = torch.device(device))

In [19]:
embeddings_bert = torch.load('03_nlm_embeddings/bert_fb15k237/avg_4lastlayers/01_bert_fb15k237_ent_sorted.pt', map_location = torch.device(device))

In [16]:
embeddings_descript = embeddings_descript.detach().numpy()

In [20]:
embeddings_bert = embeddings_bert.detach().numpy()

In [17]:
embeddings_descript[:3]

array([[-0.35380164, -0.1842202 , -0.2955732 , ..., -0.04774771,
         0.18331939, -0.6402802 ],
       [-0.11553758,  0.06239744,  0.17551416, ..., -0.29196453,
         0.0605844 , -0.9386168 ],
       [-0.4070915 , -0.3311174 , -0.6618794 , ...,  0.36608148,
         0.00133017, -0.5051978 ]], dtype=float32)

In [24]:
df_entity2textlong["bert_embeddings"]= [embeddings_descript[i] for i in range(len(embeddings_descript))]

In [26]:
df_sorted_ent = pd.DataFrame({"id": ent_id_list})

In [27]:
df_sorted_ent["bert_embeddings"] = [embeddings_bert[i] for i in range(len(embeddings_bert))]

In [30]:
df_sorted_ent[:5]

Unnamed: 0,id,bert_embeddings
0,/m/010016,"[-0.7038374, -0.42919597, 0.21363114, -0.06080..."
1,/m/0100mt,"[-0.69194925, -0.4327587, 0.38790178, 0.023704..."
2,/m/0102t4,"[-0.7149743, -0.39547038, 0.22586966, 0.026374..."
3,/m/0104lr,"[-0.75991297, -0.59261405, 0.27878103, -0.2569..."
4,/m/0105y2,"[-0.91232014, -0.32333505, 0.19027695, 0.04667..."


In [31]:
df_entity2textlong[:5]

Unnamed: 0,id,entity,bert embeddings,bert_embeddings
0,/m/06rf7,Schleswig-Holstein is the northernmost of the ...,"[-0.35380164, -0.1842202, -0.2955732, 0.168221...","[-0.35380164, -0.1842202, -0.2955732, 0.168221..."
1,/m/0c94fn,Gary Roger Rydstrom is an American sound desig...,"[-0.11553758, 0.062397435, 0.17551416, 0.21539...","[-0.11553758, 0.062397435, 0.17551416, 0.21539..."
2,/m/016ywr,Jeremy John Irons is an English actor. After r...,"[-0.4070915, -0.3311174, -0.6618794, -0.064381...","[-0.4070915, -0.3311174, -0.6618794, -0.064381..."
3,/m/01yjl,The Chicago Cubs are a professional baseball t...,"[-0.45914665, -0.2469731, -0.47041193, 0.04532...","[-0.45914665, -0.2469731, -0.47041193, 0.04532..."
4,/m/02hrh1q,An actor is a person portraying a character in...,"[-0.29197603, -0.136096, -0.060602292, 0.06338...","[-0.29197603, -0.136096, -0.060602292, 0.06338..."


In [32]:
df_sorted_ent['Matched_entities'] = df_sorted_ent['id'].map(df_entity2textlong.set_index('id')['bert_embeddings'])

In [43]:
df_sorted_ent

Unnamed: 0,id,bert_embeddings,Matched_entities
0,/m/010016,"[-0.7038374, -0.42919597, 0.21363114, -0.06080...","[-0.089965105, -0.41573668, -0.21134691, -0.12..."
1,/m/0100mt,"[-0.69194925, -0.4327587, 0.38790178, 0.023704...","[-0.2943981, -0.75384307, -0.15784611, -0.4398..."
2,/m/0102t4,"[-0.7149743, -0.39547038, 0.22586966, 0.026374...","[-0.3151846, -0.23722485, -0.32813567, 0.05474..."
3,/m/0104lr,"[-0.75991297, -0.59261405, 0.27878103, -0.2569...","[-0.35681513, -0.35246718, -0.2811796, 0.05256..."
4,/m/0105y2,"[-0.91232014, -0.32333505, 0.19027695, 0.04667...","[-0.37196738, -0.5555919, -0.08981007, -0.1476..."
...,...,...,...
14500,/m/0zqq,"[-0.47573105, -0.50945956, 0.23829491, 0.02118...","[-0.25621644, -0.11462242, -0.042842463, 0.119..."
14501,/m/0zqq8,"[-0.6550542, -0.4856075, 0.21798255, 0.1715764...","[-0.3481192, -0.16820523, -0.24632338, 0.10905..."
14502,/m/0zrlp,"[-0.76009, -0.5027063, 0.354324, -0.15966484, ...","[-0.30763152, -0.09521635, -0.19763705, 0.0187..."
14503,/m/0zygc,"[-0.67723703, -0.3711557, 0.2578664, -0.077872...","[-0.21036395, -0.14281178, -0.02849355, 0.1069..."


In [44]:
df_sorted_ent['Matched_entities'].fillna(df_sorted_ent['bert_embeddings'], inplace=True)

In [47]:
embeddings = list(df_sorted_ent['Matched_entities'].values)

In [50]:
embeddings = torch.tensor(embeddings)

  embeddings = torch.tensor(embeddings)


In [57]:
torch.save(embeddings, "03_nlm_embeddings/bert_fb15k237/avg_4lastlayers/02_bert_fb15k237_descript_ent.pt")

In [54]:
len(embeddings)

14505

In [9]:
len(df_entity2textlong)

14904

In [53]:
entities = df_entity2text["entity"]  #specify here which column of the df should be taken as an input for the BERT model 
seg_entities = df_entity2text["segmented_entities"] #specify here which column of the df should be taken as a tokenized input for static LMs

##### Relations

In [9]:
df_rel2text = pd.read_csv('data/fb15k237/fb15k237_relation2text.txt', delimiter="\t", header = None, names=["id", "definition"])
df_rel2text[["property_1_id", "property_2_id"]] = df_rel2text["id"].str.split('.', n=1, expand=True)
df_rel2text["property_1_id"] = df_rel2text["property_1_id"].str.replace("/", ", ").str[2:]
df_rel2text["property_2_id"] = df_rel2text["property_2_id"].str.replace("/", ", ").str[2:]
df_rel2text["property_1_id"] = df_rel2text["property_1_id"].str.replace("_", " ")
df_rel2text["property_2_id"] = df_rel2text["property_2_id"].str.replace("_", " ")
df_rel2text[:4]

Unnamed: 0,id,definition,property_1_id,property_2_id
0,/soccer/football_team/current_roster./soccer/f...,soccer football team current roster. soccer fo...,"soccer, football team, current roster","soccer, football roster position, position"
1,/music/artist/origin,music artist origin,"music, artist, origin",
2,/ice_hockey/hockey_team/current_roster./sports...,ice hockey hockey team current roster. sports ...,"ice hockey, hockey team, current roster","sports, sports team roster, position"
3,/food/food/nutrients./food/nutrition_fact/nutr...,food food nutrients. food nutrition fact nutrient,"food, food, nutrients","food, nutrition fact, nutrient"


In [10]:
df_rel2text.shape

(237, 4)

In [11]:
df_rel2text["segmented"] = df_rel2text["property_2_id"].str.split(',')
df_rel2text

Unnamed: 0,id,definition,property_1_id,property_2_id,segmented
0,/soccer/football_team/current_roster./soccer/f...,soccer football team current roster. soccer fo...,"soccer, football team, current roster","soccer, football roster position, position","[soccer, football roster position, position]"
1,/music/artist/origin,music artist origin,"music, artist, origin",,
2,/ice_hockey/hockey_team/current_roster./sports...,ice hockey hockey team current roster. sports ...,"ice hockey, hockey team, current roster","sports, sports team roster, position","[sports, sports team roster, position]"
3,/food/food/nutrients./food/nutrition_fact/nutr...,food food nutrients. food nutrition fact nutrient,"food, food, nutrients","food, nutrition fact, nutrient","[food, nutrition fact, nutrient]"
4,/film/actor/film./film/performance/film,film actor film. film performance film,"film, actor, film","film, performance, film","[film, performance, film]"
...,...,...,...,...,...
232,/base/biblioness/bibs_location/country,base biblioness bibs location country,"base, biblioness, bibs location, country",,
233,/user/ktrueman/default_domain/international_or...,user ktrueman default domain international org...,"user, ktrueman, default domain, international ...",,
234,/music/performance_role/track_performances./mu...,music performance role track performances. mus...,"music, performance role, track performances","music, track contribution, role","[music, track contribution, role]"
235,/olympics/olympic_games/medals_awarded./olympi...,olympics olympic games medals awarded. olympic...,"olympics, olympic games, medals awarded","olympics, olympic medal honor, medal","[olympics, olympic medal honor, medal]"


In [12]:
df_rel2text.loc[df_rel2text["segmented"].isna(), "segmented"] = df_rel2text.loc[df_rel2text["segmented"].isna(), "property_1_id"].str.split(',')

In [14]:
df_rel2text[:5]

Unnamed: 0,id,definition,property_1_id,property_2_id,segmented
0,/soccer/football_team/current_roster./soccer/f...,soccer football team current roster. soccer fo...,"soccer, football team, current roster","soccer, football roster position, position","[soccer, football roster position, position]"
1,/music/artist/origin,music artist origin,"music, artist, origin",,"[music, artist, origin]"
2,/ice_hockey/hockey_team/current_roster./sports...,ice hockey hockey team current roster. sports ...,"ice hockey, hockey team, current roster","sports, sports team roster, position","[sports, sports team roster, position]"
3,/food/food/nutrients./food/nutrition_fact/nutr...,food food nutrients. food nutrition fact nutrient,"food, food, nutrients","food, nutrition fact, nutrient","[food, nutrition fact, nutrient]"
4,/film/actor/film./film/performance/film,film actor film. film performance film,"film, actor, film","film, performance, film","[film, performance, film]"


In [None]:
def test(lst):
    return lst[-2:]

In [None]:
df_rel2text['segmented_reduced'] = df_rel2text['segmented'].apply(test)

In [None]:
df_rel2text[:4]

In [None]:
# Store each entity in separate list, then store all lists into a list → needed for word2vec input
row_list = []
for rows in df_rel2text.itertuples():
    #my_list = [rows.definition] #Create list for the current row
    row_list.append(rows.segmented_reduced) #append the list to the final list
input_word2vec = row_list

In [None]:
input_word2vec[:5]

In [None]:
# Compute average word embeddings for each KG relation
averaged_embeddings = []
for idx in range(len(df_rel2text)):
    entity = df_rel2text["segmented_reduced"].loc[idx]
    seg_entity = df_rel2text["segmented_entities"].loc[idx]
    lst = []
    if entity in wv_dict:
        print("if", idx, entity)
        averaged_embeddings.append(wv_dict[entity])
    else:
        print("else", idx, entity)
        for item in seg_entity:
            lst.append(wv_dict[item])
        avg_embdd = np.mean(lst, axis=0)
        averaged_embeddings.append(avg_embdd)

##### Restructure

In [None]:
df_entity2text = pd.read_csv('data/fb15k237/fb15k237_entity2text.txt', delimiter="\t", header = None, names=["id", "entity"])
df_entity2text["segmented_entities"] = df_entity2text["entity"].str.split(' ')

In [None]:
df_entity2text[:5]

In [None]:
# Store each entity in separate list, then store all lists into a list → needed for word2vec input
row_list = []
for rows in df_entity2text.itertuples():
    #my_list = [rows.definition] #Create list for the current row
    row_list.append(rows.segmented_entities) #append the list to the final list
input_word2vec = row_list

In [None]:
input_word2vec[:5]

#### 1.2 WN18RR

##### 1.2.1 Load dataset from pykeen and get entity and relation keys

In [7]:
from pykeen.datasets import WN18RR
dataset=WN18RR()

In [8]:
# Store the entity-to-id and relation-to-id relationship in separate dictionaries
ent_to_id = dataset.entity_to_id
rel_to_id = dataset.relation_to_id

You're trying to map triples with 212 entities and 0 relations that are not in the training set. These triples will be excluded from the mapping.
In total 210 from 3134 triples were filtered out


In [9]:
ent_id_list = list(dataset.entity_to_id.keys())
rel_id_list = list(dataset.relation_to_id.keys())

In [93]:
len(ent_id_list)

40559

##### 1.3.2 Load entity and relation definitions

In [36]:
df_entity2text = pd.read_csv('data/wn18rr/wn18rr_entity2text.txt', delimiter="\t", header = None, names=["id", "definition"])
df_entity2text[["entity", "description"]] = df_entity2text["definition"].str.split(',', n=1, expand=True)
df_entity2text.id = df_entity2text.id.astype(str)
df_entity2text["id"] = df_entity2text["id"].str.rjust(8, '0')
df_entity2text[:1]

Unnamed: 0,id,definition,entity,description
0,14854262,"stool, solid excretory product evacuated from ...",stool,solid excretory product evacuated from the bo...


In [37]:
wn18rr_extra_ent = pd.read_csv('wn18rr_ent_excluded.csv', delimiter=",", header = 0, names=["id", "entity"])
wn18rr_extra_ent.id = wn18rr_extra_ent.id.astype(str)
wn18rr_extra_ent["id"] = wn18rr_extra_ent["id"].str.rjust(8, '0')
wn18rr_extra_ent[:1]

Unnamed: 0,id,entity
105,1543272,a small songbird


In [38]:
df_entity2text['entity'] = df_entity2text['id'].map(wn18rr_extra_ent.set_index('id')['entity']).fillna(df_entity2text['entity'])

In [39]:
df_entity2text[:1]

Unnamed: 0,id,definition,entity,description
0,14854262,"stool, solid excretory product evacuated from ...",stool,solid excretory product evacuated from the bo...


In [68]:
df_entity2text["entity_cleaned"]=remove_characters(df_entity2text["entity"], char1, char2)
df_entity2text["segmented_entities"] = df_entity2text["entity_cleaned"].str.split(' ')

In [85]:
len(df_entity2text)

40943

In [44]:
for i in range(len(rel_id_list)):
    rel_id_list[i] = rel_id_list[i].replace("_", " ")
    rel_id_list[i] = rel_id_list[i].lstrip()

In [45]:
rel_id_list

['also see',
 'derivationally related form',
 'has part',
 'hypernym',
 'instance hypernym',
 'member meronym',
 'member of domain region',
 'member of domain usage',
 'similar to',
 'synset domain topic of',
 'verb group']

In [70]:
entities = df_entity2text["definition"]  #specify here which column of the df should be taken as an input for the BERT model 
seg_entities = df_entity2text["segmented_entities"] #specify here which column of the df should be taken as a tokenized input for static LMs

In [73]:
len(seg_entities)

40943

### 2. Generate BERT embeddings 

##### 2.1 Set parameters

In [85]:
#output_path = #'03_nlm_embeddings/bert_fb15k237/4LL_extended/'
#filename = #'_bert4LL_extended_fb15k237_ent.pt'
tokenizer, model = load_bert(device)
ent_list = entities.to_list()
rel_list = relations.to_list()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [86]:
ent_list[:3]

['Schleswig-Holstein', 'Gary Rydstrom', 'Jeremy Irons']

##### Tokenize entities and relations

In [117]:
ent_to_tokens = tokenize(tokenizer, ent_list)

In [118]:
rel_to_tokens = tokenize(tokenizer, rel_list)

##### Load entities in batches and feed to BERT LM

In [119]:
cd = customdata(ent_to_tokens)
dataloader = torch.utils.data.DataLoader(cd, batch_size = 50, shuffle = False, num_workers = 0)

In [12]:
# Use function to generate BERT embeddings for 
batch_embedd(dataloader, model, output_path, filename, device)



Memory Usage in Iteration: 1 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 2 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 3 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 4 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 5 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 6 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 7 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 8 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 9 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 10 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 11 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 12 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 13 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 14 | Allocated: 22.0 GB | Cached:  22.2 GB
Memory Usage in Iteration: 15

##### Generate word embeddings for relations

In [40]:
input_to_tokens = tokenizer(rel_list, padding=True, truncation=True, return_tensors='pt')['input_ids']
input_to_tokens = input_to_tokens.to(device)

In [41]:
input_to_tokens.device

device(type='cuda', index=0)

In [42]:
output = generate_bert_embeddings(input_to_tokens, model)
#torch.save(output, "nlm_embeddings/bert___")

In [44]:
len(output[0])

NameError: name 'output' is not defined

In [50]:
#torch.save(output, "03_nlm_embeddings/bert_fb15k237/4lastlayers/00_bert_4lastlayers_fb15k237_rel.pt")

##### 2.2 Compute average of 4LL of BERT embeddings 

In [10]:
def avg_bert4LL_embeddings(embeddings):    
    #Average the last 4 hidden layers for token 0 = CLS token, cls_embdd = hidden_states[:][:][0]
    embedd_averaged = []

    # For each text sequence...
    for index in range(len(embeddings[0])):
        # Sum the vectors from the last four layers. Each layer vector is 768 values, so `sum_vec` is length 768.
        tensor = torch.stack((embeddings[-1][index][0], embeddings[-2][index][0],
                             embeddings[-3][index][0], embeddings[-4][index][0]), dim=0)
        avg_vec = torch.mean(tensor, dim=0)
        embedd_averaged.append(avg_vec)

    # Convert output to 2D tensor
    ent_embedd_avg = torch.stack(embedd_averaged, 0)
    del tensor
    del avg_vec
    del embedd_averaged

# ------

In [9]:
ent_embedd_avg = torch.load('03_nlm_embeddings/bert_fb15k237/fb15k237_avgbert4LL_extended_ent.pt', map_location = torch.device(device))

In [14]:
len(ent_embedd_avg)

14904

In [28]:
entity_mapping = {}
entity_tensor = ent_embedd_avg
for item in ent_id_list:
    # Get index of item in df entity2text
    idx = df_entity2text[df_entity2text["id"] == item].index.item()

    # Add tensor to dictionary, whereby key is the entity id 
    entity_mapping[item] = ent_embedd_avg[idx]
    
to_list = list(entity_mapping.values())
entity_embedd = torch.stack(to_list, 0)

/m/010016 -----
            id                                             entity  \
521  /m/010016  Denton is a city in the U.S. state of Texas an...   

                                    segmented_entities  
521  [Denton, is, a, city, in, the, U.S., state, of...  
Int64Index([521], dtype='int64')
521
/m/0100mt -----
              id                                             entity  \
10219  /m/0100mt  El Paso is the county seat of El Paso County, ...   

                                      segmented_entities  
10219  [El, Paso, is, the, county, seat, of, El, Paso...  
Int64Index([10219], dtype='int64')
10219
/m/0102t4 -----
              id                                             entity  \
12309  /m/0102t4  Marshall is a city in and the county seat of H...   

                                      segmented_entities  
12309  [Marshall, is, a, city, in, and, the, county, ...  
Int64Index([12309], dtype='int64')
12309
/m/0104lr -----
             id                           

ValueError: can only convert an array of size 1 to a Python scalar

In [16]:
len(ent_id_list)

14505

In [43]:
df_entity2text[df_entity2text["id"] == '/m/010016'].index[0]

521

In [17]:
entity_embedd.shape

torch.Size([14505, 768])

In [60]:
#torch.save(ent_embedd_avg, "03_nlm_embeddings/bert_fb15k237/avg_4lastlayers/01_bert_fb15k237_rel_sorted.pt")

## ___________________________________________________________________________________________________

### 3. Static LM: word embeddings

#### 3.1 Word2Vec Model

##### Set parameters 

In [None]:
#Set these values to respective dataframe column of target dataset
#entityid_col =  
#seg_entityid_col

In [95]:
#input_word2vec_ent = [seg_entities[i] for i in range(len(seg_entities))]
input_word2vec_rel = [rel_id_list[i].split() for i in range(len(rel_id_list))]

In [122]:
input_word2vec_rel

[['also', 'see'],
 ['derivationally', 'related', 'form'],
 ['has', 'part'],
 ['superordinate'],
 ['instance', 'superordinate'],
 ['member', 'part', 'something'],
 ['member', 'of', 'domain', 'region'],
 ['member', 'of', 'domain', 'usage'],
 ['similar', 'to'],
 ['set', 'synonym', 'domain', 'topic', 'of'],
 ['verb', 'group']]

In [121]:
input_word2vec_rel[3] = ['superordinate']
input_word2vec_rel[4][1] = 'superordinate'
input_word2vec_rel[5] = ['member', "part", "something"] 
input_word2vec_rel[9]= [ 'set', 'synonym', 'domain', 'topic', 'of']

##### Prepare relations

In [15]:
updated_rel = [string.replace('/', ' ') for string in rel_id_list]
updated_rel[:3] 

[' american_football football_team current_roster. sports sports_team_roster position',
 ' award award_category category_of',
 ' award award_category disciplines_or_subjects']

In [18]:
updated_rel = [string.lstrip() for string in updated_rel]
updated_rel[:3] 

['american_football football_team current_roster. sports sports_team_roster position',
 'award award_category category_of',
 'award award_category disciplines_or_subjects']

In [19]:
updated_rel = [string.replace('_', ' ') for string in updated_rel]
updated_rel[:3] 

['american football football team current roster. sports sports team roster position',
 'award award category category of',
 'award award category disciplines or subjects']

In [20]:
updated_rel = [string.replace('.', ' ') for string in updated_rel]
updated_rel[:3] 

['american football football team current roster  sports sports team roster position',
 'award award category category of',
 'award award category disciplines or subjects']

In [21]:
updated_rel[200:237]

['soccer football team current roster  sports sports team roster position',
 'sports pro athlete teams  sports sports team roster team',
 'sports professional sports team draft picks  sports sports league draft pick draft',
 'sports professional sports team draft picks  sports sports league draft pick school',
 'sports sport pro athletes  sports pro sports played athlete',
 'sports sports league teams  sports sports league participation team',
 'sports sports league draft picks  sports sports league draft pick school',
 'sports sports position players  american football football historical roster position position s',
 'sports sports position players  sports sports team roster position',
 'sports sports position players  sports sports team roster team',
 'sports sports team colors',
 'sports sports team roster  american football football historical roster position position s',
 'sports sports team roster  american football football roster position position',
 'sports sports team roster

In [21]:
#input_word2vec_rel = [rel_id_list[i].split() for i in range(len(rel_id_list))]

In [22]:
input_word2vec_rel = [updated_rel[i].split() for i in range(len(rel_id_list))]

In [23]:
input_word2vec_rel[200:237]

[['soccer',
  'football',
  'team',
  'current',
  'roster',
  'sports',
  'sports',
  'team',
  'roster',
  'position'],
 ['sports',
  'pro',
  'athlete',
  'teams',
  'sports',
  'sports',
  'team',
  'roster',
  'team'],
 ['sports',
  'professional',
  'sports',
  'team',
  'draft',
  'picks',
  'sports',
  'sports',
  'league',
  'draft',
  'pick',
  'draft'],
 ['sports',
  'professional',
  'sports',
  'team',
  'draft',
  'picks',
  'sports',
  'sports',
  'league',
  'draft',
  'pick',
  'school'],
 ['sports',
  'sport',
  'pro',
  'athletes',
  'sports',
  'pro',
  'sports',
  'played',
  'athlete'],
 ['sports',
  'sports',
  'league',
  'teams',
  'sports',
  'sports',
  'league',
  'participation',
  'team'],
 ['sports',
  'sports',
  'league',
  'draft',
  'picks',
  'sports',
  'sports',
  'league',
  'draft',
  'pick',
  'school'],
 ['sports',
  'sports',
  'position',
  'players',
  'american',
  'football',
  'football',
  'historical',
  'roster',
  'position',
  'posit

In [25]:
wv_input_rel = [list(set(my_list)) for my_list in input_word2vec_rel]

In [26]:
wv_input_rel[:4]

[['american', 'position', 'team', 'current', 'sports', 'football', 'roster'],
 ['category', 'award', 'of'],
 ['award', 'or', 'category', 'disciplines', 'subjects'],
 ['award', 'nominated', 'category', 'nomination', 'for', 'nominees']]

##### Prepare entities 

In [18]:
input_word2vec_ent[10:20]

[['Focus', 'Features'],
 ['Henry', 'V'],
 ['Himesh', 'Reshammiya'],
 ['Maggie', 'Gyllenhaal'],
 ['Iron'],
 ['Brompton', 'Cemetery'],
 ['University', 'of', 'California', 'Irvine'],
 ['AACTA',
  'Award',
  'for',
  'Best',
  'Guest',
  'or',
  'Supporting',
  'Actor',
  'in',
  'a',
  'Television',
  'Drama'],
 ['Hunter', 'S.', 'Thompson'],
 ['Hanged', 'drawn', 'and', 'quartered']]

##### Retrieve Word2Vec Embeddings

In [3]:
word2vec_vectors = gensim.downloader.load('word2vec-google-news-300')

In [4]:
w2v_vocab = word2vec_vectors.index_to_key

In [7]:
word2vec_vectors["jeremy
        "]

SyntaxError: EOL while scanning string literal (2875207538.py, line 1)

In [123]:
wv_input = input_word2vec_rel
len(wv_input)

11

In [75]:
wv_input[2217]

['colonizer']

In [124]:
word2vec_embeddings = []
exclude = []
for array in wv_input: 
    embeddings=[]
    for word in array:
        if word not in w2v_vocab:
            exclude.append(word)
        else:
            vw_embedding = word2vec_vectors[word]
            embeddings.append(vw_embedding)
    word2vec_embeddings.append(embeddings)    

In [125]:
exclude

['derivationally', 'of', 'of', 'to', 'of']

In [34]:
rel_id_list

['/american_football/football_team/current_roster./sports/sports_team_roster/position',
 '/award/award_category/category_of',
 '/award/award_category/disciplines_or_subjects',
 '/award/award_category/nominees./award/award_nomination/nominated_for',
 '/award/award_category/winners./award/award_honor/award_winner',
 '/award/award_category/winners./award/award_honor/ceremony',
 '/award/award_ceremony/awards_presented./award/award_honor/award_winner',
 '/award/award_ceremony/awards_presented./award/award_honor/honored_for',
 '/award/award_nominated_work/award_nominations./award/award_nomination/nominated_for',
 '/award/award_nominee/award_nominations./award/award_nomination/award',
 '/award/award_nominee/award_nominations./award/award_nomination/award_nominee',
 '/award/award_nominee/award_nominations./award/award_nomination/nominated_for',
 '/award/award_winner/awards_won./award/award_honor/award_winner',
 '/award/award_winning_work/awards_won./award/award_honor/award',
 '/award/award_win

In [126]:
averaged_embeddings = []
for idx in range(len(word2vec_embeddings)):
    array = word2vec_embeddings[idx]
    avg_embedd = np.mean(array, axis=0)
    averaged_embeddings.append(avg_embedd)
embeddings = torch.tensor(np.array(averaged_embeddings))  

In [128]:
embeddings.shape

torch.Size([11, 300])

In [1]:
torch.chunk(embeddings, 2, dim=1)

NameError: name 'torch' is not defined

In [136]:
len(test[0][0])

150

In [79]:
index_list = []
for idx in range(len(averaged_embeddings)):
    if type(averaged_embeddings[idx])==np.ndarray:
        next
    else:
        print(type(averaged_embeddings[idx]), idx)
        index_list.append(idx)

In [80]:
to_exclude_ent = [entities[i] for i in index_list]
print(len(to_exclude_ent))
filtered_df = df_entity2text[df_entity2text['definition'].isin(to_exclude_ent)]

# Retrieve the corresponding values from column A
excluded_ent = filtered_df['id'].tolist()

0


In [63]:
to_exclude_ent

['saxony, an area in Germany around the upper Elbe river; the original home of the Saxons',
 'ondatra zibethica, beaver-like aquatic rodent of North America with dark glossy brown fur',
 'mandragora, a genus of stemless herbs of the family Solanaceae',
 'dispraise, the act of speaking contemptuously of',
 'placodermi, extinct group of bony-plated fishes with primitive jaws',
 'rhincodontidae, small-toothed sharks comprising only one species',
 'taurotragus, African antelopes: elands',
 'otariidae, eared seals: sea lions and fur seals',
 'preachification, moralization delivered tediously in a preachy manner',
 'adequateness, the quality of being able to meet a need satisfactorily: "he questioned the adequacy of the usual sentimental interpretation of the Golden Rule"',
 'freewheeler, someone acting freely or even irresponsibly',
 'sundacarpus, one species',
 'pyrrhula, bullfinches',
 'slezsko, a region of central Europe rich in deposits of coal and iron ore; annexed by Prussia in 1742 b

In [64]:
excluded_ent

['08769179',
 '02338145',
 '12906334',
 '01219893',
 '01479937',
 '01487743',
 '02426634',
 '02076535',
 '06743230',
 '04792357',
 '10109662',
 '11659500',
 '01534034',
 '09166534',
 '05064722',
 '03912664',
 '05998356',
 '06439712',
 '01591490',
 '02049299',
 '02432704',
 '08970445',
 '02483564',
 '02639786',
 '08722844',
 '12349916',
 '02250653',
 '05545047',
 '04904664',
 '11348812',
 '09700823',
 '10237069',
 '12061614',
 '09903501',
 '03912328',
 '00589217',
 '12078954',
 '09365288',
 '09545000',
 '06350127',
 '09476521',
 '08720280',
 '01679254',
 '00291004',
 '01421164',
 '02258065',
 '01954516',
 '13742840',
 '08821187',
 '01454702',
 '02978205',
 '11643506',
 '01570969',
 '08728882',
 '01064696',
 '08510456',
 '02056873',
 '13829047',
 '08778597',
 '06609785',
 '10365984',
 '12572188',
 '01566645',
 '02622408',
 '02683419',
 '11641963',
 '13486115',
 '12957467',
 '10541229',
 '02353529',
 '10352299',
 '00944449',
 '03747508',
 '11178161',
 '00502085',
 '00527367',
 '03830111',

In [136]:
wn18rr_ent_excluded = df_entity2text[df_entity2text["id"].isin(excluded_ent)]
wn18rr_ent_excluded = wn18rr_ent_excluded[["id", "definition"]]

In [138]:
wn18rr_ent_excluded.to_csv("wn18rr_ent_excluded.csv")

In [65]:
df_entity2text['descript_short'] = df_entity2text['description'].str.split().str[:5].str.join(' ')

In [66]:
df_entity2text[:5]

Unnamed: 0,id,definition,entity,description,entity_cleaned,segmented_entities,descript_short
0,14854262,"stool, solid excretory product evacuated from ...",stool,solid excretory product evacuated from the bo...,stool,[stool],solid excretory product evacuated from
1,590383,"chieftainship, the position of chieftain",chieftainship,the position of chieftain,chieftainship,[chieftainship],the position of chieftain
2,8769179,"saxony, an area in Germany around the upper El...",saxony,an area in Germany around the upper Elbe rive...,saxony,[saxony],an area in Germany around
3,2338145,"ondatra zibethica, beaver-like aquatic rodent ...",ondatra zibethica,beaver-like aquatic rodent of North America w...,ondatra zibethica,"[ondatra, zibethica]",beaver-like aquatic rodent of North
4,1990168,"founder, sink below the surface",founder,sink below the surface,founder,[founder],sink below the surface


In [67]:
df_entity2text.loc[df_entity2text['id'].isin(excluded_ent), 'entity'] = df_entity2text.loc[df_entity2text['id'].isin(excluded_ent), 'descript_short']    

In [34]:
averaged_embeddings

[array([ 0.10449219,  0.03057861, -0.13064575, -0.10180664, -0.1665039 ,
        -0.01174927, -0.04595947, -0.12854004,  0.0546875 ,  0.01757812,
         0.22607422, -0.04278564, -0.25732422,  0.05273438,  0.12597656,
         0.0078125 , -0.20751953,  0.5292969 , -0.02093506, -0.23535156,
         0.07348633,  0.02392578, -0.15722656,  0.26904297,  0.29345703,
         0.12084961, -0.15454102,  0.21044922,  0.09484863,  0.09234619,
         0.16421509, -0.19970703,  0.09472656,  0.07159424, -0.1352539 ,
        -0.12475586,  0.23876953,  0.17285156,  0.23860168,  0.31591797,
         0.03663635,  0.01696777, -0.51953125, -0.00292969, -0.20996094,
        -0.12109375,  0.19726562,  0.09472656, -0.20605469,  0.10333252,
        -0.22094727, -0.20654297,  0.21923828,  0.08447266, -0.29907227,
         0.06762695, -0.2581787 , -0.01696777, -0.18554688, -0.25146484,
        -0.09509277, -0.22200775, -0.14942932, -0.06091309, -0.09082031,
        -0.2919922 , -0.01513672,  0.1381836 , -0.3

In [64]:
embeddings.shape

torch.Size([14951, 300])

In [47]:
#torch.save(embeddings, "03_nlm_embeddings/word2vec_fb15k237/03_word2vec_wn18rr_300dim_ent_sorted.pt")

###### ------------------------------------

In [18]:
ent_wv = word2vec_embeddings(input_word2vec_ent, em_dim=200, wd=5) #whereby wd → window 
rel_wv = word2vec_embeddings(input_word2vec_rel, em_dim=200, wd=5)

In [13]:
ent_embeddings = avg_w2v_embeddings(entities, seg_entities, ent_wv[1])
#rel_embeddings = avg_w2v_embeddings(entityid_col, seg_entityid_col, rel_wv[1])

if 0 stool
if 1 chieftainship
if 2 saxony
else 3 ondatra zibethica
if 4 founder
else 5 rose campion
if 6 enjoyment
else 7 german shepherd dog
if 8 quilt
else 9 wake island
if 10 raise
if 11 admiralty
if 12 opener
if 13 watchman
if 14 reducing
if 15 twaddle
if 16 yodel
else 17 apostolic delegate
if 18 exhilaration
if 19 latticework
if 20 lifer
if 21 mandragora
if 22 bit
if 23 dispraise
else 24 genus bletia
else 25 silicon valley
if 26 rush
if 27 placodermi
if 28 idolization
if 29 rhincodontidae
if 30 taurotragus
else 31 saddam's martyrs
if 32 otariidae
if 33 preachification
else 34 vena peroneus
if 35 webb
if 36 sledgehammer
if 37 shipyard
if 38 adequateness
else 39 third estate
if 40 redoubt
if 41 freewheeler
else 42 union representative
if 43 sundacarpus
else 44 baffle board
if 45 squat
if 46 jacaranda
else 47 genus cheiranthus
if 48 supposition
else 49 st. lucia
if 50 wind
if 51 pyrrhula
if 52 slezsko
if 53 ramble
if 54 lobularity
else 55 family bacteroidaceae
else 56 leaf beetle
els

In [None]:
entityid_col=
seg_entityid_col=

In [19]:
averaged_embeddings = []
for idx in range(len(entities)):
    entity = entities.loc[idx]
    seg_entity = seg_entities.loc[idx]
    lst = []
    if entity in ent_wv[1]:
        print("if", idx, entity)
        averaged_embeddings.append(ent_wv[1][entity])
    else:
        print("else", idx, entity)
        for item in seg_entity:
            lst.append(ent_wv[1][item])
        avg_embdd = np.mean(lst, axis=0)
        averaged_embeddings.append(avg_embdd)

embeddings = torch.tensor(np.array(averaged_embeddings))  

if 0 stool
if 1 chieftainship
if 2 saxony
else 3 ondatra zibethica
if 4 founder
else 5 rose campion
if 6 enjoyment
else 7 german shepherd dog
if 8 quilt
else 9 wake island
if 10 raise
if 11 admiralty
if 12 opener
if 13 watchman
if 14 reducing
if 15 twaddle
if 16 yodel
else 17 apostolic delegate
if 18 exhilaration
if 19 latticework
if 20 lifer
if 21 mandragora
if 22 bit
if 23 dispraise
else 24 genus bletia
else 25 silicon valley
if 26 rush
if 27 placodermi
if 28 idolization
if 29 rhincodontidae
if 30 taurotragus
else 31 saddam's martyrs
if 32 otariidae
if 33 preachification
else 34 vena peroneus
if 35 webb
if 36 sledgehammer
if 37 shipyard
if 38 adequateness
else 39 third estate
if 40 redoubt
if 41 freewheeler
else 42 union representative
if 43 sundacarpus
else 44 baffle board
if 45 squat
if 46 jacaranda
else 47 genus cheiranthus
if 48 supposition
else 49 st. lucia
if 50 wind
if 51 pyrrhula
if 52 slezsko
if 53 ramble
if 54 lobularity
else 55 family bacteroidaceae
else 56 leaf beetle
els

In [21]:
rel_wv[1]

{'domain': array([-8.37855041e-04,  3.69424000e-04,  7.97398388e-03,  1.40769891e-02,
        -1.45358592e-02, -1.11200139e-02,  1.00919884e-02,  1.40202940e-02,
        -7.83660635e-03, -5.88026829e-03,  1.15320385e-02, -2.39604898e-03,
        -7.08845817e-03,  1.02407057e-02, -7.59400055e-03, -2.83752754e-03,
         4.49465588e-03,  1.54980272e-03, -1.29456483e-02, -1.47637781e-02,
         1.14246346e-02,  7.92228431e-03,  1.05588958e-02,  1.19197741e-03,
         9.92326625e-03, -5.32088429e-03, -1.47875212e-03,  9.01339576e-03,
        -1.17525589e-02, -6.15016185e-03, -1.17368475e-02, -1.45319104e-03,
         1.49033107e-02, -1.14361979e-02, -3.64651345e-03, -3.02772038e-03,
         1.26209948e-02, -9.26702470e-03,  7.05663115e-05, -7.42770918e-03,
        -1.50055476e-02,  7.82389566e-03, -1.36868525e-02, -6.86222687e-03,
        -5.48437238e-05, -4.62783501e-04, -1.19706877e-02,  1.50230359e-02,
         7.78446533e-03,  1.44267865e-02, -1.27467457e-02,  7.02468492e-03,
  

In [22]:
averaged_embeddings = []
for idx in range(len(rel_id_list)):
    entity = rel_id_list[idx]
    seg_entity = input_word2vec_rel[idx]
    lst = []
    if entity in rel_wv[1]:
        print("if", idx, entity)
        averaged_embeddings.append(rel_wv[1][entity])
    else:
        print("else", idx, entity)
        for item in seg_entity:
            lst.append(rel_wv[1][item])
        avg_embdd = np.mean(lst, axis=0)
        averaged_embeddings.append(avg_embdd)

embeddings = torch.tensor(np.array(averaged_embeddings))  

else 0 also see
else 1 derivationally related form
else 2 has part
if 3 hypernym
else 4 instance hypernym
else 5 member meronym
else 6 member of domain region
else 7 member of domain usage
else 8 similar to
else 9 synset domain topic of
else 10 verb group


In [35]:
ent_id_list

['/m/010016',
 '/m/0100mt',
 '/m/0102t4',
 '/m/0104lr',
 '/m/0105y2',
 '/m/0106dv',
 '/m/0108xl',
 '/m/0109vk',
 '/m/010bnr',
 '/m/010bxh',
 '/m/010cw1',
 '/m/010dft',
 '/m/010h9y',
 '/m/010hn',
 '/m/010m55',
 '/m/010nlt',
 '/m/010p3',
 '/m/010r6f',
 '/m/010rvx',
 '/m/010t4v',
 '/m/010tkc',
 '/m/010v8k',
 '/m/010xjr',
 '/m/010y34',
 '/m/010z5n',
 '/m/0113sg',
 '/m/0114m0',
 '/m/0118d3',
 '/m/011_3s',
 '/m/011_6p',
 '/m/011_vz',
 '/m/011hdn',
 '/m/011hq1',
 '/m/011j5x',
 '/m/011k11',
 '/m/011k1h',
 '/m/011k4g',
 '/m/011k_j',
 '/m/011kn2',
 '/m/011lpr',
 '/m/011lvx',
 '/m/011pcj',
 '/m/011s0',
 '/m/011s9r',
 '/m/011v3',
 '/m/011vx3',
 '/m/011w20',
 '/m/011w4n',
 '/m/011w54',
 '/m/011wdm',
 '/m/011wtv',
 '/m/011x_4',
 '/m/011xg5',
 '/m/011xhx',
 '/m/011xjd',
 '/m/011xy1',
 '/m/011ycb',
 '/m/011yd2',
 '/m/011ydl',
 '/m/011yfd',
 '/m/011yg9',
 '/m/011yhm',
 '/m/011ykb',
 '/m/011yl_',
 '/m/011yn5',
 '/m/011yph',
 '/m/011ypx',
 '/m/011yqc',
 '/m/011yr9',
 '/m/011yrp',
 '/m/011ys5',
 '/m/011ys

##### BERT 2 last layers

In [7]:
# Load raw embeddings 
bert_rel = torch.load('03_nlm_embeddings/bert_fb15k237/4lastlayers/00_bert_4lastlayers_fb15k237_rel.pt', map_location = torch.device(device))
# Store 2 second last layers separately
bert_rel_re = bert_rel[-1]
bert_rel_im = bert_rel[-2]

In [8]:
bert_ent = load_ent_embeddings('03_nlm_embeddings/bert_fb15k237/4lastlayers', device)
bert_ent_concat = concat_ent_embeddings(bert_ent)
bert_ent_re = bert_ent_concat[-1]
bert_ent_im = bert_ent_concat[-2]

['00_bert_4lastlayers_fb15k237_rel.pt', '01_bert_4lastlayers_fb15k237_ent.pt', '02_bert_4lastlayers_fb15k237_ent.pt', '03_bert_4lastlayers_fb15k237_ent.pt', '04_bert_4lastlayers_fb15k237_ent.pt', '05_bert_4lastlayers_fb15k237_ent.pt', '06_bert_4lastlayers_fb15k237_ent.pt']


In [15]:
bert_rel_re.shape

torch.Size([237, 40, 768])

In [11]:
bert_rel_re_sorted =  kg_mapping(rel_id_list, df_rel2text, bert_rel[-1])
bert_rel_im_sorted =  kg_mapping(rel_id_list, df_rel2text, bert_rel[-2])
bert_ent_re_sorted =  kg_mapping(ent_id_list, df_entity2text, bert_ent_concat[-1])
bert_ent_im_sorted =  kg_mapping(ent_id_list, df_entity2text, bert_ent_concat[-2])

In [16]:
bert_ent_re_sorted.shape

torch.Size([14505, 20, 768])

###### ---------------------

In [9]:
def kg_mapping(id_list, df, embeddings):
    entity_mapping = {}
    for item in id_list:
        # Get index of item in df entity2text
        idx = df[df["id"] == item].index.item()

        # Add tensor to dictionary, whereby key is the entity id 
        entity_mapping[item] = embeddings[idx] #ent_embedd_raw[idx]

    to_list = list(entity_mapping.values())
    embedd = torch.stack(to_list, 0)
    return embedd

In [92]:
entity_embedd.shape

torch.Size([40559, 300])

In [94]:
torch.save(entity_embedd, "03_nlm_embeddings/word2vec_wn18rr/05_word2vec_wn18rr_300dim_ent_sorted.pt")

##### Clean dataset (fb15k237)

In [65]:
df_entity2textlong[df_entity2textlong['id'].isin(excluded_ent)].to_csv('missing_w2v_entities.csv')

#### old version

In [None]:
# Word2Vec model
w2v_cbow = gensim.models.Word2Vec(input_word2vec, min_count=1, vector_size=256, window=5, sg=1)
word_vectors = w2v_cbow.wv.vectors  # Retrieve word vectors of type numpy array
wv_keys = list(w2v_cbow.wv.index_to_key)  # Retrieve keys to word vectors
wv_dict = res = {wv_keys[i]: word_vectors[i] for i in range(len(wv_keys))}  # Save word vectors with respective key in dictionary

In [None]:
# Compute average word embeddings for each KG relation
averaged_embeddings = []
for idx in range(len(df_entity2text)):
    entity = df_entity2text["entity"].loc[idx]
    seg_entity = df_entity2text["segmented_entities"].loc[idx]
    lst = []
    if entity in wv_dict:
        print("if", idx, entity)
        averaged_embeddings.append(wv_dict[entity])
    else:
        print("else", idx, entity)
        for item in seg_entity:
            lst.append(wv_dict[item])
        avg_embdd = np.mean(lst, axis=0)
        averaged_embeddings.append(avg_embdd)

In [None]:
len(averaged_embeddings)

In [None]:
#mapping = wv_dict
#embeddings = torch.from_numpy(word_vectors) #Convert to tensor to use as input for KGE

#df_relations["averaged embeddings"] = averaged_embeddings
embeddings = torch.tensor(np.array(averaged_embeddings))  # Convert to tensor to use as input for KGE
#mapping = df_relations

In [None]:
len(embeddings)

In [None]:
torch.save(embeddings, "03_nlm_embeddings/word2vec_fb15k237/01_word2vec_fb15k237_ent.pt")