## Code from OntoZSL (TransE+Textual embeddings)

### Imports and paths initialization

In [6]:
# -*- coding: utf-8 -*-
import os
import re
import codecs
import argparse
import pickle as pkl
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset


def parse_args(args=None):
    parser = argparse.ArgumentParser(
        description='Data pre-process',
        usage=''
    )
    parser.add_argument('--data_dir', type=str, default='data')
    parser.add_argument('--onto_dir', type=str, default='ontology')
    parser.add_argument('--glove_dir', type=str, default='glove')
    parser.add_argument('--struct_embeds_fn', default='Onto_TransE.pkl', help='')
    parser.add_argument('--word_embeds_fn', default='Onto_Text_Embed.pkl', help='')
    parser.add_argument('--triples_fn', default='triples_names_htr.txt', help='')
    parser.add_argument('--entities_fn', default='entities.dict', help='')
    parser.add_argument('--entities_names_fn', default='entities_names.dict', help='')
    parser.add_argument('--entities_embed_fn', default='entity_500.npy', help='entities embedding filename')
    parser.add_argument('--relations_embed_fn', default='relation_500.npy', help='relations embedding filename')
    parser.add_argument('--struct_embed_size', default=100, type=int, help='entity structural embeddings size')
    parser.add_argument('--text_embed_size', default=300, type=int, help='entity textual embeddings size')
    parser.add_argument('--mapping_size', default=100, type=int, help='hidden layer size')
    parser.add_argument('--dropout_ratio', default=0.5, type=float, help='')
    parser.add_argument('--margin', default=10, type=int, help='')
    parser.add_argument('--training_epochs', default=10, type=int, help='')
    parser.add_argument('--batch_size', default=100, type=int, help='')
    parser.add_argument('--display_loss_step', default=1000, type=int, help='')
    parser.add_argument('--initial_learning_rate', default=0.001, type=float, help='')
    parser.add_argument('--activation_function', default='', help='')
    parser.add_argument('--warm_up_steps', default=0, type=int)

    return parser.parse_args(args)


# set the path to your data folders here
param = parse_args(args=['--data_dir', '../persistent/data',
                         '--entities_embed_fn', 'entity_500.npy',
                         '--relations_embed_fn', 'relation_500.npy'])
onto_dir_path = os.path.join(param.data_dir, param.onto_dir)

### Combining the entities and relations embeddings into one

In [8]:
def loadDict(file_name):
    entities = list()
    wnids = open(file_name, 'r')
    try:
        for line in wnids:
            line = line[:-1]
            index, cls = line.split('\t')
            entities.append(cls)
    finally:
        wnids.close()
    print(len(entities))
    return entities


entity_file = os.path.join(onto_dir_path, 'entities_names.dict')
relation_file = os.path.join(onto_dir_path, 'relations.dict')


# load entity dict
entities = loadDict(entity_file)
relations = loadDict(relation_file)

embed_dir = os.path.join(onto_dir_path, 'save_onto_embeds')

ent_embed_file = os.path.join(embed_dir, param.entities_embed_fn)
rel_embed_file = os.path.join(embed_dir, param.relations_embed_fn)

ent_embeds = np.load(ent_embed_file)
print(ent_embeds.shape)

rel_embeds = np.load(rel_embed_file)
print(rel_embeds.shape)

embed_dict = dict()
for i, ent in enumerate(entities):
    embed_dict[ent] = ent_embeds[i].astype('float32')
for i, rel in enumerate(relations):
    embed_dict[rel] = rel_embeds[i].astype('float32')

print(len(embed_dict.keys()))


embeddings_path = os.path.join(onto_dir_path, 'embeddings')
if not os.path.exists(embeddings_path):
    os.makedirs(embeddings_path)

with open(os.path.join(onto_dir_path, 'embeddings', 'Onto_TransE.pkl'), 'wb') as f:
    pkl.dump(embed_dict, f)


124749
5
(124749, 100)
(5, 100)
124597


### Getting the textual embedding for each entity

In [3]:
def get_embedding(entity_str, word_vectors, get_vector):
    try:
        feat = get_vector(word_vectors, entity_str)
        return feat
    except:
        feat = np.zeros(WORD_VEC_LEN)

    str_set = filter(None, re.split("[ \-_]+", entity_str))
    str_set = list(str_set)
    cnt_word = 0
    for i in range(len(str_set)):
        temp_str = str_set[i]
        try:
            now_feat = get_vector(word_vectors, temp_str)
            feat = feat + now_feat
            cnt_word = cnt_word + 1
        except:
            continue

    if cnt_word > 0:
        feat = feat / cnt_word
    return feat


def generate_text_embedding(ent2doc):

    # all_feats = list()

    has = 0
    cnt_missed = 0
    missed_list = []
    entities2vec = dict()
    for ent, doc in ent2doc.items():
        feat = np.zeros(shape=WORD_VEC_LEN, dtype='float32')

        doc = doc.replace('_', ' ')
        doc = doc.replace('-', ' ')
        # print(doc)

        options = doc.split()
        cnt_word = 0

        for option in options:
            now_feat = get_embedding(option.strip(), word_vectors, get_vector)
            if np.abs(now_feat.sum()) > 0:
                cnt_word += 1
                feat += now_feat
        if cnt_word > 0:
            feat = feat / cnt_word

        # if cnt_word != len(options):
            # print(ent, 'count:', cnt_word)

        if np.abs(feat.sum()) == 0:
            # print('cannot find word ' + class_name)
            cnt_missed = cnt_missed + 1
            missed_list.append(ent + "###" + doc)
            feat = feat

        else:
            has += 1
            feat = feat / (np.linalg.norm(feat) + 1e-6)

        # all_feats.append(feat)
            entities2vec[ent] = feat

    # all_feats = np.array(all_feats)
    # print(all_feats.shape)
    # for each in missed_list:
        # print(each)
    print('does not have semantic embedding: ', cnt_missed, 'has: ', has)

    return entities2vec
    # entities2vec = dict()
    # for i, ent in enumerate(ent_list):
    #     entities2vec[ent] = ent_matrix[i]
    #     # print(ent_matrix[i])
    #
    # return entities2vec


def glove_google(word_vectors, word):
    return word_vectors[word]


def get_glove_dict(txt_dir):
    print('load glove word embedding')
    txt_file = os.path.join(txt_dir, 'glove.6B.300d.txt')
    word_dict = {}
    feat = np.zeros(WORD_VEC_LEN)
    with open(txt_file) as fp:
        for line in fp:
            words = line.split()
            assert len(words) - 1 == WORD_VEC_LEN
            for i in range(WORD_VEC_LEN):
                feat[i] = float(words[i+1])
            feat = np.array(feat)
            word_dict[words[0]] = feat
    print('loaded to dict!')
    return word_dict


def readTxt(file_name):
    class_list = list()
    wnids = open(file_name, 'r')
    try:
        for line in wnids:
            line = line[:-1]
            class_list.append(line)
    finally:
        wnids.close()
    print(len(class_list))
    return class_list


def loadDict(file_name):
    entities = list()
    wnids = open(file_name, 'r')
    try:
        for line in wnids:
            line = line[:-1]
            index, cls = line.split('\t')
            entities.append(cls)
    finally:
        wnids.close()
    print(len(entities))
    return entities


def load_domain_range(triples_file):
    text_file = codecs.open(triples_file, "r", "utf-8")
    lines = text_file.readlines()
    triples = list()
    for line in lines:
        line_arr = line.rstrip("\r\n").split("\t")
        head = line_arr[0]
        rel = line_arr[1]
        tail = line_arr[2]
        triples.append((head, rel, tail))
    return triples


WORD_VEC_LEN = 300
word_vectors = get_glove_dict(os.path.join(param.data_dir, param.glove_dir))
get_vector = glove_google

entity_text_file = os.path.join(onto_dir_path, 'entities_names.dict')
entity_file = os.path.join(onto_dir_path, 'entities.dict')
triples_file = os.path.join(onto_dir_path, 'triples_names.txt')

entities = loadDict(entity_file)
entities_names = loadDict(entity_text_file)

ent2doc = dict()
with open(entity_text_file) as f_doc:
    lines = f_doc.readlines()

    for i in range(len(lines)):
        entity_text = lines[i].strip().split('\t')[1].strip()
        ent2doc[entity_text] = entity_text


entities2vec = generate_text_embedding(ent2doc)
print(len(entities2vec.keys()))


with open(os.path.join(onto_dir_path, 'embeddings', 'Onto_Text_Embed.pkl'), 'wb') as f:
    pkl.dump(entities2vec, f)


load glove word embedding
loaded to dict!
124749
124749
does not have semantic embedding:  761 has:  123831
123831


### Processing triples (hrt to htr)

In [28]:
triples_file = os.path.join(onto_dir_path, 'triples_names.txt')
save_file = os.path.join(onto_dir_path, 'triples_names_htr.txt')

wr_fp = open(save_file, 'w')
text_file = codecs.open(triples_file, "r", "utf-8")
lines = text_file.readlines()
for line in lines:
    line_arr = line.rstrip("\r\n").split("\t")
    head = line_arr[0]
    rel = line_arr[1]
    tail = line_arr[2]

    wr_fp.write('%s\t%s\t%s\n' % (head, tail, rel))

wr_fp.close()

print(f'Wrote (h, t, r) triples to {save_file}')

Wrote (h, t, r) triples to ../persistent/data/ontology/triples_names_htr.txt


### Model training for text-aware embedding (PyTorch)

#### Model and DataLoader

In [49]:
class TrainDataSet(Dataset):

    def __init__(self, data_dir, onto_dir, struct_embeds_fn, word_embeds_fn, triples_fn, entities_fn, entities_names_fn):
        data_path = os.path.join(data_dir, onto_dir)

        with open(os.path.join(data_path, 'embeddings', struct_embeds_fn), 'rb') as f:
            self.struct_embeds = pkl.load(f)
        with open(os.path.join(data_path, 'embeddings', word_embeds_fn), 'rb') as f:
            self.word_embeds = pkl.load(f)

        self.triples = []
        with open(os.path.join(data_path, triples_fn)) as f:
            for line in f:
                h, t, r = line.strip().split('\t')
                if h in self.struct_embeds and h in self.word_embeds and \
                        t in self.struct_embeds and t in self.word_embeds:
                    # filtering triples missing an embedding
                    self.triples.append((h, t, r))
        self.triples_set = set(self.triples)

        """self.entities = []
        with open(os.path.join(data_path, entities_fn)) as f:
            for line in f:
                eid, e = line.strip().split('\t')
                if e in self.struct_embeds and e in self.word_embeds:
                    # filtering entities missing an embedding
                    self.entities.append(e)"""

        self.entities_names = []
        with open(os.path.join(data_path, entities_names_fn)) as f:
            for line in f:
                eid, e = line.strip().split('\t')
                if e in self.struct_embeds and e in self.word_embeds:
                    self.entities_names.append(e)

        print(f'Number of triples: {len(self.triples)}\nNumber of entities_names: {len(self.entities_names)}\n')

        self.transform = torch.tensor

    def __len__(self):
        return len(self.triples)

    def __getitem__(self, idx):
        pos_triple = self.triples[idx]
        neg_triple_head = self.sample_negative_head(pos_triple)
        neg_triple_tail = self.sample_negative_tail(pos_triple)

        return (self.transform(self.struct_embeds[pos_triple[2]]),
                self.transform(self.struct_embeds[pos_triple[0]]),
                self.transform(self.struct_embeds[pos_triple[1]]),
                self.transform(self.word_embeds[pos_triple[0]]),
                self.transform(self.word_embeds[pos_triple[1]]),
                self.transform(self.struct_embeds[neg_triple_head[0]]),
                self.transform(self.struct_embeds[neg_triple_tail[1]]),
                self.transform(self.word_embeds[neg_triple_head[0]]),
                self.transform(self.word_embeds[neg_triple_tail[1]]))

    def sample_negative_head(self, triple_to_corrupt):
        for i in range(len(self.entities_names)):
            index = np.random.randint(0, len(self.entities_names))
            random_entity = self.entities_names[index]
            if random_entity != triple_to_corrupt[0]:
                negative_triple = (random_entity, triple_to_corrupt[1], triple_to_corrupt[2])
                if negative_triple not in self.triples:
                    return negative_triple
        negative_triple = (triple_to_corrupt[1], triple_to_corrupt[1], triple_to_corrupt[2])
        return negative_triple

    def sample_negative_tail(self, triple_to_corrupt):
        for i in range(len(self.entities_names)):
            index = np.random.randint(0, len(self.entities_names))
            random_entity = self.entities_names[index]
            if random_entity != triple_to_corrupt[1]:
                negative_triple = (triple_to_corrupt[0], random_entity, triple_to_corrupt[2])
                if negative_triple not in self.triples:
                    return negative_triple
        negative_triple = (triple_to_corrupt[0], triple_to_corrupt[0], triple_to_corrupt[2])
        return negative_triple


class EmbeddingModel(nn.Module):

    def __init__(self, struct_embed_size, text_embed_size, dropout_rate=0.5, out_size=100):
        super(EmbeddingModel, self).__init__()

        self.rel = nn.Linear(struct_embed_size, out_size)
        self.fc_head_struct_pos = nn.Linear(struct_embed_size, out_size)
        self.fc_head_struct_neg = nn.Linear(struct_embed_size, out_size)
        self.fc_head_text_pos = nn.Linear(text_embed_size, out_size)
        self.fc_head_text_neg = nn.Linear(text_embed_size, out_size)
        self.fc_tail_struct_pos = nn.Linear(struct_embed_size, out_size)
        self.fc_tail_struct_neg = nn.Linear(struct_embed_size, out_size)
        self.fc_tail_text_pos = nn.Linear(text_embed_size, out_size)
        self.fc_tail_text_neg = nn.Linear(text_embed_size, out_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, data):
        rel, head_struct_pos, tail_struct_pos, head_text_pos, tail_text_pos, \
             head_struct_neg, tail_struct_neg, head_text_neg, tail_text_neg = data
        rel = self.dropout(F.relu(self.rel(rel)))
        hsp = self.dropout(F.relu(self.fc_head_struct_pos(head_struct_pos)))
        htp = self.dropout(F.relu(self.fc_head_text_pos(head_text_pos)))
        hsn = self.dropout(F.relu(self.fc_head_struct_neg(head_struct_neg)))
        htn = self.dropout(F.relu(self.fc_head_text_neg(head_text_neg)))
        tsp = self.dropout(F.relu(self.fc_tail_struct_pos(tail_struct_pos)))
        ttp = self.dropout(F.relu(self.fc_tail_text_pos(tail_text_pos)))
        tsn = self.dropout(F.relu(self.fc_tail_struct_neg(tail_struct_neg)))
        ttn = self.dropout(F.relu(self.fc_tail_text_neg(tail_text_neg)))

        return (rel, hsp, tsp, htp, ttp, hsn, tsn, htn, ttn)

    def score(self, data):
        rel, head_struct_pos, tail_struct_pos, head_text_pos, tail_text_pos, \
             head_struct_neg, tail_struct_neg, head_text_neg, tail_text_neg = data

        # head
        head_fs_pos = torch.sum(abs(head_struct_pos + rel - tail_struct_pos), 1, keepdim=True)
        head_fs_neg = torch.sum(abs(head_struct_pos + rel - tail_struct_neg), 1, keepdim=True)

        head_fts_pos = torch.sum(abs(head_text_pos + rel - tail_struct_pos), 1, keepdim=True)
        head_fts_neg = torch.sum(abs(head_text_pos + rel - tail_struct_neg), 1, keepdim=True)

        head_fst_pos = torch.sum(abs(head_struct_pos + rel - tail_text_pos), 1, keepdim=True)
        head_fst_neg = torch.sum(abs(head_struct_pos + rel - tail_text_neg), 1, keepdim=True)

        head_ft_pos = torch.sum(abs(head_text_pos + rel - tail_text_pos), 1, keepdim=True)
        head_ft_neg = torch.sum(abs(head_text_pos + rel - tail_text_neg), 1, keepdim=True)

        head_fadd_pos = torch.sum(abs((head_struct_pos + head_text_pos)
                                      + rel - (tail_struct_pos + tail_text_pos)), 1, keepdim=True)
        head_fadd_neg = torch.sum(abs((head_struct_pos + head_text_pos)
                                      + rel - (tail_struct_neg + tail_text_neg)), 1, keepdim=True)

        head_ftotal_pos = torch.sum(torch.cat([head_fs_pos, head_fts_pos, head_fst_pos,
                                     head_ft_pos, head_fadd_pos]), 0, keepdim=False)
        head_ftotal_neg = torch.sum(torch.cat([head_fs_neg, head_fts_neg, head_fst_neg,
                                     head_ft_neg, head_fadd_neg]), 0, keepdim=False)

        # tail
        tail_fs_pos = torch.sum(abs(tail_struct_pos - rel - head_struct_pos), 1, keepdim=True)
        tail_fs_neg = torch.sum(abs(tail_struct_pos - rel - head_struct_neg), 1, keepdim=True)

        tail_fts_pos = torch.sum(abs(tail_text_pos - rel - head_struct_pos), 1, keepdim=True)
        tail_fts_neg = torch.sum(abs(tail_text_pos - rel - head_struct_neg), 1, keepdim=True)

        tail_fst_pos = torch.sum(abs(tail_struct_pos - rel - head_text_pos), 1, keepdim=True)
        tail_fst_neg = torch.sum(abs(tail_struct_pos - rel - head_text_neg), 1, keepdim=True)

        tail_ft_pos = torch.sum(abs(tail_text_pos - rel - head_text_pos), 1, keepdim=True)
        tail_ft_neg = torch.sum(abs(tail_text_pos - rel - head_text_neg), 1, keepdim=True)

        tail_fadd_pos = torch.sum(abs((tail_struct_pos + tail_text_pos)
                                      - rel - (head_struct_pos + head_text_pos)), 1, keepdim=True)
        tail_fadd_neg = torch.sum(abs((tail_struct_pos + tail_text_pos)
                                      - rel - (head_struct_neg + head_text_neg)), 1, keepdim=True)

        tail_ftotal_pos = torch.sum(torch.cat([tail_fs_pos, tail_fts_pos, tail_fst_pos,
                                     tail_ft_pos, tail_fadd_pos]), 0, keepdim=False)
        tail_ftotal_neg = torch.sum(torch.cat([tail_fs_neg, tail_fts_neg, tail_fst_neg,
                                     tail_ft_neg, tail_fadd_neg]), 0, keepdim=False)

        return head_ftotal_pos, head_ftotal_neg, tail_ftotal_pos, tail_ftotal_neg

    def loss(self, margin, data):
        # rel, hsp, tsp, htp, ttp, hsn, tsn, htn, ttn = data
        score_head_pos, score_head_neg, score_tail_pos, score_tail_neg = self.score(data)
        print(f'Score head pos: {score_head_pos}\nScore head neg: {score_head_neg}\nScore tail pos: {score_tail_pos}\nScore tail neg:{score_tail_neg}')
        print(f'loss_1: {margin - score_head_neg + score_head_pos}')
        print(f'loss_2: {margin - score_tail_neg + score_tail_pos}')
        loss_1 = torch.maximum(torch.tensor((0.)), margin - score_head_neg + score_head_pos)
        loss_2 = torch.maximum(torch.tensor((0.)), margin - score_tail_neg + score_tail_pos)
        # loss_1 = margin - score_head_neg + score_head_pos
        # loss_2 = margin - score_tail_neg + score_tail_pos
        return loss_1 + loss_2

#### Training

In [50]:
param = parse_args(args=['--data_dir', '../persistent/data',
                         '--entities_embed_fn', 'entity_500.npy',
                         '--relations_embed_fn', 'relation_500.npy',
                         '--struct_embed_size', '100',
                         '--text_embed_size', '300',
                         '--display_loss_step', '10',
                         '--margin', '10'])

# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Set fixed random number seed
torch.manual_seed(28062022)

training_data = TrainDataSet(param.data_dir, param.onto_dir,
                             param.struct_embeds_fn, param.word_embeds_fn,
                             param.triples_fn, param.entities_fn,
                             param.entities_names_fn)
train_dataloader = DataLoader(training_data, batch_size=param.batch_size, shuffle=True)

model = EmbeddingModel(param.struct_embed_size, param.text_embed_size).to(device)

# loss_function

current_learning_rate = param.initial_learning_rate
optimizer = torch.optim.Adam(model.parameters(), lr=current_learning_rate)

initial_valid_loss = 100

training_losses = []

# Run the training loop
for epoch in range(param.training_epochs):

    # Print epoch
    print(f'Starting epoch {epoch+1}')

    # Set current loss value
    current_loss = 0.0

    # Iterate over the DataLoader for training data
    for i, data in enumerate(train_dataloader, 1):

        # Get inputs
        for d in data:
            d.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Perform forward pass
        outputs = model(data)

        # Compute loss
        loss = model.loss(param.margin, outputs)
        training_losses.append(loss)

        # Perform backward pass
        loss.backward()

        # Perform optimization
        optimizer.step()

        # Print statistics
        print(f'Loss: {loss.item()}')
        current_loss += loss.item()
        if i % param.display_loss_step == param.display_loss_step-1:
            print('Loss after mini-batch %5d: %.3f' % (i, current_loss / param.display_loss_step))
            current_loss = 0.0

# Extracting and saving the embeddings


Using cpu device
Number of triples: 547034
Number of entities_names: 123951

Starting epoch 1
Score head pos: tensor([3431.1870], grad_fn=<SumBackward1>)
Score head neg: tensor([3532.3328], grad_fn=<SumBackward1>)
Score tail pos: tensor([3431.1870], grad_fn=<SumBackward1>)
Score tail neg:tensor([3606.8984], grad_fn=<SumBackward1>)
loss_1: tensor([-91.1458], grad_fn=<AddBackward0>)
loss_2: tensor([-165.7114], grad_fn=<AddBackward0>)
Loss: 0.0
Score head pos: tensor([3426.9138], grad_fn=<SumBackward1>)
Score head neg: tensor([3498.9270], grad_fn=<SumBackward1>)
Score tail pos: tensor([3426.9138], grad_fn=<SumBackward1>)
Score tail neg:tensor([3590.9128], grad_fn=<SumBackward1>)
loss_1: tensor([-62.0132], grad_fn=<AddBackward0>)
loss_2: tensor([-153.9990], grad_fn=<AddBackward0>)
Loss: 0.0
Score head pos: tensor([3402.5374], grad_fn=<SumBackward1>)
Score head neg: tensor([3516.0481], grad_fn=<SumBackward1>)
Score tail pos: tensor([3402.5371], grad_fn=<SumBackward1>)
Score tail neg:tensor(

KeyboardInterrupt: 