## Create embeddings for semantic clustering

In [1]:
import json
import os

import numpy as np
import spacy

### Node embedding algorithm

In [None]:
# Load triplets: a JSON object mapping each root to its list of triplets.
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default locale encoding.
with open('../Method:Knowledge_Graphs/data/triplets_no_cutoff/graphs.json', encoding='utf-8') as f:
    results = json.load(f)

def embed_nodes(results, with_tag=True, nlp=None):
    """Embed the head and tail entity of every triplet and save them per root.

    For each root, the entities are embedded in triplet order (head, then
    tail) and stacked into one array. The arrays are written to
    ``data/node_embeddings_with_tags.npz`` or
    ``data/node_embeddings_no_tags.npz``, keyed by root.

    Args:
        results: Mapping of root -> list of triplets. Each triplet is indexed
            as ``[head_name, head_tag, tail_name, tail_tag, ...]``.
        with_tag: If True, append the integer-encoded entity tag
            (ORG=0, LOC=1, PER=2, MISC=3, anything else=-1) as one extra
            trailing value of every embedding.
        nlp: Optional callable that maps a string to an object exposing a
            ``.vector`` attribute, e.g. an already loaded spaCy pipeline.
            When omitted, ``en_core_web_md`` is loaded.

    Returns:
        None. The embeddings are only written to disk.
    """
    print(f"Embedding nodes for with_tag={with_tag}")
    tags_dict = {
        'ORG': 0,
        'LOC': 1,
        'PER': 2,
        'MISC': 3,
    }

    if nlp is None:
        nlp = spacy.load('en_core_web_md')

    # The same entity name can appear in many triplets; embed each distinct
    # name only once per call instead of re-running the pipeline every time.
    name_vectors = {}

    data_emb = {}
    for root, triplets in results.items():
        rows = []
        for triplet in triplets:
            # Head is (name, tag) at indices 0/1, tail at indices 2/3.
            for name, raw_tag in ((triplet[0], triplet[1]), (triplet[2], triplet[3])):
                vector = name_vectors.get(name)
                if vector is None:
                    # 1-D document vector (300 values for en_core_web_md)
                    vector = name_vectors[name] = nlp(name).vector

                if with_tag:
                    tag = tags_dict.get(raw_tag, -1)  # -1 marks an unknown tag
                    vector = np.concatenate([vector, np.array([tag])])

                rows.append(vector)
        # A root without triplets yields an empty array here.
        data_emb[root] = np.array(rows)

    tag_part = 'with_tags' if with_tag else 'no_tags'
    filename = f"data/node_embeddings_{tag_part}.npz"

    # np.savez does not create missing parent directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    print(f'Saving node embeddings to {filename}')
    # NOTE: root names are passed as keyword arguments, so they must be
    # strings and must not clash with np.savez's own parameter names.
    np.savez(filename, **data_emb)

##############################
# Write both variants of the node embeddings: with and without the tag feature.
for with_tag in (True, False):
    embed_nodes(results, with_tag=with_tag)

In [None]:
# Get different aggregate embedding statistics for use later
def make_node_clustering_input(with_tags):
    """Aggregate each root's node embeddings into summary vectors and save them.

    Reads ``data/node_embeddings_{with_tags}_tags.npz`` and, for every root
    with a non-empty array, reduces the embeddings along axis 0 with min, max,
    mean and median. The result is written to
    ``input/node_embeddings_{with_tags}_tags.json`` shaped as
    ``{stat_name: {root: [values]}}``.

    Args:
        with_tags: Filename infix selecting the embedding variant; the caller
            in this file passes ``'no'`` or ``'with'``.

    Returns:
        None. The statistics are only written to disk.
    """
    functions = {
        'min': np.min,
        'max': np.max,
        'mean': np.mean,
        'median': np.median,
    }
    stats = {func_name: {} for func_name in functions}

    # Width of the last non-empty embedding array; only used for the log line.
    length = None

    # np.load returns a lazily-read archive that holds the file open, so use
    # it as a context manager to release the handle when done.
    with np.load(f'data/node_embeddings_{with_tags}_tags.npz') as data_emb:
        for root, embeddings in data_emb.items():
            if embeddings.size == 0:
                # Roots without any embedded node have nothing to aggregate.
                continue

            for func_name, func in functions.items():
                stats[func_name][root] = func(embeddings, axis=0).tolist()
            length = embeddings.shape[1]

    filename = f'input/node_embeddings_{with_tags}_tags.json'
    print(f"Saving to {filename}, Embedding length: {length}")

    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as f:
        json.dump(stats, f, indent=4)

##############################
# Build the clustering input for the untagged and the tagged embeddings.
for with_tags in ('no', 'with'):
    make_node_clustering_input(with_tags)

### Relation embedding algorithm

In [36]:
# Data prep: load the triplets (root -> list of triplets).
# JSON text is UTF-8, so decode it explicitly instead of relying on the
# platform's default locale encoding.
with open('../Method:Knowledge_Graphs/data/triplets_no_cutoff/graphs.json', encoding='utf-8') as f:
    results = json.load(f)

# Keep only the relation, stored as the last element of every triplet.
data = {
    root: [triplet[-1] for triplet in triplets]
    for root, triplets in results.items()
}

nlp = spacy.load('en_core_web_md')

# The same relation label can occur many times; embed each distinct label once
# instead of re-running the pipeline for every occurrence.
relation_vectors = {}

data_emb = {}
for root, relations in data.items():
    relation_emb = []
    for relation in relations:
        rel_emb = relation_vectors.get(relation)
        if rel_emb is None:
            rel_emb = relation_vectors[relation] = nlp(relation).vector
        relation_emb.append(rel_emb)

    # One row per triplet; a root without triplets yields an empty array.
    data_emb[root] = np.array(relation_emb)

# np.savez does not create missing parent directories.
os.makedirs('data', exist_ok=True)
# np.savez appends the ".npz" extension to the given path.
np.savez('data/relation_embeddings', **data_emb)

In [None]:
def make_relation_clustering_input():
    """Aggregate each root's relation embeddings into summary vectors and save them.

    Reads ``data/relation_embeddings.npz`` and, for every root with a
    non-empty array, reduces the embeddings along axis 0 with min, max, mean
    and median. The result is written to ``input/relation_embeddings.json``
    shaped as ``{stat_name: {root: [values]}}``.

    Returns:
        None. The statistics are only written to disk.
    """
    functions = {
        'min': np.min,
        'max': np.max,
        'mean': np.mean,
        'median': np.median,
    }

    stats = {func_name: {} for func_name in functions}

    # Width of the last non-empty embedding array; only used for the log line.
    length = None

    # np.load returns a lazily-read archive that holds the file open, so use
    # it as a context manager to release the handle when done.
    with np.load('data/relation_embeddings.npz') as data_emb:
        for root, embeddings in data_emb.items():
            if embeddings.size == 0:
                # Roots without any relation have nothing to aggregate.
                continue

            for func_name, func in functions.items():
                stats[func_name][root] = func(embeddings, axis=0).tolist()
            length = embeddings.shape[1]

    filename = 'input/relation_embeddings.json'
    print(f"Saving to {filename}, Embedding length: {length}")

    # open() does not create missing parent directories.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'w') as f:
        json.dump(stats, f, indent=4)
    
##############
# Build input/relation_embeddings.json from the saved relation embeddings.
make_relation_clustering_input()