## Adjacency Matrices for the Clustering Exercies

In [60]:
import json
import numpy as np
import networkx as nx
import importlib

import helper_functions as hf
importlib.reload(hf)

<module 'helper_functions' from '/Users/tiril/Documents/IndividualProject/nuclear_repo/knowledge_graphs/helper_functions.py'>

In [38]:
base_directory = 'data/triplets_no_cutoff'
save_directory = 'graphs/triplets_no_cutoff'

with open(base_directory + '/graphs.json', 'r') as file:
    results = json.load(file)

In [5]:
# Make relations dict
relations = set()
relations_dict = {}

for name, triplets in results.items():
    for triplet in triplets:
        _, _, _, _, rel = triplet
        relations.add(rel)

relations = list(relations)

for i in range(len((relations))):
    relations_dict[relations[i]] = i

for k, v in relations_dict.items():
    print(f"{k}: {v}")

residence: 0
country of origin: 1
founded by: 2
contains division: 3
division: 4
capital: 5
industry: 6
company: 7
ethnicity: 8
place of death: 9
nationality: 10
contains: 11
company location: 12
ethnic background: 13
child: 14
shareholder of: 15
neighborhood: 16
place of birth: 17


In [6]:
# Reduce dimensionality of relations (keep granularity only for 'important' organisational relations)
relations_dict = {

    # Organisational
    'division': 1,
    'contains division': 1,
    'contains': 2,
    'shareholder of': 3,
    'founded by': 4,
    'company': 5,
    'industry': 6,
    'company location': 7,

    # Other 
    'child': 0,
    'place of death': 0,
    'residence': 0,
    'neighborhood': 0,
    'capital': 0,
    'ethnic background': 0,
    'ethnicity': 0,
    'nationality': 0,
    'country of origin': 0,
    'place of birth': 0,
}

In [58]:
# Formula for adjacency matrix creation
def make_adj_matrices(results, relations_dict, max_depth=3, direction='bidirectional', include_entity_type=False):

    adj_matrices = {}

    # Make nodes dict
    print("Creating nodes dictionary...")
    unique_nodes = set()
    nodes_dict = {}
    roots = []

    for root, triplets in results.items():
        root_node = (root, 'ORG')
        try:
            G = hf.make_graph(triplets)
            G_pruned = hf.prune_graph_by_depth(G, root_node, max_depth, direction)
            for node in G_pruned.nodes():
                if include_entity_type:
                    unique_nodes.add(f"{node[0]}::{node[1]}")
                else:
                    unique_nodes.add(node[0])
            roots.append(root)
        except nx.NodeNotFound:
            print(f'Skipping for {root_node}')
            continue
        except hf.EmptyGraphError:
            print(f'Skipping for {root_node}')

    for i, node in enumerate(unique_nodes):
        nodes_dict[node] = i
    
    print(f"Nodes dictioary of length {len(nodes_dict)}")
    
    # Next, create adjacency matrices
    for root in roots:
        triplets = results[root]
        root_node = (root, 'ORG')
        #print(f'Creating adjacency matrices for {root} (ORG)')

        # 1. Create graph and prune it
        G = hf.make_graph(triplets)
        G_pruned = hf.prune_graph_by_depth(G, root_node, max_depth, direction)

        D = len(set(relations_dict.values())) # Indicates number of adjacency matrices used
        num_nodes = len(nodes_dict)

        # 2. Initialise empty matrices
        adj_matrix = np.zeros((num_nodes, num_nodes, D))

        # 3. Get edges (aggregate in case of multiple)
        for u, v, data in G_pruned.edges(data=True):
            if include_entity_type:
                head_with_ent = f"{u[0]}::{u[1]}"
                tail_with_ent = f"{v[0]}::{v[1]}"
            else:
                head_with_ent = u[0]
                tail_with_ent = v[0]

            relation_name = data.get('relation')

            relation_idx = relations_dict[relation_name]
            u_idx = nodes_dict[head_with_ent]
            v_idx = nodes_dict[tail_with_ent]

            adj_matrix[u_idx, v_idx, relation_idx] = 1
        
        adj_matrices[root] = adj_matrix
    
    return adj_matrices

In [79]:
# Make nodes dict (depending on depth of prune and whether entity tags are included)
base_directory = 'data/triplets_no_cutoff'
with open(base_directory + '/graphs.json', 'r') as file:
    results = json.load(file)

prune_on = 3
include_entity_type = False
direction = 'bidirectional'

adj_matrices = make_adj_matrices(results, relations_dict, prune_on, direction, include_entity_type=include_entity_type)
print(f"Shape: {adj_matrices['ARC'].shape}")

filepath = base_directory + f'/adj_matrices_reduced_rel_dim_pruned_{prune_on}_' + ('with' if include_entity_type else 'no') + '_tags.npz'
print(f"Saving to {filepath}")
np.savez_compressed(filepath, **adj_matrices)

Creating nodes dictionary...
Skipping for ('Elysium', 'ORG')
Skipping for ('HolosGen', 'ORG')
Skipping for ('Hyperion Power', 'ORG')
Skipping for ('StarCore Nuclear', 'ORG')
Skipping for ('Terrestial', 'ORG')
Nodes dictioary of length 1525
Shape: (1525, 1525, 8)
Saving to data/triplets_no_cutoff/adj_matrices_reduced_rel_dim_pruned_3_no_tags.npz


In [70]:
# Check that empty graphs aren't included (e.g. Elysium)
for i, (k, v) in enumerate(adj_matrices.items()):
    print(i+1, k, len(v))

1 ARC 2590
2 Babcock and Wilcox 2590
3 Berkeley 2590
4 BWX 2590
5 Flibe 2590
6 Framatome 2590
7 GE Hitachi 2590
8 General Atomics 2590
9 Holtec International 2590
10 Kairos Power 2590
11 Moltex Energy 2590
12 NANO Nuclear 2590
13 NuScale 2590
14 Oak Ridge National Laboratory 2590
15 Oklo 2590
16 TerraPower 2590
17 ThorCon 2590
18 Ultra Safe Nuclear Corporation 2590
19 Westinghouse 2590
20 X-Energy 2590
