In [1]:
# The purpose of this notebook is to Validate JSON Knowledge Graph against schema, uniqueness, and referential integrity.
# Each cell is generally recommended to use to validate patterns.
# Each cell will work for millions of nodes and rels, but if each node and rel are generated then is validation needed for each to SCHEMA?

In [2]:
# Import python3 modules
import os
import json as json
import jsonschema as jsonschema
import pandas as pd
from loky import get_reusable_executor

In [3]:
# Load JSON KG and JKG Schema from local directory - more than 1 min per GB to load 
with open('JKG.json') as file: JKG = json.load(file)
with open('JKG_Schema.json') as file: JKG_Schema = json.load(file)

In [4]:
# Divide JKG_Schema into top level and sub-schemas for parallel processing
nodes_schema = JKG_Schema['properties'].pop('nodes')
rels_schema = JKG_Schema['properties'].pop('rels')

In [5]:
# Validate JKG against top level of JKG_Schema
try:
    jsonschema.validate(JKG,JKG_Schema)
    print("JKG data has nodes and rels lists.")
except jsonschema.exceptions.ValidationError as e:
    print(f"JKG data is INVALID: {e.message}")

JKG data has nodes and rels lists.


In [6]:
# Validate node items to SCHEMA in parallel ~ one million nodes runs 3.5 minutes on Apple M1 Max with 32GB Memory
# YOU MAY NOT WANT TO RUN THIS CELL for 10 MILLIONS of Nodes
def validate_items(items,s,f):
    try:
        jsonschema.validate(items,nodes_schema)
        #print(f"Processed successfully nodes: {s} to {f}")
    except jsonschema.exceptions.ValidationError as e:
        print(f"Processing nodes: {s} to {f}")
        print(f"INVALID: node row: {e.json_path}, {e.message}")

max_index = len(JKG['nodes'])-1
executor = get_reusable_executor(max_workers=10, timeout=3)
print(f"Schema validation begins for nodes 0 to {max_index}")
print("In each 1000 nodes up to one INVALID node is flagged.")
for i in range(int(max_index / 1000)):
    s = i*1000
    f = s+1000
    executor.submit(validate_items,JKG['nodes'][s:f],s,f)
s = int(max_index / 1000) * 1000
f = s + (max_index % 1000)
executor.submit(validate_items,JKG['nodes'][s:f],s,f)

Schema validation begins for nodes 0 to 11025251
In each 1000 nodes up to one INVALID node is flagged.


<Future at 0x5a4000070 state=pending>

In [7]:
# Validate rel items to SCHEMA in parallel ~ one million rels runs 3.5 minutes on Apple M1 Max with 32GB Memory
# YOU MAY NOT WANT TO RUN THIS CELL for 10 MILLIONS of Rels
def validate_items(items,s,f):
    try:
        jsonschema.validate(items,rels_schema)
        #print(f"Processed successfully rels: {s} to {f}")
    except jsonschema.exceptions.ValidationError as e:
        print(f"Processing rels: {s} to {f}")
        print(f"INVALID: rel row: {e.json_path}, {e.message}")

max_index = len(JKG['rels'])-1
executor = get_reusable_executor(max_workers=10, timeout=3)
print(f"Schema validation begins for rels 0 to {max_index}")
print("In each 1000 rels up to one INVALID rel is flagged.")
for i in range(int(max_index / 1000)):
    s = i*1000
    f = s+1000
    executor.submit(validate_items,JKG['rels'][s:f],s,f)
s = int(max_index / 1000) * 1000
f = s + (max_index % 1000)
executor.submit(validate_items,JKG['rels'][s:f],s,f)

Schema validation begins for rels 0 to 21336318
In each 1000 rels up to one INVALID rel is flagged.


<Future at 0x755c679fd0 state=pending>

In [8]:
# Load rels and nodes into Data Frames
rels = pd.DataFrame(JKG['rels'])
nodes = pd.DataFrame(JKG['nodes'])

df = pd.json_normalize(rels)
starts = pd.json_normalize(rels.start)
ends = pd.json_normalize(rels.end)
df = pd.json_normalize(rels.properties)
df = pd.concat([rels.label.reset_index(drop=True),df.reset_index(drop=True)], axis=1)
rels = df

df = pd.json_normalize(nodes.properties)
df = pd.concat([nodes.labels.reset_index(drop=True),df.reset_index(drop=True)], axis=1)
nodes = df

IOStream.flush timed out
IOStream.flush timed out


In [9]:
# Validate Uniqueness of node ids, sabs, node_labels, rel_labels
# df is nodes carried forward from last notebook cell

# Check duplicate node id
duplicates = df[df.duplicated(subset=['id'], keep=False)]
if not duplicates.empty:
    print("The following node id are not unique:\n", duplicates)
else:
    print("All node id are unique.")
    
# Subset nodes to Source and Check duplicate sab
fdf = df[df['labels'].apply(lambda x: 'Source' in x)]
duplicates = fdf[fdf.duplicated(subset=['sab'], keep=False)]
if not duplicates.empty:
    print("The following Source sab are not unique:\n", duplicates)
else:
    print("All Source sab are unique.")
    
# Subset nodes to Node_Label and Check duplicate node_label
fdf = df[df['labels'].apply(lambda x: 'Node_Label' in x)]
duplicates = fdf[fdf.duplicated(subset=['node_label'], keep=False)]
if not duplicates.empty:
    print("The following Node_Label node_label are not unique:\n", duplicates)
else:
    print("All Node_Label node_label are unique.")
    
# Subset nodes to Rel_Label and Check duplicate rel_label
fdf = df[df['labels'].apply(lambda x: 'Rel_Label' in x)]
duplicates = fdf[fdf.duplicated(subset=['rel_label'], keep=False)]
if not duplicates.empty:
    print("The following Rel_Label rel_label are not unique:\n", duplicates)
else:
    print("All Rel_Label rel_label are unique.")

All node id are unique.
All Source sab are unique.
All Node_Label node_label are unique.
All Rel_Label rel_label are unique.


In [10]:
# Validate Referential Integrity
# Check if all values in list_a are in list_b
# missing_values = list_a[~list_a.isin(list_b)]

# Reports Node sab NOT in Source sab list
fdf = nodes[nodes['labels'].apply(lambda x: 'Source' in x)]
u_sab = pd.Series(fdf['sab'])
nodes_sab = pd.Series(nodes['sab'].unique()).dropna() # drop NaN because Term nodes have no sab
missing_values = nodes_sab[~nodes_sab.isin(u_sab)]
if not missing_values.empty:
    print(f"The following Node sab are not asserted as a Source sab:\n{missing_values}")
else:
    print("All Node sab are present in Source sab.")

# Reports Rel sab NOT in Source sab list - uses Source sab list u_sab from above
rels_sab = pd.Series(rels['sab'].unique())
missing_values = rels_sab[~rels_sab.isin(u_sab)]
if not missing_values.empty:
    print(f"The following Rel sab are not asserted as a Source sab:\n{missing_values}")
else:
    print("All Rel sab are present in Source sab.")

# Reports Concept other Labels NOT in node_label list with Concept added
fdf = nodes[nodes['labels'].apply(lambda x: 'Concept' in x)]
u_labels = pd.Series(fdf['labels'].explode().unique())
node_labels_concept = pd.concat([nodes.node_label, pd.Series(['Concept'])], ignore_index=True)
missing_values = u_labels[~u_labels.isin(node_labels_concept)]
if not missing_values.empty:
    print(f"The following Labels are not asserted as a node_label:\n{missing_values}")
else:
    print("All Concept Labels are present in node_label.")

# Reports Rel label NOT in rel_label list with CODE added
rel_labels_CODE = pd.concat([nodes.rel_label, pd.Series(['CODE'])], ignore_index=True)
u_labels = pd.Series(rels['label'].unique())
missing_values = u_labels[~u_labels.isin(rel_labels_CODE)]
if not missing_values.empty:
    print(f"The following Rel labels are not asserted as a rel_label:\n{ missing_values}")
else:
    print("All Rel labels are present in rel_label.")

# Reports start property.id of rels in node id list
u_labels = starts['properties.id']
missing_values = u_labels[~u_labels.isin(nodes.id)]
if not missing_values.empty:
    print(f"The following Rel start id are not asserted as a node id:\n{missing_values}")
else:
    print("All Rel start id are present in node id.")

# Reports end property.id of rels in node id list
u_labels = ends['properties.id']
missing_values = u_labels[~u_labels.isin(nodes.id)]
if not missing_values.empty:
    print(f"The following Rel end id are not asserted as a node id:\n{missing_values}")
else:
    print("All Rel end id are present in node id.")

All Node sab are present in Source sab.
All Rel sab are present in Source sab.
All Concept Labels are present in node_label.
All Rel labels are present in rel_label.
All Rel start id are present in node id.
The following Rel end id are not asserted as a node id:
141160     UMLS:C0949778
1428308    UMLS:C5234793
1612469    UMLS:C4082610
2436150    UMLS:C0949821
3335648    UMLS:C4300557
3812654    UMLS:C4082319
4603689    UMLS:C0031324
4635977    UMLS:C0949906
5333716    UMLS:C2825126
6198437    UMLS:C3179396
7507312    UMLS:C0949835
7544827    UMLS:C4082410
8465206    UMLS:C0949781
9163829    UMLS:C2825126
9448240    UMLS:C2348859
9616278    UMLS:C0949777
Name: properties.id, dtype: object
