# Experiment 4 - following directly the algorithm in the paper

In [114]:
import numpy as np
from rdflib import Graph, URIRef
from collections import defaultdict
import codecs

In [115]:
# Load ontology and RDF graphs
ontology_file_tbox = "data/updated/ub.nt"
ontology_file_abox = "data/lubm1_intact/all_lubm.nt"
input_graph_file = "data/lubm1_intact/graphs_with_descriptions/HTTP_www.Department0.University0.edu.nt"
inference_graph_file = "data/lubm1_intact/jena_inference_with_descriptions/HTTP_www.Department0.University0.edu.nt"

In [116]:
rdf_ontology = Graph()
rdf_ontology.parse(ontology_file_tbox, format="nt")
rdf_ontology.parse(ontology_file_abox, format="nt")

rdf_input_graph = Graph()
#rdf_input_graph.parse(input_graph_file, format="nt")
rdf_input_graph.parse(data=codecs.open(input_graph_file, encoding="UTF-8").read(), format="nt")


rdf_inference_graph = Graph()
rdf_inference_graph.parse(inference_graph_file, format="nt")

<Graph identifier=Nb7553b52c45542608ce980e4f67b41e4 (<class 'rdflib.graph.Graph'>)>

In [117]:
# Step 1: Create properties dictionary
property_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?property WHERE {
  {
    ?subject rdf:type ?object .
    BIND(rdf:type as ?property)
  } UNION {
    ?property a owl:ObjectProperty .
  } UNION {
    ?property a owl:DatatypeProperty .
  } UNION {
    ?property a owl:TransitiveProperty .
  }
}
"""

In [118]:
properties_dictionary = {}
for i, row in enumerate(rdf_ontology.query(property_query)):
    properties_dictionary[row.property] = i

In [119]:
len(properties_dictionary) ## SHOULD BE 32 - paper page 26 line 36

33

In [120]:
class_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?class_name WHERE {
  {
    ?class_name a rdfs:Class .
  } UNION {
    ?class_name a owl:Class .
  }
  FILTER(isURI(?class_name))
}
"""

global_resources_dictionary = {}
for i, row in enumerate(rdf_ontology.query(class_query)):
    global_resources_dictionary[row.class_name] = i

In [121]:
len(global_resources_dictionary) #SHOULD BE 57 - paper page 26 line 45

43

# Adding Missing Base Classes

In [122]:
import rdflib

base_classes = [
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#List",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Seq",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral",
    "http://www.w3.org/2000/01/rdf-schema#Class",
    "http://www.w3.org/2000/01/rdf-schema#Container",
    "http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty",
    "http://www.w3.org/2000/01/rdf-schema#Datatype",
    "http://www.w3.org/2000/01/rdf-schema#Literal",
    "http://www.w3.org/2000/01/rdf-schema#Resource",
    "http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
    "http://www.w3.org/2001/XMLSchema#string"
]

for base_class in base_classes:
    if rdflib.URIRef(base_class) not in global_resources_dictionary:
        global_resources_dictionary[rdflib.URIRef(base_class)] = len(global_resources_dictionary)

In [123]:
len(global_resources_dictionary) #SHOULD BE 57 - paper page 26 line 45

58

In [124]:
global_resources_dictionary

{rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#AdministrativeStaff'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Article'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#AssistantProfessor'): 2,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#AssociateProfessor'): 3,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Book'): 4,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Chair'): 5,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Person'): 6,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Department'): 7,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#ClericalStaff'): 8,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#College'): 9,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#ConferencePaper'): 10,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto

# I don't know WHICH ONE is wrong because all 58 of them are in his final file

# GLOBAL PROPERTIES AND RESOURCES DICTIONARIES DONE

In [125]:
def add_resource(resource, global_resources_dictionary, local_resources_dictionary):
    if resource in global_resources_dictionary or resource in local_resources_dictionary:
        return
    else:
        local_resources_dictionary[resource] = len(local_resources_dictionary) + len(global_resources_dictionary)

In [126]:
def lookup_resource(resource, global_resources_dictionary, local_resources_dictionary):
    if resource in global_resources_dictionary:
        return global_resources_dictionary[resource]
    elif resource in local_resources_dictionary:
        return local_resources_dictionary[resource]
    else:
        raise ValueError(f"Resource not found in either global or local dictionaries: {resource}")

In [127]:
def encode(rdf_graph, global_resources_dictionary, local_resources_dictionary, properties_dictionary, is_inference):
    sorted_triples = sorted(rdf_graph, key=lambda triple: triple[1])  # Sort triples by property

    # Calculate the maximum possible size of the local_resources_dictionary
    unique_subjects_objects = set()
    for s, p, o in rdf_graph:
        unique_subjects_objects.add(s)
        unique_subjects_objects.add(o)
    max_local_dictionary_size = len(unique_subjects_objects)

    number_of_properties = len(properties_dictionary)
    max_size = len(global_resources_dictionary) + max_local_dictionary_size
    adjacency_matrix = np.zeros((number_of_properties, max_size, max_size))
    encoding = {}
    for s, p, o in sorted_triples:
        if p not in properties_dictionary:
            continue
        p_id = properties_dictionary[p]
        if not is_inference:
            add_resource(s, global_resources_dictionary, local_resources_dictionary)
            add_resource(o, global_resources_dictionary, local_resources_dictionary)
        s_id = lookup_resource(s, global_resources_dictionary, local_resources_dictionary)
        o_id = lookup_resource(o, global_resources_dictionary, local_resources_dictionary)
        adjacency_matrix[p_id, s_id, o_id] = 1
        if p_id not in encoding:
                encoding[p_id] = []
        encoding[p_id].append((s_id, o_id))

    return encoding, adjacency_matrix, local_resources_dictionary

In [128]:
# local_resources_dictionary = {}
#
# encoding, adjacency_matrix, local_resources_dictionary = encode(rdf_input_graph, global_resources_dictionary, local_resources_dictionary, properties_dictionary, is_inference=False)

# END OF "SIMPLE" ENCODING

In [129]:
import networkx as nx
from rdflib.plugins.sparql import prepareQuery

local_resources_dictionary = {}

subproperty_query = prepareQuery("""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?property1 ?property2
    WHERE {
        ?property1 rdfs:subPropertyOf ?property2
        FILTER(?property1 != ?property2)
    }
""")

subproperty_graph = nx.Graph()

for row in rdf_ontology.query(subproperty_query):
    subproperty_graph.add_edge(row.property1, row.property2)

In [130]:
subgraphs = list(nx.connected_components(subproperty_graph))

In [131]:
properties_groups = {}

for group_id, subgraph in enumerate(subgraphs):
    for property in subgraph:
        properties_groups[property] = group_id


In [132]:
# Find the maximum group_id used so far
max_group_id = max(properties_groups.values()) if properties_groups else -1

# Add the properties from properties_dictionary to the properties_groups
for property in properties_dictionary:
    if property not in properties_groups:
        max_group_id += 1
        properties_groups[property] = max_group_id

# PROPERTIES GROUPS DONE - START ENCODING

In [133]:
# def lookup_resource(resource, property, properties_groups, usable_global_resources_dictionary, local_resources_dictionaries):
#     property_group = properties_groups[property]
#     if resource in usable_global_resources_dictionary:
#         return usable_global_resources_dictionary[resource]
#     elif resource in local_resources_dictionaries[property_group]:
#         return local_resources_dictionaries[property_group][resource]
#     else:
#         raise ValueError(f"Resource {resource} not found in any dictionary")

In [134]:
# def encode_advanced(rdf_graph, properties_groups, global_resources_dictionary, properties_dictionary, is_inference):
#     # sorted_triples = sorted(rdf_graph, key=lambda triple: str(triple[1])) # Sort triples by property
#     sorted_triples = sorted(rdf_graph, key=lambda triple: properties_dictionary.get(triple[1], float('inf')))
#
#     usable_global_resources_dictionary = {}
#     local_resources_dictionaries = {group: {} for group in properties_groups.values()}
#
#     sparse_encoding = []
#     encoding = {}
#     for s, p, o in sorted_triples:
#         if p not in properties_dictionary:
#             continue
#         if not is_inference:
#             add_resource(s, p, properties_groups, usable_global_resources_dictionary, global_resources_dictionary, local_resources_dictionaries)
#             add_resource(o, p, properties_groups, usable_global_resources_dictionary, global_resources_dictionary, local_resources_dictionaries)
#         s_id = lookup_resource(s, p, properties_groups, usable_global_resources_dictionary, local_resources_dictionaries)
#         o_id = lookup_resource(o, p, properties_groups, usable_global_resources_dictionary, local_resources_dictionaries)
#         p_id = properties_dictionary[p]
#         sparse_encoding.append((p_id, s_id, o_id))
#         if p_id not in encoding:
#             encoding[p_id] = []
#         encoding[p_id].append((s_id, o_id))
#
#
#     return sorted_triples, encoding, sparse_encoding, local_resources_dictionaries

In [135]:
# sorted_triples, encoding, sparse_encoding, local_resources_dictionaries = encode_advanced(rdf_input_graph, properties_groups, global_resources_dictionary, properties_dictionary, is_inference=False)

In [136]:
# GLOBAL PROPERTIES DICTIONARY - properties_dictionary
global_active_properties_dictionary = {}
# GLOBAL RESOURCES DICTIONARY - global_resources_dictionary
global_active_resources_dictionary = {}
# GLOBAL PROPERTY GROUPS DICTIONARY  - properties_groups

# LOCAL RESOURCES DICTIONARY [graph name] -> (local props, local resources)
local_resources_dictionary = {}

In [137]:
def add_resource(resource, property):
    property_group = properties_groups[property]
    if resource in global_active_resources_dictionary:
        return
    elif resource in global_resources_dictionary:
        global_active_resources_dictionary[resource] = len(global_active_resources_dictionary)
    elif resource not in local_resources_dictionary[property_group]:
        local_resources_dictionary[property_group][resource] = -len(local_resources_dictionary[property_group][resource])

In [138]:
def lookup_resource(resource, property):
    property_group = properties_groups[property]
    if resource in global_active_resources_dictionary:
        return global_active_resources_dictionary[resource]
    elif resource in local_resources_dictionary[property_group]:
        return local_resources_dictionary[property_group][resource]

In [161]:
import collections

def encode_advanced(rdf_graph):
    triples_list = list(rdf_graph)
    triples_list_sorted = []
    triples_list_sorted_2 = []
    max_id = 0
    #    property_for_id = properties_dictionary[triples_list[0][1]]
    for s,p,o in triples_list:
        property_ID = properties_dictionary[p]
        if property_ID > max_id:
            max_id = property_ID
        triples_list_sorted.append([s,p,o,property_ID])

    for x in range(max_id):
        for triple in triples_list_sorted:
            print(triple)
#            if triple[3] == x:
 #               triples_list_sorted.append([triple[0], triple[1], triple[2]])

#   print(sorted_triples)
    sparse_encoding = []
    encoding = {}
    # for s, p, o in sorted_triples:
    #     if p not in properties_dictionary:
    #         continue
    #     if not is_inference:
    #         add_resource(s, p, properties_groups, usable_global_resources_dictionary, global_resources_dictionary, local_resources_dictionaries)
    #         add_resource(o, p, properties_groups, usable_global_resources_dictionary, global_resources_dictionary, local_resources_dictionaries)
    #     s_id = lookup_resource(s, p, properties_groups, usable_global_resources_dictionary, local_resources_dictionaries)
    #     o_id = lookup_resource(o, p, properties_groups, usable_global_resources_dictionary, local_resources_dictionaries)
    #     p_id = properties_dictionary[p]
    #     sparse_encoding.append((p_id, s_id, o_id))
    #     if p_id not in encoding:
    #         encoding[p_id] = []
    #     encoding[p_id].append((s_id, o_id))


    return 0

In [162]:
encode_advanced(rdf_input_graph)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[rdflib.term.URIRef('http://www.Department0.University0.edu/GraduateStudent8'), rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf'), rdflib.term.URIRef('http://www.Department0.University0.edu'), 11]
[rdflib.term.URIRef('http://www.Department0.University0.edu/UndergraduateStudent77'), rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf'), rdflib.term.URIRef('http://www.Department0.University0.edu'), 11]
[rdflib.term.URIRef('http://www.Department0.University0.edu/UndergraduateStudent473'), rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf'), rdflib.term.URIRef('http://www.Department0.University0.edu'), 11]
[rdflib.term.URIRef('http://www.Department0.University0.edu/UndergraduateStudent73'), rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf'), rdflib.term.URIRef('http://www.Department0.University0.edu'), 11]
[rdflib.term.URIRef('http://www.Department0.University0.edu/UndergraduateStudent5

0

In [89]:
properties_groups

{rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#undergraduateDegreeFrom'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#mastersDegreeFrom'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#worksFor'): 1,
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): 2,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor'): 3,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf'): 4,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf'): 5,
 rdflib.term.URIRef('http://swat.cse.lehigh

{rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf'): 2,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf'): 3,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom'): 4,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom'): 5,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#hasAlumnus'): 6,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf'): 7,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#listedCourse'): 8,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#mastersDegreeFrom'): 9,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#member'): 10,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/