# Experiment 4 - directly following the algorithm in the paper

In [731]:
import numpy as np
from rdflib import Graph, URIRef
from collections import defaultdict
import codecs

In [732]:
# Paths of the ontology files (TBox and ABox), the input graph and the inference graph
ontology_file_tbox = "data/updated/ub.nt"
ontology_file_abox = "data/lubm1_intact/all_lubm.nt"
input_graph_file = "data/lubm1_intact/graphs_with_descriptions/HTTP_www.Department0.University0.edu.nt"
inference_graph_file = "data/lubm1_intact/jena_inference_with_descriptions/HTTP_www.Department0.University0.edu.nt"

In [733]:
# The ontology graph receives the triples of both the TBox and the ABox file.
rdf_ontology = Graph()
rdf_ontology.parse(ontology_file_tbox, format="nt")
rdf_ontology.parse(ontology_file_abox, format="nt")

# Decode the input graph explicitly as UTF-8 and hand the text to rdflib.
# The context manager guarantees the file handle is closed; newline="" keeps
# the line endings untouched, so the parser sees the file content as stored.
rdf_input_graph = Graph()
with open(input_graph_file, encoding="utf-8", newline="") as input_graph_handle:
    rdf_input_graph.parse(data=input_graph_handle.read(), format="nt")

rdf_inference_graph = Graph()
rdf_inference_graph.parse(inference_graph_file, format="nt")

<Graph identifier=N608e29a9f0704213b85d16f374288c79 (<class 'rdflib.graph.Graph'>)>

In [734]:
# Step 1: Create properties dictionary
# SPARQL query that collects the property vocabulary: rdf:type (bound for every
# rdf:type triple found, de-duplicated by DISTINCT) plus every resource declared
# as an owl:ObjectProperty, owl:DatatypeProperty or owl:TransitiveProperty.
property_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?property WHERE {
  {
    ?subject rdf:type ?object .
    BIND(rdf:type as ?property)
  } UNION {
    ?property a owl:ObjectProperty .
  } UNION {
    ?property a owl:DatatypeProperty .
  } UNION {
    ?property a owl:TransitiveProperty .
  }
}
"""



In [735]:
# Sort key that places rdf:type ahead of every other property
def custom_sort(property_uri):
    """Return the sort key for a property URI.

    rdf:type maps to ``('', uri)``; every other URI maps to ``(uri,)``.
    Because the empty string sorts before any non-empty URI text, rdf:type
    comes first and the remaining properties follow in lexicographic order.
    """
    uri_text = str(property_uri)
    is_rdf_type = uri_text == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
    return ('', uri_text) if is_rdf_type else (uri_text,)

# Run the property query once and keep the URIs in result order
property_list = [row.property for row in rdf_ontology.query(property_query)]
# Order a copy of the list with custom_sort as the key
sorted_properties = list(property_list)
sorted_properties.sort(key=custom_sort)

# Number the sorted properties starting from 1
properties_dictionary = dict(zip(sorted_properties, range(1, len(sorted_properties) + 1)))

In [736]:
# Sanity check against the paper (page 26, line 36), which reports 32 properties.
# NOTE(review): the recorded output of this cell is 33 - the discrepancy is unresolved.
len(properties_dictionary)  # expected: 32

33

In [737]:
# SPARQL query returning every URI (blank nodes are filtered out by isURI)
# that is typed as rdfs:Class or owl:Class.
class_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?class_name WHERE {
  {
    ?class_name a rdfs:Class .
  } UNION {
    ?class_name a owl:Class .
  }
  FILTER(isURI(?class_name))
}
"""

# Give each class returned by class_query an ID equal to its position in the result.
# NOTE(review): class_query has no ORDER BY, so the numbering follows whatever
# order the query engine yields - confirm that it is stable between runs.
global_resources_dictionary = {
    row.class_name: class_index
    for class_index, row in enumerate(rdf_ontology.query(class_query))
}

In [738]:
# Number of classes returned by class_query. The paper (page 26, line 45) reports 57 resources.
len(global_resources_dictionary)  # expected: 57

43

In [739]:
properties_dictionary

{rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor'): 2,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf'): 3,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf'): 4,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#age'): 5,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom'): 6,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom'): 7,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#emailAddress'): 8,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#hasAlumnus'): 9,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf'): 10,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#listedCourse'): 11,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-be

# Adding Missing Base Classes

In [740]:
import rdflib

# RDF / RDFS / XSD base classes that are appended to the global resources dictionary
base_classes = [
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#List",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Seq",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral",
    "http://www.w3.org/2000/01/rdf-schema#Class",
    "http://www.w3.org/2000/01/rdf-schema#Container",
    "http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty",
    "http://www.w3.org/2000/01/rdf-schema#Datatype",
    "http://www.w3.org/2000/01/rdf-schema#Literal",
    "http://www.w3.org/2000/01/rdf-schema#Resource",
    "http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
    "http://www.w3.org/2001/XMLSchema#string"
]

# A class that is not yet present receives the next free ID (the current
# dictionary size); classes that are already present keep their ID.
for base_class in base_classes:
    global_resources_dictionary.setdefault(
        rdflib.URIRef(base_class), len(global_resources_dictionary)
    )

In [741]:
# Size after the base classes were added. The paper (page 26, line 45) reports 57 resources.
len(global_resources_dictionary)  # expected: 57

58

In [742]:
global_resources_dictionary

{rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#AdministrativeStaff'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Article'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#AssistantProfessor'): 2,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#AssociateProfessor'): 3,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Book'): 4,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Chair'): 5,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Person'): 6,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#Department'): 7,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#ClericalStaff'): 8,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#College'): 9,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#ConferencePaper'): 10,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto

# I don't know WHICH ONE is the extra entry (58 here vs. the expected 57), because all 58 of them are in his final file

# GLOBAL PROPERTIES AND RESOURCES DICTIONARIES DONE

In [743]:
# def add_resource(resource, global_resources_dictionary, local_resources_dictionary):
#     if resource in global_resources_dictionary or resource in local_resources_dictionary:
#         return
#     else:
#         local_resources_dictionary[resource] = len(local_resources_dictionary) + len(global_resources_dictionary)

In [744]:
# def lookup_resource(resource, global_resources_dictionary, local_resources_dictionary):
#     if resource in global_resources_dictionary:
#         return global_resources_dictionary[resource]
#     elif resource in local_resources_dictionary:
#         return local_resources_dictionary[resource]
#     else:
#         raise ValueError(f"Resource not found in either global or local dictionaries: {resource}")

In [745]:
# def encode(rdf_graph, global_resources_dictionary, local_resources_dictionary, properties_dictionary, is_inference):
#     sorted_triples = sorted(rdf_graph, key=lambda triple: triple[1])  # Sort triples by property
#
#     # Calculate the maximum possible size of the local_resources_dictionary
#     unique_subjects_objects = set()
#     for s, p, o in rdf_graph:
#         unique_subjects_objects.add(s)
#         unique_subjects_objects.add(o)
#     max_local_dictionary_size = len(unique_subjects_objects)
#
#     number_of_properties = len(properties_dictionary)
#     max_size = len(global_resources_dictionary) + max_local_dictionary_size
#     adjacency_matrix = np.zeros((number_of_properties, max_size, max_size))
#     encoding = {}
#     for s, p, o in sorted_triples:
#         if p not in properties_dictionary:
#             continue
#         p_id = properties_dictionary[p]
#         if not is_inference:
#             add_resource(s, global_resources_dictionary, local_resources_dictionary)
#             add_resource(o, global_resources_dictionary, local_resources_dictionary)
#         s_id = lookup_resource(s, global_resources_dictionary, local_resources_dictionary)
#         o_id = lookup_resource(o, global_resources_dictionary, local_resources_dictionary)
#         adjacency_matrix[p_id, s_id, o_id] = 1
#         if p_id not in encoding:
#                 encoding[p_id] = []
#         encoding[p_id].append((s_id, o_id))
#
#     return encoding, adjacency_matrix, local_resources_dictionary

In [746]:
# local_resources_dictionary = {}
#
# encoding, adjacency_matrix, local_resources_dictionary = encode(rdf_input_graph, global_resources_dictionary, local_resources_dictionary, properties_dictionary, is_inference=False)

# END OF "SIMPLE" ENCODING

In [747]:
import networkx as nx
from rdflib.plugins.sparql import prepareQuery

local_resources_dictionary = {}

# Every pair of distinct properties related through rdfs:subPropertyOf
subproperty_query = prepareQuery("""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?property1 ?property2
    WHERE {
        ?property1 rdfs:subPropertyOf ?property2
        FILTER(?property1 != ?property2)
    }
""")

# Undirected graph with one edge per sub-property pair
subproperty_graph = nx.Graph()
subproperty_graph.add_edges_from(
    (row.property1, row.property2)
    for row in rdf_ontology.query(subproperty_query)
)

In [748]:
# Each connected component of the sub-property graph is one set of properties
# linked, directly or transitively, by rdfs:subPropertyOf.
subgraphs = list(nx.connected_components(subproperty_graph))

In [749]:
# Map every property that takes part in a sub-property relation to the index
# of its connected component. A comprehension keeps the iteration variables
# local, so the builtin `property` is not shadowed at module level.
properties_groups = {
    group_member: group_id
    for group_id, subgraph in enumerate(subgraphs)
    for group_member in subgraph
}


In [750]:
# Highest group ID handed out so far (-1 when no group exists yet)
max_group_id = max(properties_groups.values(), default=-1)

# Every property of properties_dictionary that is not part of a sub-property
# component gets a fresh group of its own. The loop variable is deliberately
# not called `property`, which would shadow the builtin at module level.
for ungrouped_candidate in properties_dictionary:
    if ungrouped_candidate not in properties_groups:
        max_group_id += 1
        properties_groups[ungrouped_candidate] = max_group_id

# PROPERTIES GROUPS DONE - START ENCODING

In [751]:
# Shared encoder state, read and mutated by add_resource / lookup_resource.
# GLOBAL PROPERTIES DICTIONARY - alias of properties_dictionary (property -> 1-based ID)
global_properties_dictionary = properties_dictionary
# Properties encountered while encoding -> 1-based ID in order of first use
global_active_properties_dictionary = {}
# GLOBAL RESOURCES DICTIONARY - global_resources_dictionary - already exists
# Global resources encountered while encoding -> ID assigned by add_resource
global_active_resources_dictionary = {}
# GLOBAL PROPERTY GROUPS DICTIONARY - alias of properties_groups (property -> group ID)
global_property_groups_dictionary = properties_groups # already exists
# LOCAL RESOURCES DICTIONARY - property group ID -> {resource -> negative local ID}
local_resources_dictionary = {}

In [752]:
def add_resource(resource, property):
    """Register ``resource`` (seen in a triple with ``property``) in the encoder state.

    Side effects on the module-level dictionaries:

    * ``property`` is added to ``global_active_properties_dictionary`` with
      the next 1-based ID if it is not there yet.
    * A resource listed in ``global_resources_dictionary`` is added to
      ``global_active_resources_dictionary`` with the next 1-based ID the
      first time it is seen; later sightings change nothing.
    * Any other resource is added to the local dictionary of the property's
      group with the next negative ID (-1, -2, ...), once per group.

    Args:
        resource: subject or object of a triple.
        property: predicate of that triple; must be a key of
            ``global_property_groups_dictionary``.

    Raises:
        KeyError: if ``property`` has no entry in
            ``global_property_groups_dictionary``.
    """
    if property not in global_active_properties_dictionary:
        global_active_properties_dictionary[property] = len(global_active_properties_dictionary) + 1
    property_group = global_property_groups_dictionary[property]

    if resource in global_resources_dictionary:
        # Global resources never enter the local dictionaries, not even when
        # they are seen again after activation.
        if resource not in global_active_resources_dictionary:
            # The ID is derived from the *resources* dictionary so that every
            # active global resource receives a distinct ID.
            global_active_resources_dictionary[resource] = len(global_active_resources_dictionary) + 1
        return

    group_resources = local_resources_dictionary.setdefault(property_group, {})
    if resource not in group_resources:
        group_resources[resource] = -(len(group_resources) + 1)

In [753]:
def lookup_resource(resource, property):
    """Return the ID recorded for ``resource`` in the context of ``property``.

    The local dictionary of the property's group is consulted first; if the
    resource is not there, the active global resources are consulted.
    Returns ``None`` when the resource is found in neither. Raises
    ``KeyError`` when ``property`` has no entry in
    ``global_property_groups_dictionary``.
    """
    group_id = global_property_groups_dictionary[property]
    group_resources = local_resources_dictionary.get(group_id)
    if group_resources is not None and resource in group_resources:
        return group_resources[resource]
    # dict.get yields None for a resource that was never activated
    return global_active_resources_dictionary.get(resource)

In [754]:
def encode_advanced(rdf_graph):
    """Encode the triples of ``rdf_graph`` as integer ID pairs grouped by property.

    Property IDs are local to the graph: the properties of
    ``global_properties_dictionary`` that occur in ``rdf_graph`` are numbered
    1..k in dictionary order. Subject and object IDs come from
    ``add_resource`` / ``lookup_resource``, so calling this function mutates
    the module-level dictionaries those helpers maintain.

    Args:
        rdf_graph: re-iterable collection of ``(subject, property, object)``
            triples, e.g. an ``rdflib.Graph``. It is iterated more than once.

    Returns:
        dict mapping each property ID to a list of ``(subject ID, object ID)``
        tuples. Triples are processed in (property ID, subject, object) order;
        triples whose property is not in ``global_properties_dictionary`` are
        skipped.
    """
    # Properties that actually occur in the graph
    unique_properties = {p for _, p, _ in rdf_graph}
    # Known properties present in the graph, in dictionary order
    filtered_property_list = [
        prop for prop in global_properties_dictionary if URIRef(prop) in unique_properties
    ]
    # 1-based rank of every kept property
    property_id_map = {
        URIRef(prop): rank for rank, prop in enumerate(filtered_property_list, start=1)
    }

    # Drop triples with unknown properties before sorting, then order the rest
    # by property rank, subject and object.
    known_triples = [triple for triple in rdf_graph if triple[1] in property_id_map]
    known_triples.sort(key=lambda triple: (property_id_map[triple[1]], triple[0], triple[2]))

    encoding = {}
    for s, p, o in known_triples:
        # Both resources are registered before either is looked up.
        add_resource(s, p)
        add_resource(o, p)
        s_id = lookup_resource(s, p)
        o_id = lookup_resource(o, p)
        encoding.setdefault(property_id_map[p], []).append((s_id, o_id))

    return encoding

In [755]:
# Encode the input graph; the displayed dict maps property ID -> list of (subject ID, object ID) pairs.
encode_advanced(rdf_input_graph)

{1: [(-1, 1)],
 2: [(-1, -2)],
 3: [(-3, -2),
  (-4, -2),
  (-5, -2),
  (-6, -2),
  (-7, -2),
  (-8, -2),
  (-9, -2),
  (-10, -2),
  (-11, -2),
  (-12, -2),
  (-13, -2),
  (-14, -2),
  (-15, -2),
  (-16, -2),
  (-17, -2),
  (-18, -2),
  (-19, -2),
  (-20, -2),
  (-21, -2),
  (-22, -2),
  (-23, -2),
  (-24, -2),
  (-25, -2),
  (-26, -2),
  (-27, -2),
  (-28, -2),
  (-29, -2),
  (-30, -2),
  (-31, -2),
  (-32, -2),
  (-33, -2),
  (-34, -2),
  (-35, -2),
  (-36, -2),
  (-37, -2),
  (-38, -2),
  (-39, -2),
  (-40, -2),
  (-41, -2),
  (-42, -2),
  (-43, -2),
  (-44, -2),
  (-45, -2),
  (-46, -2),
  (-47, -2),
  (-48, -2),
  (-49, -2),
  (-50, -2),
  (-51, -2),
  (-52, -2),
  (-53, -2),
  (-54, -2),
  (-55, -2),
  (-56, -2),
  (-57, -2),
  (-58, -2),
  (-59, -2),
  (-60, -2),
  (-61, -2),
  (-62, -2),
  (-63, -2),
  (-64, -2),
  (-65, -2),
  (-66, -2),
  (-67, -2),
  (-68, -2),
  (-69, -2),
  (-70, -2),
  (-71, -2),
  (-72, -2),
  (-73, -2),
  (-74, -2),
  (-75, -2),
  (-76, -2),
  (-77, -2)

In [704]:
properties_groups

{rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#undergraduateDegreeFrom'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#mastersDegreeFrom'): 0,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#worksFor'): 1,
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): 2,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor'): 3,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf'): 4,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf'): 5,
 rdflib.term.URIRef('http://swat.cse.lehigh

In [602]:
# Distinct properties occurring in the input graph
unique_properties = {triple[1] for triple in rdf_input_graph}
# Properties of global_properties_dictionary that occur in the graph, in dictionary order
filtered_property_list = [candidate for candidate in global_properties_dictionary if URIRef(candidate) in unique_properties]
# 1-based rank of each kept property
property_id_map = dict(zip(map(URIRef, filtered_property_list), range(1, len(filtered_property_list) + 1)))
# Sort by property rank, then subject, then object; triples with an unranked property go last
sorted_triples = sorted(
    rdf_input_graph,
    key=lambda triple: (property_id_map.get(triple[1], float('inf')), triple[0], triple[2]),
)

In [705]:
global_properties_dictionary

{rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): 1,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor'): 2,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf'): 3,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf'): 4,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#age'): 5,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom'): 6,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom'): 7,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#emailAddress'): 8,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#hasAlumnus'): 9,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf'): 10,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#listedCourse'): 11,
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-be

In [608]:
filtered_property_list

[rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf'),
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf'),
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#worksFor'),
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#name'),
 rdflib.term.URIRef('http://swat.cse.lehigh.edu/onto/univ-bench.owl#subOrganizationOf')]