# Experiment 5 - following the algorithm in the paper directly, using the existing class

In [1]:
import os

import pandas as pd
from rdflib import Graph

from utils import ResourceDictionary


In [2]:
def get_nt_files(directory):
    """Return the paths of the entries in *directory* whose name ends in ``.nt``.

    Only the directory's immediate entries are examined. Each match is
    joined onto *directory*, and the result keeps the order in which
    ``os.listdir`` reported the entries.
    """
    return [
        os.path.join(directory, entry)
        for entry in os.listdir(directory)
        if entry.endswith(".nt")
    ]


def nt_files_to_dataframe(directory):
    """Build a one-column DataFrame (``nt_file``) listing the ``.nt`` files in *directory*."""
    return pd.DataFrame(get_nt_files(directory), columns=["nt_file"])


# Catalogue every input graph file, ordered by path, with a fresh 0..n-1 index.
directory = "data/lubm1_intact/graphs_with_descriptions"
nt_files_df = (
    nt_files_to_dataframe(directory)
    .sort_values(by="nt_file")
    .reset_index(drop=True)
)

In [3]:
nt_files_df.head(10)

Unnamed: 0,nt_file
0,data/lubm1_intact/graphs_with_descriptions/HTT...
1,data/lubm1_intact/graphs_with_descriptions/HTT...
2,data/lubm1_intact/graphs_with_descriptions/HTT...
3,data/lubm1_intact/graphs_with_descriptions/HTT...
4,data/lubm1_intact/graphs_with_descriptions/HTT...
5,data/lubm1_intact/graphs_with_descriptions/HTT...
6,data/lubm1_intact/graphs_with_descriptions/HTT...
7,data/lubm1_intact/graphs_with_descriptions/HTT...
8,data/lubm1_intact/graphs_with_descriptions/HTT...
9,data/lubm1_intact/graphs_with_descriptions/HTT...


In [4]:
import ast


def read_csv_extract_columns(file_path, columns):
    """Load the CSV at *file_path* and return only the requested *columns*.

    *columns* is used directly as a DataFrame column selector, so the
    result keeps the order given there.
    """
    return pd.read_csv(file_path)[columns]


# Load the reference encodings (file names plus their expected encodings).
csv_file_path = "code/encoding.csv"
columns = ["input_graph_file", "input_graph_encoding", "inference_file", "inference_graph_encoding"]
extracted_data = read_csv_extract_columns(csv_file_path, columns)

# Strip a leading "../" from the stored paths.
# `regex=True` is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the paths unchanged.
# The dots are escaped so that only a real "../" prefix is removed.
extracted_data['input_graph_file'] = extracted_data['input_graph_file'].str.replace(
    r'^\.\./', '', regex=True)
extracted_data['inference_file'] = extracted_data['inference_file'].str.replace(
    r'^\.\./', '', regex=True)

# The encodings are stored as Python literals; literal_eval turns each
# string back into the corresponding object without executing code.
extracted_data['input_graph_encoding'] = extracted_data['input_graph_encoding'].apply(ast.literal_eval)
extracted_data['inference_graph_encoding'] = extracted_data['inference_graph_encoding'].apply(ast.literal_eval)

  del sys.path[0]
  


In [5]:
extracted_data.head(10)

Unnamed: 0,input_graph_file,input_graph_encoding,inference_file,inference_graph_encoding
0,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 1)], 2: [(-1, -2)], 3: [(-3, -2), (-...",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-1, 2), (-682, 2), (-683, 2), (-684, 2),..."
1,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 3)], 7: [(-1, -2), (-3, -2), (-4, -2...",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-1, 4), (-1, 5), (-1, 6), (-1, 7), (-19,..."
2,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2), (...",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-2, 6), (-3, 6), (-4, 6)]}"
3,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2)]}",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-2, 6)]}"
4,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2), (...",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-2, 6), (-3, 6)]}"
5,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2), (...",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-2, 6), (-3, 6), (-4, 6)]}"
6,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2), (...",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-2, 6), (-3, 6)]}"
7,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2)]}",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-2, 6)]}"
8,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 3)], 7: [(-1, -2), (-3, -2), (-4, -2...",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-1, 4), (-1, 5), (-1, 6), (-1, 7), (-15,..."
9,data/lubm1_intact/graphs_with_descriptions/HTT...,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2), (...",data/lubm1_intact/jena_inference_with_descript...,"{1: [(-2, 6), (-3, 6)]}"


In [6]:
nt_files_df.nt_file[0]

'data/lubm1_intact/graphs_with_descriptions/HTTP_www.Department0.University0.edu.nt'

In [7]:
# Load ontology and RDF graphs
ontology_file_tbox = "data/updated/ub.nt"
ontology_file_abox = "data/lubm1_intact/all_lubm.nt"
inference_graph_file = "data/lubm1_intact/jena_inference_with_descriptions/HTTP_www.Department0.University0.edu.nt"

# Both ontology files are parsed into the same graph, TBox first.
rdf_ontology = Graph()
for ontology_file in (ontology_file_tbox, ontology_file_abox):
    rdf_ontology.parse(ontology_file, format="nt")

# The inference graph for Department0 is kept in a graph of its own.
rdf_inference_graph = Graph()
rdf_inference_graph.parse(inference_graph_file, format="nt")

<Graph identifier=N24a7f2ee1ca94d88947828b4c1a465d4 (<class 'rdflib.graph.Graph'>)>

In [8]:
def create_graphs_from_files(dataframe, file_column):
    """Parse every N-Triples file named in ``dataframe[file_column]``.

    Each path is made absolute and parsed into its own ``Graph``. The
    returned list follows the order of the column's rows.
    """
    def _load(path):
        parsed = Graph()
        parsed.parse(os.path.abspath(path), format='nt')
        return parsed

    return [_load(path) for path in dataframe[file_column]]


extracted_data['graph'] = create_graphs_from_files(extracted_data, 'input_graph_file')
extracted_data['inference_graph'] = create_graphs_from_files(extracted_data, 'inference_file')

In [9]:
extracted_data.graph[0]

<Graph identifier=Nebd474c91ccd470e84a01bf6c0e37bcf (<class 'rdflib.graph.Graph'>)>

In [10]:
# Step 1: Create properties dictionary
property_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?property WHERE {
  {
    ?subject rdf:type ?object .
    BIND(rdf:type as ?property)
  } UNION {
    ?property a owl:ObjectProperty .
  } UNION {
    ?property a owl:DatatypeProperty .
  } UNION {
    ?property a owl:TransitiveProperty .
  }
}
"""

In [11]:
# Define custom sorting function
def custom_sort(property_uri):
    """Sort key that orders ``rdf:type`` ahead of the other properties.

    ``rdf:type`` maps to ``('', uri)`` and every other property maps to
    ``(uri,)``, so the remaining properties sort by their string form.
    """
    text = str(property_uri)
    is_rdf_type = text == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'
    return ('', text) if is_rdf_type else (text,)


# Get the properties as a list and sort them
property_list = [row.property for row in rdf_ontology.query(property_query)]
sorted_properties = sorted(property_list, key=custom_sort)

# Generate the properties dictionary
#properties_dictionary = {property: index + 1 for index, property in enumerate(sorted_properties)}
properties_dictionary = ResourceDictionary()

In [12]:
len(properties_dictionary)  ## SHOULD BE 32 +1 - paper page 26 line 36

0

In [13]:
import rdflib

In [14]:
additional_properties_2 = ["http://www.w3.org/1999/02/22-rdf-syntax-ns#_1",
                           "http://www.w3.org/1999/02/22-rdf-syntax-ns#first",
                           "http://www.w3.org/1999/02/22-rdf-syntax-ns#object",
                           "http://www.w3.org/1999/02/22-rdf-syntax-ns#predicate",
                           "http://www.w3.org/1999/02/22-rdf-syntax-ns#rest",
                           "http://www.w3.org/1999/02/22-rdf-syntax-ns#subject",
                           "http://www.w3.org/1999/02/22-rdf-syntax-ns#value",
                           "http://www.w3.org/2000/01/rdf-schema#comment",
                           "http://www.w3.org/2000/01/rdf-schema#domain",
                           "http://www.w3.org/2000/01/rdf-schema#isDefinedBy",
                           "http://www.w3.org/2000/01/rdf-schema#label", "http://www.w3.org/2000/01/rdf-schema#member",
                           "http://www.w3.org/2000/01/rdf-schema#range", "http://www.w3.org/2000/01/rdf-schema#seeAlso",
                           "http://www.w3.org/2000/01/rdf-schema#subClassOf",
                           "http://www.w3.org/2000/01/rdf-schema#subPropertyOf",
                           "http://www.w3.org/2002/07/owl#imports",
                           "http://www.w3.org/2002/07/owl#intersectionOf", "http://www.w3.org/2002/07/owl#inverseOf",
                           "http://www.w3.org/2002/07/owl#onProperty", "http://www.w3.org/2002/07/owl#someValuesFrom",
                           "http://www.w3.org/2002/07/owl#versionInfo"]

In [15]:
additional_properties = []

In [16]:
for property in additional_properties_2:
    additional_properties.append(rdflib.URIRef(property))

In [17]:
for property in sorted_properties:
    if str(property) != "http://swat.cse.lehigh.edu/onto/univ-bench.owl#officeNumber":
        properties_dictionary.add(property)

for property in additional_properties:
    properties_dictionary.add(property)

In [18]:
for key in properties_dictionary.getKeys():
    print(key)

http://www.w3.org/1999/02/22-rdf-syntax-ns#type
http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor
http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf
http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf
http://swat.cse.lehigh.edu/onto/univ-bench.owl#age
http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom
http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom
http://swat.cse.lehigh.edu/onto/univ-bench.owl#emailAddress
http://swat.cse.lehigh.edu/onto/univ-bench.owl#hasAlumnus
http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf
http://swat.cse.lehigh.edu/onto/univ-bench.owl#listedCourse
http://swat.cse.lehigh.edu/onto/univ-bench.owl#mastersDegreeFrom
http://swat.cse.lehigh.edu/onto/univ-bench.owl#member
http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf
http://swat.cse.lehigh.edu/onto/univ-bench.owl#name
http://swat.cse.lehigh.edu/onto/univ-bench.owl#orgPublication
http://swat.cse.lehigh.edu/onto/univ-bench.owl#publicationAuthor
http:

In [19]:
# class_query = """
# PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
# PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
# PREFIX owl: <http://www.w3.org/2002/07/owl#>
# SELECT DISTINCT ?class_name WHERE {
#   {
#     ?class_name a rdfs:Class .
#   } UNION {
#     ?class_name a owl:Class .
#   }
#   FILTER(isURI(?class_name))
# }
# """
#
# unique_class_names = set(row.class_name for row in rdf_ontology.query(class_query))
# sorted_class_names = sorted(unique_class_names, key=lambda x: str(x))
#
# global_resources_dictionary = {class_name: index + 1 for index, class_name in enumerate(sorted_class_names)}

In [20]:
class_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX owl: <http://www.w3.org/2002/07/owl#>
SELECT DISTINCT ?class_name WHERE {
  {
    ?class_name a rdfs:Class .
  } UNION {
    ?class_name a owl:Class .
  }
  FILTER(isURI(?class_name))
}
"""

global_resources_dictionary = ResourceDictionary()
for i, row in enumerate(rdf_ontology.query(class_query)):
    global_resources_dictionary.add(row.class_name)

In [21]:
len(global_resources_dictionary)  #SHOULD BE 57 - paper page 26 line 45

43

In [22]:
global_resources_dictionary

<utils.ResourceDictionary at 0x7f4886d7dd50>

# Adding Missing Base Classes

In [23]:
import rdflib

# RDF / RDFS / XSD classes that are registered in addition to the classes
# returned by the ontology class query.
base_classes = [
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Alt",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Bag",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#List",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Property",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Seq",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#Statement",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral",
    "http://www.w3.org/2000/01/rdf-schema#Class",
    "http://www.w3.org/2000/01/rdf-schema#Container",
    "http://www.w3.org/2000/01/rdf-schema#ContainerMembershipProperty",
    "http://www.w3.org/2000/01/rdf-schema#Datatype",
    "http://www.w3.org/2000/01/rdf-schema#Literal",
    "http://www.w3.org/2000/01/rdf-schema#Resource",
    "http://www.w3.org/2001/XMLSchema#nonNegativeInteger",
    "http://www.w3.org/2001/XMLSchema#string"
]

for base_class in base_classes:
    # Test and insert the SAME key: the URIRef, not the raw string.
    # rdflib terms compare and hash by type as well as value, so a plain
    # ``str`` entry is a different key from the URIRef being tested (and
    # from the URIRef objects the ontology query contributed earlier).
    # NOTE(review): how ResourceDictionary matches keys is not visible
    # here -- confirm that downstream lookups use URIRef terms.
    base_class_uri = rdflib.URIRef(base_class)
    if base_class_uri not in global_resources_dictionary:
        global_resources_dictionary.add(base_class_uri)

In [24]:
len(global_resources_dictionary)  #SHOULD BE 57 - paper page 26 line 45

58

In [25]:
for key in global_resources_dictionary.getKeys():
    print(key)

http://swat.cse.lehigh.edu/onto/univ-bench.owl#AdministrativeStaff
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Article
http://swat.cse.lehigh.edu/onto/univ-bench.owl#AssistantProfessor
http://swat.cse.lehigh.edu/onto/univ-bench.owl#AssociateProfessor
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Book
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Chair
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Person
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Department
http://swat.cse.lehigh.edu/onto/univ-bench.owl#ClericalStaff
http://swat.cse.lehigh.edu/onto/univ-bench.owl#College
http://swat.cse.lehigh.edu/onto/univ-bench.owl#ConferencePaper
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Course
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Dean
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Director
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Program
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Employee
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Organization
http://swat.cse.l

# I don't know which entry is the extra one, because all 58 of them are in the zipped file

# GLOBAL PROPERTIES AND RESOURCES DICTIONARIES DONE

In [26]:
# import networkx as nx
# from rdflib.plugins.sparql import prepareQuery
#
# subproperty_query = prepareQuery("""
#     PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
#     SELECT ?property1 ?property2
#     WHERE {
#         ?property1 rdfs:subPropertyOf ?property2
#         FILTER(?property1 != ?property2)
#     }
# """)
#
# subproperty_graph = nx.Graph()
#
# for row in rdf_ontology.query(subproperty_query):
#     subproperty_graph.add_edge(row.property1, row.property2)

In [27]:
import networkx as nx
from rdflib.plugins.sparql import prepareQuery

# Every (sub-property, super-property) pair stated in the ontology,
# leaving out statements that relate a property to itself.
subproperty_query = prepareQuery("""
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?property1 ?property2
    WHERE {
        ?property1 rdfs:subPropertyOf ?property2
        FILTER(?property1 != ?property2)
    }
""")

subproperty_graph = nx.Graph()

pairs = [(row.property1, row.property2) for row in rdf_ontology.query(subproperty_query)]

# sub-property -> list of its declared super-properties
sub_properties_dict = {}
for sub_property, super_property in pairs:
    sub_properties_dict.setdefault(sub_property, []).append(super_property)

# Undirected graph: one edge per sub-property statement, plus a node for
# every known property so that unrelated properties form their own component.
G = nx.Graph()
for sub_property, super_properties in sub_properties_dict.items():
    for super_property in super_properties:
        G.add_edge(sub_property, super_property)
for property_uri in properties_dictionary:
    G.add_node(property_uri)

# property -> id of the connected component (property group) it belongs to
properties_connected_components = {}
for index, component in enumerate(nx.connected_components(G)):
    for p in component:
        properties_connected_components[p] = index

In [28]:
for key, value in properties_connected_components.items():
    print(key, value)

http://swat.cse.lehigh.edu/onto/univ-bench.owl#mastersDegreeFrom 0
http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom 0
http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom 0
http://swat.cse.lehigh.edu/onto/univ-bench.owl#undergraduateDegreeFrom 0
http://swat.cse.lehigh.edu/onto/univ-bench.owl#worksFor 1
http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf 1
http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf 1
http://www.w3.org/1999/02/22-rdf-syntax-ns#type 2
http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor 3
http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf 4
http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf 5
http://swat.cse.lehigh.edu/onto/univ-bench.owl#age 6
http://swat.cse.lehigh.edu/onto/univ-bench.owl#emailAddress 7
http://swat.cse.lehigh.edu/onto/univ-bench.owl#hasAlumnus 8
http://swat.cse.lehigh.edu/onto/univ-bench.owl#listedCourse 9
http://swat.cse.lehigh.edu/onto/univ-bench.owl#member 10
http://swat.cse.lehigh.e

In [29]:
# subgraphs = list(nx.connected_components(subproperty_graph))

In [30]:
# properties_groups = {}
#
# for group_id, subgraph in enumerate(subgraphs):
#     for property in subgraph:
#         properties_groups[property] = group_id


In [31]:
# # Find the maximum group_id used so far
# max_group_id = max(properties_groups.values()) if properties_groups else -1
#
# # Add the properties from properties_dictionary to the properties_groups
# for property in properties_dictionary:
#     if property not in properties_groups:
#         max_group_id += 1
#         properties_groups[property] = max_group_id

In [32]:
properties_groups = properties_connected_components

# TESTING DICTIONARIES

In [33]:
for key in properties_dictionary:
    print(key, properties_dictionary[key])

http://www.w3.org/1999/02/22-rdf-syntax-ns#type 1
http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor 2
http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf 3
http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf 4
http://swat.cse.lehigh.edu/onto/univ-bench.owl#age 5
http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom 6
http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom 7
http://swat.cse.lehigh.edu/onto/univ-bench.owl#emailAddress 8
http://swat.cse.lehigh.edu/onto/univ-bench.owl#hasAlumnus 9
http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf 10
http://swat.cse.lehigh.edu/onto/univ-bench.owl#listedCourse 11
http://swat.cse.lehigh.edu/onto/univ-bench.owl#mastersDegreeFrom 12
http://swat.cse.lehigh.edu/onto/univ-bench.owl#member 13
http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf 14
http://swat.cse.lehigh.edu/onto/univ-bench.owl#name 15
http://swat.cse.lehigh.edu/onto/univ-bench.owl#orgPublication 16
http://swat.cse.lehigh.edu/onto

In [34]:
for key in global_resources_dictionary:
    print(key, global_resources_dictionary[key])

http://swat.cse.lehigh.edu/onto/univ-bench.owl#AdministrativeStaff 1
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Article 2
http://swat.cse.lehigh.edu/onto/univ-bench.owl#AssistantProfessor 3
http://swat.cse.lehigh.edu/onto/univ-bench.owl#AssociateProfessor 4
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Book 5
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Chair 6
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Person 7
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Department 8
http://swat.cse.lehigh.edu/onto/univ-bench.owl#ClericalStaff 9
http://swat.cse.lehigh.edu/onto/univ-bench.owl#College 10
http://swat.cse.lehigh.edu/onto/univ-bench.owl#ConferencePaper 11
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Course 12
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Dean 13
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Director 14
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Program 15
http://swat.cse.lehigh.edu/onto/univ-bench.owl#Employee 16
http://swat.cse.lehigh.edu/onto/univ-b

In [35]:
for key in properties_groups:
    print(key, properties_groups[key])

http://swat.cse.lehigh.edu/onto/univ-bench.owl#mastersDegreeFrom 0
http://swat.cse.lehigh.edu/onto/univ-bench.owl#degreeFrom 0
http://swat.cse.lehigh.edu/onto/univ-bench.owl#doctoralDegreeFrom 0
http://swat.cse.lehigh.edu/onto/univ-bench.owl#undergraduateDegreeFrom 0
http://swat.cse.lehigh.edu/onto/univ-bench.owl#worksFor 1
http://swat.cse.lehigh.edu/onto/univ-bench.owl#memberOf 1
http://swat.cse.lehigh.edu/onto/univ-bench.owl#headOf 1
http://www.w3.org/1999/02/22-rdf-syntax-ns#type 2
http://swat.cse.lehigh.edu/onto/univ-bench.owl#advisor 3
http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliateOf 4
http://swat.cse.lehigh.edu/onto/univ-bench.owl#affiliatedOrganizationOf 5
http://swat.cse.lehigh.edu/onto/univ-bench.owl#age 6
http://swat.cse.lehigh.edu/onto/univ-bench.owl#emailAddress 7
http://swat.cse.lehigh.edu/onto/univ-bench.owl#hasAlumnus 8
http://swat.cse.lehigh.edu/onto/univ-bench.owl#listedCourse 9
http://swat.cse.lehigh.edu/onto/univ-bench.owl#member 10
http://swat.cse.lehigh.e

# PROPERTIES GROUPS DONE - CREATE CATALOGUE

In [36]:
import re


def get_lubm_graph_type(s):
    """Derive a graph-type label from a LUBM graph file path.

    Takes the file name (text after the last ``/``), keeps the part after
    its last ``_`` and returns everything before the first digit of that
    part, e.g. ``.../HTTP_..._AssistantProfessor3.nt`` gives
    ``AssistantProfessor``.

    If that part contains no digit it is returned unchanged, instead of
    failing on the missing regex match.
    """
    name = s.split('/')[-1].split('_')[-1]
    # Raw string: "\d" in a normal literal is an invalid escape sequence.
    m = re.search(r"\d", name)
    if m is None:
        return name
    return name[:m.start()]

In [37]:
df_catalogue = pd.DataFrame()
df_catalogue["input_graph_file"] = extracted_data.input_graph_file
df_catalogue["inference_file"] = extracted_data.inference_file

df_catalogue["graph_type"] = df_catalogue.input_graph_file.apply(get_lubm_graph_type)


In [38]:
df_catalogue.head(5)

Unnamed: 0,input_graph_file,inference_file,graph_type
0,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,www.Department
1,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,AssistantProfessor
2,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,Publication
3,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,Publication
4,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,Publication


# CATALOGUE DONE - START ENCODING

In [39]:
# GLOBAL PROPERTIES DICTIONARY - properties_dictionary - already exists
global_properties_dictionary = properties_dictionary  #ResourceDictionary

global_active_properties_dictionary = {}
# GLOBAL RESOURCES DICTIONARY - global_resources_dictionary - already exists #RESOURCEDICTIONARY

global_active_resources_dictionary = {}
# GLOBAL PROPERTY GROUPS DICTIONARY  - properties_groups
global_property_groups_dictionary = properties_groups  # already exists
# LOCAL RESOURCES DICTIONARY [graph name] -> (local props, local resources)
# local_resources_dictionaries = {}
global_local_resources_dictionary = {}
global_property_IDs = {}

In [40]:
from graph_words_encoder import GraphWordsEncoder

In [41]:
graph_words_encoder = GraphWordsEncoder(global_properties_dictionary, global_property_groups_dictionary,
                                        global_resources_dictionary)

# ENCODING DONE - START TESTING

In [42]:
encodings = []
encodings_inference = []

graphs_test = []
inference_test = []
number_of_tests = len(extracted_data.graph)


def compare_dictionaries(dict1, dict2):
    """Return ``(key, dict1_value, dict2_value)`` for each shared key whose values differ.

    A key that exists in only one of the dictionaries is reported with a
    printed message and is not included in the returned list.
    """
    mismatched_pairs = []
    for key in set(dict1.keys()).union(dict2.keys()):
        if key not in dict1:
            print(f"Key {key} is missing in dict1")
            continue
        if key not in dict2:
            print(f"Key {key} is missing in dict2")
            continue
        left, right = dict1[key], dict2[key]
        if left != right:
            mismatched_pairs.append((key, left, right))
    return mismatched_pairs


# Collect the input graphs and their inference graphs, row by row.
graphs_test.extend(extracted_data.graph[i] for i in range(number_of_tests))
inference_test.extend(extracted_data.inference_graph[i] for i in range(number_of_tests))

print(f"Encoding {number_of_tests} graphs")

# The resources dictionary returned for an input graph is passed on to the
# encoding call for its inference graph.
for input_graph, inferred_graph in zip(graphs_test, inference_test):
    encoding, resources_dictionary = graph_words_encoder.encode_graph(input_graph, inference=False)
    encoding_inference, _ = graph_words_encoder.encode_graph(inferred_graph, resources_dictionary, inference=True)
    encodings.append(encoding)
    encodings_inference.append(encoding_inference)


Encoding 17174 graphs


In [43]:
# Check every computed encoding against the reference encoding from the CSV.
# The counter lives outside the loop so it accumulates over ALL graphs; it
# counts comparisons (input or inference) with at least one differing value.
# Keys present on only one side are printed by compare_dictionaries but are
# not counted here.
error_count = 0
for z in range(number_of_tests):
    comparisons = (
        ("input", encodings[z], extracted_data.input_graph_encoding[z]),
        ("inference", encodings_inference[z], extracted_data.inference_graph_encoding[z]),
    )
    for label, encoding1, encoding2 in comparisons:
        mismatched_pairs = compare_dictionaries(encoding1, encoding2)
        if not mismatched_pairs:
            continue

        error_count += 1
        # Print the details only when there is something to show.
        print(f"### Mismatched key-value pairs for layer {z} ({label} graph):\n")
        for key, value1, value2 in mismatched_pairs:
            print(f"#### Key {key}:\n")
            print("  Incorrect value:\n`", value1, "`\n")
            print("  Correct value:\n`", value2, "`\n")
            print()

print(f"Error count: {error_count}\n")


Error count: 0



In [44]:
df_catalogue["input_graph_encoding"] = extracted_data.input_graph_encoding
df_catalogue["inference_graph_encoding"] = extracted_data.inference_graph_encoding

In [45]:
df_catalogue.head(5)

Unnamed: 0,input_graph_file,inference_file,graph_type,input_graph_encoding,inference_graph_encoding
0,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,www.Department,"{1: [(-1, 1)], 2: [(-1, -2)], 3: [(-3, -2), (-...","{1: [(-1, 2), (-682, 2), (-683, 2), (-684, 2),..."
1,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,AssistantProfessor,"{1: [(-1, 3)], 7: [(-1, -2), (-3, -2), (-4, -2...","{1: [(-1, 4), (-1, 5), (-1, 6), (-1, 7), (-19,..."
2,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,Publication,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2), (...","{1: [(-2, 6), (-3, 6), (-4, 6)]}"
3,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,Publication,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2)]}","{1: [(-2, 6)]}"
4,data/lubm1_intact/graphs_with_descriptions/HTT...,data/lubm1_intact/jena_inference_with_descript...,Publication,"{1: [(-1, 8)], 4: [(-1, -2)], 11: [(-1, -2), (...","{1: [(-2, 6), (-3, 6)]}"


# Create graph words

In [46]:
import numpy as np

In [47]:
active_properties_size = len(graph_words_encoder.active_properties)

In [48]:
graph_words_array = np.zeros((len(encodings), active_properties_size), dtype=np.int16)
inference_graph_words_array = np.zeros((len(encodings_inference), active_properties_size), dtype=np.int16)

In [49]:
# Register each layer of every input encoding as a "graph word" and store
# the word's catalogue value in column (layer key - 1) of the graph's row.
graph_words_catalogue = ResourceDictionary()
for graph_index, graph_encoding in enumerate(encodings):
    for k in sorted(graph_encoding):
        # A word is the layer's pairs in (first, second) order, as a tuple.
        tpl = tuple(sorted(graph_encoding[k], key=lambda pair: (pair[0], pair[1])))
        graph_words_catalogue.add(tpl)
        graph_words_array[graph_index, k - 1] = graph_words_catalogue[tpl]

In [50]:
# Same procedure for the inference encodings, using a separate catalogue.
inference_graph_words_catalogue = ResourceDictionary()
for graph_index, graph_encoding in enumerate(encodings_inference):
    for k in sorted(graph_encoding):
        # A word is the layer's pairs in (first, second) order, as a tuple.
        tpl = tuple(sorted(graph_encoding[k], key=lambda pair: (pair[0], pair[1])))
        inference_graph_words_catalogue.add(tpl)
        inference_graph_words_array[graph_index, k - 1] = inference_graph_words_catalogue[tpl]

In [51]:
df_catalogue["input_graph_words"] = graph_words_array.tolist()
df_catalogue["inference_graph_words"] = inference_graph_words_array.tolist()

# Create matrix embedding

In [52]:
MAXIMUM_MATRIX_SIZE = 800
HOPE_EMBEDDING_SIZE = 4

In [53]:
matrix_embedding_catalogue = np.zeros(
    (len(graph_words_catalogue) + 1, MAXIMUM_MATRIX_SIZE * HOPE_EMBEDDING_SIZE))

In [54]:
from sklearn.decomposition import TruncatedSVD


def embed_layer_hope(layer, embedding_dim):
    """Factorise a layer matrix into two embedding matrices via truncated SVD.

    Builds ``S = inv(I - beta * layer) @ (beta * layer)`` with ``beta = 0.01``
    and decomposes it with a ``TruncatedSVD`` of ``embedding_dim`` components.

    Args:
        layer: square 2-D array (it is inverted after being subtracted from
            an identity of size ``layer.shape[0]``).
        embedding_dim: number of SVD components to keep.

    Returns:
        ``(X1, X2)``, each with one row per row/column of ``layer`` and
        ``embedding_dim`` columns.
    """
    # presumably the Katz-proximity formulation used by HOPE -- TODO confirm
    beta = 0.01
    # fixed random_state keeps the decomposition reproducible between runs
    svd = TruncatedSVD(n_components=embedding_dim, random_state=42)
    M_g = np.eye(layer.shape[0]) - beta * layer
    M_l = beta * layer
    S = np.dot(np.linalg.inv(M_g), M_l)
    # NOTE(review): scikit-learn documents fit_transform as returning the
    # reduced X (U scaled by the singular values, not U itself) -- confirm
    # that the additional sqrt(s) scaling applied to X1 below is intended.
    u = svd.fit_transform(S)
    vt = svd.components_
    s = svd.singular_values_
    X1 = np.dot(u, np.diag(np.sqrt(s)))
    X2 = np.dot(vt.T, np.diag(np.sqrt(s)))

    return X1, X2


def reconstruct_layer(u, v, alpha=0.5):
    """Rebuild a binary matrix from two embedding matrices.

    Computes ``u @ v.T`` and thresholds it: entries ``>= alpha`` become 1
    and entries ``< alpha`` become 0.

    Args:
        u: left embedding matrix.
        v: right embedding matrix, with the same number of columns as ``u``.
        alpha: threshold applied to the product.

    Returns:
        The thresholded product, in the dtype of ``u @ v.T``.
    """
    r = np.dot(u, v.T)
    # Take both masks BEFORE writing. Thresholding in place one step after
    # the other would compare the freshly written 1s against alpha again
    # and zero them whenever alpha > 1.
    above = r >= alpha
    below = r < alpha
    r[above] = 1
    r[below] = 0
    return r

In [55]:
from tqdm import tqdm

# Embed every graph word and store the flattened embedding in the row given
# by the word's catalogue value.
for graph_word in tqdm(graph_words_catalogue):
    layer_adjacency_matrix = graph_words_encoder.layer_to_np(graph_word,
                                                             len(graph_words_encoder.active_classes),
                                                             MAXIMUM_MATRIX_SIZE)

    # Half of the embedding size goes to each of the two factor matrices.
    x1, x2 = embed_layer_hope(layer_adjacency_matrix, HOPE_EMBEDDING_SIZE // 2)
    # Both factors are scaled by 100 before being stored and checked.
    x1_scaled = x1 * 100
    x2_scaled = x2 * 100
    reconstructed_layer = reconstruct_layer(x1_scaled, x2_scaled, 0.1)
    matrix_embedding_catalogue[graph_words_catalogue[graph_word]] = np.concatenate(
        (x1_scaled.flatten(), x2_scaled.flatten()), axis=0)
    # Sanity check: the thresholded product should give back the layer matrix.
    if not np.allclose(layer_adjacency_matrix, reconstructed_layer):
        # The original message carried a stray "%" placeholder that was never
        # formatted; the size is now interpolated properly.
        print("Reconstruction layer failed, increase hope embedding size. "
              f"Current embedding size = {HOPE_EMBEDDING_SIZE}")

100%|██████████| 130/130 [01:34<00:00,  1.37it/s]


In [56]:
print(f"Loaded LUBM1 graph words dataset of size {len(df_catalogue)}")

Loaded LUBM1 graph words dataset of size 17174


In [57]:
from sklearn.model_selection import train_test_split

In [58]:
def train_validate_test_split(df, train_percent=0.6, validate_percent=0.2, stratify=None, seed=1):
    """Split *df* into train, validation and test frames.

    The frame is split twice with ``train_test_split``: first into the
    training part and the remainder, then the remainder into validation
    and test. Both splits use *seed* as ``random_state``.

    Args:
        df: frame to split.
        train_percent: fraction of rows for the training set.
        validate_percent: fraction of rows for the validation set; the test
            set receives whatever is left.
        stratify: optional column selector; when truthy, both splits are
            stratified on ``frame[stratify]``.
        seed: random state for both splits.

    Returns:
        ``(df_train, df_val, df_test)``.
    """
    val_test_percent = 1 - train_percent
    test_percent = (1 - (train_percent + validate_percent))
    # share of the held-out remainder that becomes the test set
    test_percent = test_percent / (test_percent + validate_percent)

    def _split(frame, held_out_fraction):
        labels = frame[stratify] if stratify else None
        return train_test_split(frame, test_size=held_out_fraction, random_state=seed,
                                stratify=labels)

    df_train, df_val_test = _split(df, val_test_percent)
    df_val, df_test = _split(df_val_test, test_percent)
    return df_train, df_val, df_test

In [59]:
TRAINING_SET_PERCENT = 0.6
VALIDATION_SET_PERCENT = 0.2

rdf_data_train, rdf_data_val, rdf_data_test = train_validate_test_split(df_catalogue,
                                                                        train_percent=TRAINING_SET_PERCENT,
                                                                        validate_percent=VALIDATION_SET_PERCENT,
                                                                        stratify="graph_type")

In [60]:
print(
    f"Splitting dataset into train, validation and test sets with {TRAINING_SET_PERCENT * 100}%, {VALIDATION_SET_PERCENT * 100}% and {100 - TRAINING_SET_PERCENT * 100 - VALIDATION_SET_PERCENT * 100}% respectively")

Splitting dataset into train, validation and test sets with 60.0%, 20.0% and 20.0% respectively


In [61]:
def to_categorical(y, num_classes=None):
    """Converts a class vector (integers) to binary class matrix.
    E.g. for use with categorical_crossentropy.
    # Arguments
        y: class vector to be converted into a matrix
            (integers from 0 to num_classes).
        num_classes: total number of classes; when falsy it is taken
            as ``max(y) + 1``.
    # Returns
        A binary (boolean) matrix representation of the input, shaped like
        ``y`` with a trailing axis of size ``num_classes`` (a trailing
        axis of size 1 in ``y`` is dropped first).
    """
    # The builtin int/bool replace np.int/np.bool: those aliases were removed
    # from NumPy (1.24) and raise AttributeError there.
    y = np.array(y, dtype=int)
    input_shape = y.shape
    if input_shape and input_shape[-1] == 1 and len(input_shape) > 1:
        input_shape = tuple(input_shape[:-1])
    y = y.ravel()
    if not num_classes:
        num_classes = np.max(y) + 1
    n = y.shape[0]
    categorical = np.zeros((n, num_classes), dtype=bool)
    # one True per row, in the column given by that row's class index
    categorical[np.arange(n), y] = 1
    output_shape = input_shape + (num_classes,)
    categorical = np.reshape(categorical, output_shape)
    return categorical

In [62]:
def get_target_vocab_size(df_train):
    """Return one more than the largest value in the ``inference_graph_words`` column.

    The column's rows are stacked into a single array before the maximum
    is taken.
    """
    return np.stack(df_train['inference_graph_words']).max() + 1

In [63]:
def create_input_target_arrays(rdf_dataframe, embed_matrix, vocab_size):
    """Build the model input and target arrays from a catalogue frame.

    Args:
        rdf_dataframe: frame with ``input_graph_words`` and
            ``inference_graph_words`` columns.
        embed_matrix: array indexed by the stacked input graph words.
        vocab_size: number of classes passed to ``to_categorical``.

    Returns:
        ``(x_input, y_target_categorical)``: the rows of *embed_matrix*
        selected by the input words, and the categorical form of the
        stacked inference words.
    """
    input_word_ids = np.stack(rdf_dataframe['input_graph_words'])
    target_word_ids = np.stack(rdf_dataframe['inference_graph_words'])
    return embed_matrix[input_word_ids], to_categorical(target_word_ids, vocab_size)

In [64]:
from keras.layers import Input, Dense, Dropout, Bidirectional, GRU, RepeatVector, TimeDistributed
from keras.models import Model

2023-05-06 21:08:26.768077: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-06 21:08:27.328175: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-06 21:08:27.328192: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-05-06 21:08:28.586032: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [68]:
from nn_utils import CSVLoggerTimed, to_categorical, true_acc
from keras.callbacks import TensorBoard, ModelCheckpoint

In [69]:
def create_graph_words_translation_model(x, y):
    """Build and compile the graph-words sequence-to-sequence model.

    Only the shapes of the arguments are used: `x.shape[1]` and `x.shape[2]`
    define the input layer, `y.shape[1]` the number of decoder steps and
    `y.shape[2]` the width of the softmax output.

    Returns the compiled Keras `Model` (adam optimizer, categorical
    cross-entropy loss, metrics 'accuracy' and `true_acc`).
    """
    # Encoder: per-step dense projection -> dropout -> bidirectional GRU -> dropout.
    encoder_input = Input(shape=(x.shape[1], x.shape[2]), name='input_graph_words_sequence')
    encoded = Dense(256, name='graph_input_dense')(encoder_input)
    encoded = Dropout(0.2, name='graph_dropout1')(encoded)
    encoded = Bidirectional(GRU(128, name="gru_sequence_encoder"), name='bidirectional')(encoded)
    encoded = Dropout(0.2, name='graph_dropout2')(encoded)

    # Decoder: repeat the encoding once per output step, then GRU -> dropout -> softmax per step.
    decoded = RepeatVector(y.shape[1], name="repeat_vector")(encoded)
    decoded = GRU(128, return_sequences=True, name='sequence_decoder')(decoded)
    decoded = Dropout(0.2, name="output_dropout")(decoded)
    softmax_layer = Dense(y.shape[2], name="softmax_layer", activation='softmax')
    decoded = TimeDistributed(softmax_layer, name="inference_graph_words_sequence")(decoded)

    model = Model(encoder_input, decoded)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy', true_acc])
    return model

In [70]:
# The target vocabulary size is derived from the training split and reused for every split.
vocab_size = get_target_vocab_size(rdf_data_train)

# Build (inputs, one-hot targets) for the train / validation / test splits,
# all embedded with matrix_embedding_catalogue.
(x_train, y_train), (x_val, y_val), (x_test, y_test) = (
    create_input_target_arrays(split_df, matrix_embedding_catalogue, vocab_size)
    for split_df in (rdf_data_train, rdf_data_val, rdf_data_test)
)

inference_model = create_graph_words_translation_model(x_train, y_train)

In [71]:
import logging

# NOTE(review): logging.info only produces output if the root logger is configured at
# INFO level or lower (e.g. via logging.basicConfig); no such configuration is visible
# in this part of the notebook — confirm.
logging.info("Created NN model: ")
# Print the layer-by-layer architecture with output shapes and parameter counts.
inference_model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_graph_words_sequence   [(None, 18, 3200)]       0         
 (InputLayer)                                                    
                                                                 
 graph_input_dense (Dense)   (None, 18, 256)           819456    
                                                                 
 graph_dropout1 (Dropout)    (None, 18, 256)           0         
                                                                 
 bidirectional (Bidirectiona  (None, 256)              296448    
 l)                                                              
                                                                 
 graph_dropout2 (Dropout)    (None, 256)               0         
                                                                 
 repeat_vector (RepeatVector  (None, 18, 256)          0   

In [72]:
# Training hyperparameters passed to inference_model.fit.
EPOCHS = 10  # NOTE(review): 200 looks like the intended full-run value — confirm
BATCH_SIZE = 32

In [73]:
# Output locations used by the training callbacks: LOGGING_FOLDER receives the CSV
# training log and the TensorBoard logs, MODEL_FOLDER the model checkpoint.
LOGGING_FOLDER = "logs"
MODEL_FOLDER = "models"

In [74]:
import os

# Create the output folders up front so the callbacks do not depend on them already existing.
os.makedirs(LOGGING_FOLDER, exist_ok=True)
os.makedirs(MODEL_FOLDER, exist_ok=True)

# CSV training log written to LOGGING_FOLDER.
# NOTE(review): CSVLoggerTimed comes from nn_utils; presumably it mirrors keras' CSVLogger — confirm.
csv_logger = CSVLoggerTimed(LOGGING_FOLDER + '/training.csv', separator=',', append=True)
# TensorBoard logs go to the same logging folder.
tensorboard = TensorBoard(log_dir=LOGGING_FOLDER)
# Checkpoint only the weights, and only when val_loss improves.
# `save_freq='epoch'` replaces the deprecated `period=1` argument (same once-per-epoch
# schedule); newer Keras releases no longer accept `period`.
modelcheckpoint = ModelCheckpoint(MODEL_FOLDER + '/best_model', monitor='val_loss', verbose=0, save_best_only=True,
                                  save_weights_only=True, mode='auto', save_freq='epoch')





In [1]:
# Train the model on the training arrays, validating on (x_val, y_val), with the CSV logger,
# TensorBoard and checkpoint callbacks attached.
# NOTE(review): the NameError recorded for this cell (execution count 1) suggests it was run
# in a fresh kernel before the earlier cells (including `import logging`) — re-run those first.
logging.info("Starting training for %s epochs ", EPOCHS)
inference_model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=EPOCHS, batch_size=BATCH_SIZE,
                    callbacks=[csv_logger, tensorboard, modelcheckpoint])
logging.info("Finished training")


NameError: name 'logging' is not defined

In [None]:
# Evaluate on the held-out test split and report the value of the 'true_acc' metric.
logging.info("Evaluating on test set")
test_eval = inference_model.evaluate(x_test, y_test)
# evaluate() results are matched to metric names by position.
true_acc_index = inference_model.metrics_names.index('true_acc')
logging.info("Test set accuracy: %s", test_eval[true_acc_index])