In [None]:
###Version 3

In [29]:
# List of class names to check
# initial_class_names_to_check = [ 'Compression','AmperePerJoule','nfdi','stress', 'Advertiser+content_Article', 'Tensiletest']
initial_class_names_to_check = [ 'biochemicalreaction']

In [30]:
# List of ontology files to process
ontology_files = [
    # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    "../Ontologies/pmdco_core.ttl",
    # "../Ontologies/nfdicore_2.ttl",
    # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    # "../Ontologies/emmo.ttl",
    # "../Ontologies/owlapi.xrdf",
    # "../Ontologies/schemaorg.owl",
    # "../Ontologies/MaterialsMine.xrdf",
    # '../Ontologies/emmo.owl', ### has problem of reading file
    # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    # "../Ontologies/oboe.owl",
    # Add more file paths as needed
]

In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# dcterms = Namespace("http://purl.org/dc/terms/")
# isPartOf = dcterms.isPartOf


ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
# isPartOf = Namespace("http://semanticscience.org/resource/isPartOf")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

# print(isPartOf)

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_description(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return descriptions[0] if descriptions else None


def load_and_collect_classes_and_relations(file_path, class_names_to_check):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g = Graph()
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    DCTERMS = Namespace("http://purl.org/dc/terms/")
    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
        description = get_class_description(g, cls)
        for label in labels:
            normalized_label = normalize_string(label)
            found_class_labels.add(normalized_label)
            if normalized_label in normalized_class_names_to_check:
                data.append([file_path, str(cls), str(label), str(description)])
                for obj in g.objects(cls, RDFS.subClassOf):
                    obj_label = get_class_label(g, obj)
                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label)])
                for obj in g.objects(cls, OWL.equivalentClass):
                    obj_label = get_class_label(g, obj)
                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label)])
                for obj in g.objects(cls, DCTERMS.isPartOf):
                    obj_label = get_class_label(g, obj)
                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_description(g, URIRef(relation[1]))
                object_description = get_class_description(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}

    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, Graph(), writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")



In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_description(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return descriptions[0] if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g = Graph()
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
        description = get_class_description(g, cls)
        for label in labels:
            normalized_label = normalize_string(label)
            found_class_labels.add(normalized_label)
            if normalized_label in normalized_class_names_to_check:
                data.append([file_path, str(cls), str(label), str(description)])
                for obj in g.objects(cls, RDFS.subClassOf):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                for obj in g.objects(cls, OWL.equivalentClass):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                for obj in g.objects(cls, DCTERMS.isPartOf):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_description(g, URIRef(relation[1]))
                object_description = get_class_description(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}

    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

# Filter and save class data
filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

# Filter and save class relations
filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, Graph(), writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_description(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return descriptions[0] if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g = Graph()
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
        description = get_class_description(g, cls)
        for label in labels:
            normalized_label = normalize_string(label)
            found_class_labels.add(normalized_label)
            if normalized_label in normalized_class_names_to_check:
                data.append([file_path, str(cls), str(label), str(description)])
                for obj in g.objects(cls, RDFS.subClassOf):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                for obj in g.objects(cls, OWL.equivalentClass):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                for obj in g.objects(cls, DCTERMS.isPartOf):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_description(g, URIRef(relation[1]))
                object_description = get_class_description(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}

    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

# Filter and save class data
filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

# Filter and save class relations
filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, Graph(), writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_description(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return descriptions[0] if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g = Graph()
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
        description = get_class_description(g, cls)
        for label in labels:
            normalized_label = normalize_string(label)
            found_class_labels.add(normalized_label)
            if normalized_label in normalized_class_names_to_check:
                data.append([file_path, str(cls), str(label), str(description)])
                for obj in g.objects(cls, RDFS.subClassOf):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                for obj in g.objects(cls, OWL.equivalentClass):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                for obj in g.objects(cls, DCTERMS.isPartOf):
                    obj_label = get_class_label(g, obj)
                    obj_description = get_class_description(g, obj)
                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_description(g, URIRef(relation[1]))
                object_description = get_class_description(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}

    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

# Filter and save class data
filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

# Filter and save class relations
filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, Graph(), writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
#### removing blank nodes

In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_description(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return descriptions[0] if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g = Graph()
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_description(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_description(g, obj)
                            relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_description(g, obj)
                            relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_description(g, obj)
                            relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_description(g, URIRef(relation[1]))
                object_description = get_class_description(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}

    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

# Filter and save class data
filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

# Filter and save class relations
filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, Graph(), writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_description(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions]) if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g = Graph()
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_description(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_description(g, obj)
                            relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_description(g, obj)
                            relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_description(g, obj)
                            relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

# def print_hierarchy(class_name, relations, g, writer):
#     def recursive_print(class_name, depth=0):
#         for relation in relations:
#             if normalize_string(relation[2]) == normalize_string(class_name):
#                 subject_description = get_class_description(g, URIRef(relation[1]))
#                 object_description = get_class_description(g, URIRef(relation[4]))
#                 writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
#                 indent = '  ' * depth
#                 print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
#                 recursive_print(relation[5], depth + 1)

#     recursive_print(class_name)

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_description(g, URIRef(relation[1]))
                object_description = get_class_description(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)
    recursive_print(class_name)

    

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

# max_iterations = 2
# iteration_count = 0

max_iterations = 2
iteration_count = 0
g = Graph()  # Initialize the RDF graph here

# while class_names_to_check and iteration_count < max_iterations:
#     iteration_count += 1
#     new_data = []
#     new_relations = []
#     new_found_class_labels = set()

#     for ontology_file in ontology_files:
#         file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check)
#         new_data.extend(file_data)
#         new_relations.extend(file_relations)
#         new_found_class_labels.update(found_class_labels)

#     all_data.extend(new_data)
#     all_relations.extend(new_relations)
#     all_found_class_labels.update(new_found_class_labels)

#     class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}

#     class_names_to_check = [normalize_string(name) for name in class_names_to_check]

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]


# Filter and save class data
filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

# Filter and save class relations
filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

# print("\nSaving class hierarchy to CSV file:")
# with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
#     writer = csv.writer(file)
#     writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
#     for class_name in initial_class_names_to_check:
#         normalized_class_name = normalize_string(class_name)
#         print_hierarchy(normalized_class_name, all_relations, g, writer)

# print(f"Class hierarchy has been saved to {output_hierarchy_file}")


# Save class hierarchy to CSV file
print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)
print(f"Class hierarchy has been saved to {output_hierarchy_file}")

In [None]:
###Finally fixed!

In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions]) if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g = Graph()
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels, g

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
graph = None

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels, g = load_and_collect_classes_and_relations(ontology_file, class_names_to_check)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    graph = g  # Ensure we keep the last loaded graph for hierarchy printing

# Filter and save class data
filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

# Filter and save class relations
filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels, _ = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, graph, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
### above code works only for single input file to capture descriptions

In [None]:
### here I try to fix the above isuue

In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces (unchanged)

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions]) if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

# Filter and save class data
filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

# Filter and save class relations
filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
### above code get classes several times and iterate several times!

In [None]:
### Here the issue is solved :)

In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions]) if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    # Check if already processed in this iteration
                    if str(cls) not in processed_classes:
                        processed_classes.add(str(cls))
                        data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

# Filter and save class data
filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Class URI", "Label", "Description"])
    writer.writerows(filtered_data)

# Filter and save class relations
filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
    writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)


print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions]) if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    # Check if already processed in this iteration
                    if str(cls) not in processed_classes:
                        processed_classes.add(str(cls))
                        data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)


print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
#### extra feature:   adding -----------  after finding a new matched class to csv files

In [None]:
# def write_class_data(file, data, initial_class_names_to_check):
#     writer = csv.writer(file)
#     writer.writerow(["File", "Class URI", "Label", "Description"])
#     last_class_name = None
#     for row in data:
#         class_name = row[2]
#         normalized_class_name = normalize_string(class_name)
#         if last_class_name is None or normalized_class_name in [normalize_string(name) for name in initial_class_names_to_check]:
#             if last_class_name is not None:
#                 writer.writerow(["------", "------", "------", "------"])  # Insert separator
#             last_class_name = normalized_class_name
#         writer.writerow(row)

# def write_relations_data(file, relations, initial_class_names_to_check):
#     writer = csv.writer(file)
#     writer.writerow(["File", "Subject Class URI", "Subject Label", "Relation", "Object Class URI", "Object Label", "Object Description"])
#     last_class_name = None
#     for relation in relations:
#         subject_label = relation[2]
#         object_label = relation[5]
#         if last_class_name is None or normalize_string(subject_label) in [normalize_string(name) for name in initial_class_names_to_check]:
#             if last_class_name is not None:
#                 writer.writerow(["------", "------", "------", "------", "------", "------", "------"])  # Insert separator
#             last_class_name = normalize_string(subject_label)
#         writer.writerow(relation)

# def write_hierarchy_data(file, initial_class_names_to_check, all_relations, g):
#     writer = csv.writer(file)
#     writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
#     last_class_name = None
#     for class_name in initial_class_names_to_check:
#         normalized_class_name = normalize_string(class_name)
#         if last_class_name is None or normalized_class_name in [normalize_string(name) for name in initial_class_names_to_check]:
#             if last_class_name is not None:
#                 writer.writerow(["------", "------", "------", "------", "------", "------", "------", "------"])  # Insert separator
#             last_class_name = normalized_class_name
#         print_hierarchy(normalized_class_name, all_relations, g, writer)

# # Save class data
# filtered_data = [row for row in all_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
# with open(class_output_file, mode='w', newline='', encoding='utf-8') as file:
#     write_class_data(file, filtered_data, initial_class_names_to_check)

# # Save class relations
# filtered_relations = filter_relations(all_relations, initial_class_names_to_check)
# with open(relations_output_file, mode='w', newline='', encoding='utf-8') as file:
#     write_relations_data(file, filtered_relations, initial_class_names_to_check)

# # Save class hierarchy
# with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
#     write_hierarchy_data(file, initial_class_names_to_check, all_relations, g)

# print(f"Filtered class data has been saved to {class_output_file}")
# print(f"Filtered class relations have been saved to {relations_output_file}")
# print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [None]:
#### Find the childeren!!!

In [None]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions]) if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    # Check if already processed in this iteration
                    if str(cls) not in processed_classes:
                        processed_classes.add(str(cls))
                        data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                        elif (obj, RDF.type, OWL.Restriction) in g:
                            # Handle the case where equivalentClass is a Restriction
                            on_property = list(g.objects(obj, OWL.onProperty))
                            some_values_from = list(g.objects(obj, OWL.someValuesFrom))
                            all_values_from = list(g.objects(obj, OWL.allValuesFrom))
                            if on_property and (some_values_from or all_values_from):
                                on_property_label = get_class_label(g, on_property[0]) if on_property else None
                                some_values_from_label = get_class_label(g, some_values_from[0]) if some_values_from else None
                                all_values_from_label = get_class_label(g, all_values_from[0]) if all_values_from else None
                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(on_property[0])) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(on_property[0])))
                                    relations.append([
                                        file_path, str(cls), str(label), 'equivalentClass', 
                                        f"Restriction on {on_property[0]} (onProperty: {on_property_label}, someValuesFrom: {some_values_from_label}, allValuesFrom: {all_values_from_label})", 
                                        "", ""
                                    ])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 3
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


In [4]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions]) if descriptions else None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    # Check if already processed in this iteration
                    if str(cls) not in processed_classes:
                        processed_classes.add(str(cls))
                        data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label), str(obj_description)])
                        elif (obj, RDF.type, OWL.Restriction) in g:
                            # Handle the case where equivalentClass is a Restriction
                            on_property = list(g.objects(obj, OWL.onProperty))
                            some_values_from = list(g.objects(obj, OWL.someValuesFrom))
                            all_values_from = list(g.objects(obj, OWL.allValuesFrom))
                            if on_property and (some_values_from or all_values_from):
                                on_property_label = get_class_label(g, on_property[0]) if on_property else None
                                some_values_from_label = get_class_label(g, some_values_from[0]) if some_values_from else None
                                all_values_from_label = get_class_label(g, all_values_from[0]) if all_values_from else None
                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(on_property[0])) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(on_property[0])))
                                    relations.append([
                                        file_path, str(cls), str(label), 'equivalentClass', 
                                        f"Restriction on {on_property[0]} (onProperty: {on_property_label}, someValuesFrom: {some_values_from_label}, allValuesFrom: {all_values_from_label})", 
                                        "", ""
                                    ])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 3
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'Tensiletest' found in:
- ../Ontologies/materialsmine_converted.ttl


Restriction on https://w3id.org/pmd/co/participant (onProperty: has participant, someValuesFrom: Tensile Testing Machine, allValuesFrom: None) does not look like a valid URI, trying to serialize this will break.


- ../Ontologies/pmdco_core.ttl

Saving class hierarchy to CSV file:
https://w3id.org/pmd/co/TensileTest Tensile Test is subClassOf https://w3id.org/pmd/co/MechanicalTestingProcess Mechanical Testing Process (from ../Ontologies/pmdco_core.ttl)
  https://w3id.org/pmd/co/MechanicalTestingProcess Mechanical Testing Process is subClassOf https://w3id.org/pmd/co/AnalysingProcess Analyseprozess (from ../Ontologies/materialsmine_converted.ttl)
    https://w3id.org/pmd/co/AnalysingProcess Analyseprozess is subClassOf https://w3id.org/pmd/co/Process Process (from ../Ontologies/materialsmine_converted.ttl)
      https://w3id.org/pmd/co/Process Process is subClassOf http://www.w3.org/ns/prov#Activity None (from ../Ontologies/materialsmine_converted.ttl)
      http://semanticscience.org/resource/Process process is subClassOf http://semanticscience.org/resource/Entity entity (from ../Ontologies/materialsmine_converted.ttl)
        http://semanticscience.org/resource/Entity entity is subClassOf http:

In [5]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    if s is None:
        return ''
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions]) if descriptions else None

def parse_equivalent_class(g, equivalent_class):
    if isinstance(equivalent_class, BNode):
        if (equivalent_class, RDF.type, OWL.Restriction) in g:
            on_property = g.value(equivalent_class, OWL.onProperty)
            some_values_from = g.value(equivalent_class, OWL.someValuesFrom)
            all_values_from = g.value(equivalent_class, OWL.allValuesFrom)
            return f"Restriction (onProperty: {on_property}, someValuesFrom: {some_values_from}, allValuesFrom: {all_values_from})"
        elif (equivalent_class, RDF.type, OWL.Class) in g:
            union_of = g.value(equivalent_class, OWL.unionOf)
            if union_of:
                union_classes = list(union_of)
                return f"Union of: ({', '.join(str(cls) for cls in union_classes)})"
    else:
        return str(equivalent_class)

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                normalized_label = normalize_string(label)
                found_class_labels.add(normalized_label)
                if normalized_label in normalized_class_names_to_check:
                    # Check if already processed in this iteration
                    if str(cls) not in processed_classes:
                        processed_classes.add(str(cls))
                        data.append([file_path, str(cls), str(label), str(description)])
                    for obj in g.objects(cls, RDFS.subClassOf):
                        if isinstance(obj, URIRef):  # Check if the object is a URI
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label), str(obj_description)])
                    for obj in g.objects(cls, OWL.equivalentClass):
                        eq_class_desc = parse_equivalent_class(g, obj)
                        obj_label = get_class_label(g, obj) if isinstance(obj, URIRef) else None
                        obj_description = get_class_descriptions(g, obj) if isinstance(obj, URIRef) else None
                        # Check if already processed in this iteration
                        if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                            processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                            relations.append([file_path, str(cls), str(label), 'equivalentClass', eq_class_desc, obj_label, obj_description])
                    for obj in g.objects(cls, DCTERMS.isPartOf):
                        if isinstance(obj, URIRef):
                            obj_label = get_class_label(g, obj)
                            obj_description = get_class_descriptions(g, obj)
                            # Check if already processed in this iteration
                            if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label), str(obj_description)])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description, relation[3], relation[4], relation[5], object_description, relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 3
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'Tensiletest' found in:
- ../Ontologies/materialsmine_converted.ttl


Restriction (onProperty: https://w3id.org/pmd/co/participant, someValuesFrom: https://w3id.org/pmd/co/TensileTestingMachine, allValuesFrom: None) does not look like a valid URI, trying to serialize this will break.
Restriction (onProperty: https://w3id.org/pmd/co/participant, someValuesFrom: https://w3id.org/pmd/co/TestPiece, allValuesFrom: None) does not look like a valid URI, trying to serialize this will break.


- ../Ontologies/pmdco_core.ttl

Saving class hierarchy to CSV file:
https://w3id.org/pmd/co/TensileTest Tensile Test is subClassOf https://w3id.org/pmd/co/MechanicalTestingProcess Mechanical Testing Process (from ../Ontologies/pmdco_core.ttl)
  https://w3id.org/pmd/co/MechanicalTestingProcess Mechanical Testing Process is subClassOf https://w3id.org/pmd/co/AnalysingProcess Analyseprozess (from ../Ontologies/materialsmine_converted.ttl)
    https://w3id.org/pmd/co/AnalysingProcess Analyseprozess is subClassOf https://w3id.org/pmd/co/Process Process (from ../Ontologies/materialsmine_converted.ttl)
      https://w3id.org/pmd/co/Process Process is subClassOf http://www.w3.org/ns/prov#Activity None (from ../Ontologies/materialsmine_converted.ttl)
      http://semanticscience.org/resource/Process process is subClassOf http://semanticscience.org/resource/Entity entity (from ../Ontologies/materialsmine_converted.ttl)
        http://semanticscience.org/resource/Entity entity is subClassOf http:

In [24]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None

def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        union = list(g.objects(node, OWL.unionOf))
        if union:
            members = list(g.items(union[0]))
            labels = [get_class_label(g, m) for m in members if get_class_label(g, m) is not None]
            return f"Union of {' and '.join(labels)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)
                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])
                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 3
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'Tensiletest' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
Class 'biochemicalreaction' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl

Saving class hierarchy to CSV file:


Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


https://w3id.org/pmd/co/TensileTest Tensile Test is subClassOf https://w3id.org/pmd/co/MechanicalTestingProcess Mechanical Testing Process (from ../Ontologies/pmdco_core.ttl)
  https://w3id.org/pmd/co/MechanicalTestingProcess Mechanical Testing Process is subClassOf https://w3id.org/pmd/co/AnalysingProcess Analyseprozess (from ../Ontologies/materialsmine_converted.ttl)
    https://w3id.org/pmd/co/AnalysingProcess Analyseprozess is subClassOf https://w3id.org/pmd/co/Process Process (from ../Ontologies/materialsmine_converted.ttl)
      https://w3id.org/pmd/co/Process Process is subClassOf http://www.w3.org/ns/prov#Activity  (from ../Ontologies/materialsmine_converted.ttl)
      http://semanticscience.org/resource/Process process is subClassOf http://semanticscience.org/resource/Entity entity (from ../Ontologies/materialsmine_converted.ttl)
        http://semanticscience.org/resource/Entity entity is subClassOf http://www.w3.org/2002/07/owl#Thing  (from ../Ontologies/materialsmine_conver

In [None]:
#### funtion to capture intersectionof

In [45]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None

def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        intersection = list(g.objects(node, OWL.intersectionOf))
        if intersection:
            components = []
            for item in g.items(intersection[0]):
                if isinstance(item, URIRef):
                    component_label = get_class_label(g, item)
                    if component_label:
                        components.append(component_label)
                elif isinstance(item, BNode):
                    restriction_label = get_complex_expression_label(g, item)
                    if restriction_label:
                        components.append(restriction_label)
            if components:
                return f"Intersection of {' and '.join(components)}"
    return None



def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)
                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])
                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

            # Check for owl:intersectionOf
            intersections = list(g.objects(cls, OWL.intersectionOf))
            if intersections:
                for intersection in intersections:
                    if isinstance(intersection, BNode):
                        components = []
                        for item in g.items(intersection):
                            if isinstance(item, URIRef):
                                component_label = get_class_label(g, item)
                                if component_label:
                                    components.append(component_label)
                            elif isinstance(item, BNode):
                                restriction_label = get_complex_expression_label(g, item)
                                if restriction_label:
                                    components.append(restriction_label)
                        if components:
                            data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 3
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

def save_intersection_info_to_csv(intersection_data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        writer.writerows(intersection_data)


# Inside your main loop where you process ontology files:
# Add after collecting data and relations
intersection_data = []
for file_path in ontology_files:
    file_data, _, _ = load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations)
    intersection_data.extend(file_data)

save_intersection_info_to_csv(intersection_data, "intersection_info.csv")



Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'biochemicalreaction' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl

Saving class hierarchy to CSV file:
http://semanticscience.org/resource/BiochemicalReaction biochemical reaction is subClassOf http://semanticscience.org/resource/CatalyzedReaction catalyzed reaction (from ../Ontologies/materialsmine_converted.ttl)
  http://semanticscience.org/resource/CatalyzedReaction catalyzed reaction is subClassOf http://semanticscience.org/resource/ChemicalReaction chemical reaction (from ../Ontologies/materialsmine_converted.ttl)
    http://semanticscience.org/resource/ChemicalReaction chemical reaction is subClassOf http://semanticscience.org/resource/ChemicalInteraction chemical interaction (from ../Ontologies/materialsmine_converted.ttl)
      http://semanticscience.org/resource/Che

In [22]:
##### Bnode(); not working correctly

In [20]:
import csv
import re
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode

# Define namespaces
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None

def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        union = list(g.objects(node, OWL.unionOf))
        if union:
            members = list(g.items(union[0]))
            labels = [get_class_label(g, m) for m in members if get_class_label(g, m) is not None]
            return f"Union of {' and '.join(labels)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()
    for cls in classes:
        if isinstance(cls, (URIRef, BNode)):  # Check if the subject is a URI or BNode
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)
            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)
                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])
                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, (URIRef, BNode)):  # Check if the object is a URI or BNode
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, (URIRef, BNode)):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, (URIRef, BNode)):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)
                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

    return data, relations, found_class_labels

def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]

def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 3
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")


Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'biochemicalreaction' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl

Saving class hierarchy to CSV file:
http://semanticscience.org/resource/BiochemicalReaction biochemical reaction is subClassOf http://semanticscience.org/resource/CatalyzedReaction catalyzed reaction (from ../Ontologies/materialsmine_converted.ttl)
  http://semanticscience.org/resource/CatalyzedReaction catalyzed reaction is subClassOf http://semanticscience.org/resource/ChemicalReaction chemical reaction (from ../Ontologies/materialsmine_converted.ttl)
    http://semanticscience.org/resource/ChemicalReaction chemical reaction is subClassOf http://semanticscience.org/resource/ChemicalInteraction chemical interaction (from ../Ontologies/materialsmine_converted.ttl)
      http://semanticscience.org/resource/Che

In [16]:
#### Testing
### getting more info from the ontology

In [15]:
from rdflib import Graph, Namespace, RDF, RDFS, OWL, URIRef, BNode
import csv

# Define the namespaces
SIO = Namespace("http://semanticscience.org/resource/")
CO = Namespace("https://w3id.org/pmd/co/")
EX = Namespace("http://example.org/")

# Initialize the graph
g = Graph()

# Parse the Turtle files
ttl_data_1 = """@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix dc11: <http://purl.org/dc/elements/1.1/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix bibo: <http://purl.org/ontology/bibo/> .
@prefix vann: <http://purl.org/vocab/vann/> .
@prefix schema: <http://schema.org/> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix vs: <http://www.w3.org/2003/06/sw-vocab-status/ns#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix prov: <http://www.w3.org/ns/prov#> .
@prefix foaf: <http://xmlns.com/foaf/0.1/> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix unit: <http://qudt.org/vocab/unit/> .

<http://semanticscience.org/resource/Biopolymer> a owl:Class ;
	owl:equivalentClass [
		rdf:type owl:Class ;
		owl:intersectionOf (
			<http://semanticscience.org/resource/OrganicPolymer>
			[
				rdf:type owl:Restriction ;
				owl:onProperty <http://semanticscience.org/resource/hasDirectPart> ;
				owl:someValuesFrom [
					rdf:type owl:Class ;
					owl:unionOf (
						<http://semanticscience.org/resource/AminoAcidResidue>
						<http://semanticscience.org/resource/CarbohydrateResidue>
						<http://semanticscience.org/resource/LipidResidue>
						<http://semanticscience.org/resource/NucleotideResidue>
					) ;
				] ;
			]
		) ;
	] ;
	rdfs:subClassOf <http://semanticscience.org/resource/OrganicPolymer>, [
		rdf:type owl:Restriction ;
		owl:onProperty <http://semanticscience.org/resource/hasDirectPart> ;
		owl:someValuesFrom [
			rdf:type owl:Class ;
			owl:unionOf (
				<http://semanticscience.org/resource/AminoAcidResidue>
				<http://semanticscience.org/resource/CarbohydrateResidue>
				<http://semanticscience.org/resource/LipidResidue>
				<http://semanticscience.org/resource/NucleotideResidue>
			) ;
		] ;
	] ;
	<http://data.bioontology.org/metadata/prefixIRI> "sio:Biopolymer" ;
	dcterms:description "A biopolymer is an organic polymer that are typically produced by the cells of living organisms."@en ;
	rdfs:isDefinedBy <http://semanticscience.org/ontology/sio/v1.53/sio-subset-labels.owl> ;
	rdfs:label "biopolymer"@en .
"""

ttl_data_2 = """@prefix : <https://w3id.org/pmd/co/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@base <https://w3id.org/pmd/co/> .

:ISO6892-1TensileTest rdf:type owl:Class ;
                      owl:equivalentClass [ rdf:type owl:Restriction ;
                                            owl:onProperty :participant ;
                                            owl:someValuesFrom :Extensometer
                                          ] ,
                                          [ rdf:type owl:Restriction ;
                                            owl:onProperty :participant ;
                                            owl:someValuesFrom :LoadCell
                                          ] ,
                                          [ rdf:type owl:Restriction ;
                                            owl:onProperty :participant ;
                                            owl:someValuesFrom :TensileTestingMachine
                                          ] ,
                                          [ rdf:type owl:Restriction ;
                                            owl:onProperty :participant ;
                                            owl:someValuesFrom :TestPiece
                                          ] ;
                      rdfs:subClassOf :TensileTest ;
                      rdfs:isDefinedBy <https://w3id.org/pmd/co> ;
                      rdfs:label "Tensile Test in accordance with ISO 6892-1"@en ,
                                 "Zugversuch nach ISO 6892-1"@de .
"""

g.parse(data=ttl_data_1, format='ttl')
g.parse(data=ttl_data_2, format='ttl')

# Helper functions to extract details
def get_restriction_details(node):
    details = {}
    for p, o in g.predicate_objects(node):
        if p == OWL.onProperty:
            details['onProperty'] = get_readable_label(o)
        elif p == OWL.someValuesFrom:
            if isinstance(o, URIRef):
                details['someValuesFrom'] = get_readable_label(o)
            elif isinstance(o, BNode):
                details['someValuesFrom'] = get_union_or_intersection(o)
    return details

def get_union_or_intersection(node):
    items = []
    for p, o in g.predicate_objects(node):
        if p in (OWL.unionOf, OWL.intersectionOf):
            for item in g.items(o):
                items.append(get_readable_label(item))
    return items

# Function to get a readable label for a URI or BNode
def get_readable_label(node):
    label = g.value(subject=node, predicate=RDFS.label)
    if label:
        return str(label)
    elif isinstance(node, URIRef):
        return str(node)
    else:
        return str(node)

# Extract information and save to a list of dictionaries
class_info_list = []

def extract_class_info(class_uri):
    class_info = {"Class": get_readable_label(class_uri)}
    
    for _, p, o in g.triples((class_uri, None, None)):
        if p == OWL.equivalentClass or p == RDFS.subClassOf:
            if isinstance(o, BNode):
                for _, p2, o2 in g.triples((o, None, None)):
                    if p2 == OWL.intersectionOf or p2 == OWL.unionOf:
                        members = [get_readable_label(member) for member in g.items(o2)]
                        class_info[p2.split('#')[-1]] = ', '.join(members)
                    elif p2 == RDF.type and o2 == OWL.Restriction:
                        restriction_details = get_restriction_details(o)
                        class_info['Restriction'] = ', '.join([f"{k}: {v}" for k, v in restriction_details.items()])
                        
                    if isinstance(o2, BNode):
                        restriction_details = get_restriction_details(o2)
                        class_info['Nested Restriction'] = ', '.join([f"{k}: {v}" for k, v in restriction_details.items()])
        else:
            class_info[p.split('#')[-1]] = get_readable_label(o)
    
    class_info_list.append(class_info)

# Query for all classes in the graph
for class_uri in g.subjects(RDF.type, OWL.Class):
    extract_class_info(class_uri)

# Determine fieldnames dynamically
all_keys = set()
for class_info in class_info_list:
    all_keys.update(class_info.keys())

fieldnames = list(all_keys)

# Write the extracted information to a CSV file
csv_file = "class_info.csv"

with open(csv_file, mode='w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)
    writer.writeheader()
    for class_info in class_info_list:
        writer.writerow(class_info)

print(f"Class information saved to {csv_file}")

Class information saved to class_info.csv
