In [59]:
### Starting v4

In [8]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        intersection = list(g.objects(node, OWL.intersectionOf))
        if intersection:
            components = []
            for item in g.items(intersection[0]):
                if isinstance(item, URIRef):
                    component_label = get_class_label(g, item)
                    if component_label:
                        components.append(component_label)
                elif isinstance(item, BNode):
                    restriction_label = get_complex_expression_label(g, item)
                    if restriction_label:
                        components.append(restriction_label)
            if components:
                return f"Intersection of {' and '.join(components)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

            

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written


while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)




print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            writer.writerow(row)

# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'Compression' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/nfdicore_2.ttl
Class 'AmperePerJoule' not found in the output.
Class 'nfdi' not found in the output.
Class 'stress' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/nfdicore_2.ttl
Class 'Advertiser+content_Article' not found in the output.
Class 'Tensiletest' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/nfdicore_2.ttl

Saving class hierarchy to CSV file:
http://materialsmine.org/ns/Compression Compression is subClassOf http://materialsmine.org/ns/ViscoelasticProperty ['../Ontologies/materialsmine_converted.ttl', 'http://materialsmine.org/ns/Compression', 'Compression', 'subClassOf', 'http://mater

Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


                  https://w3id.org/pmd/co/Object Object is subClassOf http://www.w3.org/ns/prov#Entity ['../Ontologies/materialsmine_converted.ttl', 'https://w3id.org/pmd/co/Object', 'Object', 'subClassOf', 'http://www.w3.org/ns/prov#Entity', '', '']  (from ../Ontologies/materialsmine_converted.ttl)
    http://semanticscience.org/resource/Quantity Amount is equivalentClass Complex class expression ['../Ontologies/materialsmine_converted.ttl', 'http://semanticscience.org/resource/Quantity', 'Amount', 'equivalentClass', 'Complex class expression', '', 'Complex class expression']  (from ../Ontologies/materialsmine_converted.ttl)
    http://semanticscience.org/resource/Quantity Amount is equivalentClass Complex class expression ['../Ontologies/materialsmine_converted.ttl', 'http://semanticscience.org/resource/Quantity', 'Amount', 'equivalentClass', 'Complex class expression', '', 'Complex class expression']  (from ../Ontologies/materialsmine_converted.ttl)
http://materialsmine.org/ns/Stres

Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


                    http://semanticscience.org/resource/Entity entity is subClassOf http://www.w3.org/2002/07/owl#Thing ['../Ontologies/materialsmine_converted.ttl', 'http://semanticscience.org/resource/Entity', 'entity', 'subClassOf', 'http://www.w3.org/2002/07/owl#Thing', '', '']  (from ../Ontologies/materialsmine_converted.ttl)
                  https://w3id.org/pmd/co/Object Object is subClassOf http://www.w3.org/ns/prov#Entity ['../Ontologies/materialsmine_converted.ttl', 'https://w3id.org/pmd/co/Object', 'Object', 'subClassOf', 'http://www.w3.org/ns/prov#Entity', '', '']  (from ../Ontologies/materialsmine_converted.ttl)
    http://semanticscience.org/resource/Quantity Amount is equivalentClass Complex class expression ['../Ontologies/materialsmine_converted.ttl', 'http://semanticscience.org/resource/Quantity', 'Amount', 'equivalentClass', 'Complex class expression', '', 'Complex class expression']  (from ../Ontologies/materialsmine_converted.ttl)
    http://semanticscience.org/re

Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


    https://w3id.org/pmd/co/AnalysingProcess Analyseprozess is subClassOf https://w3id.org/pmd/co/Process ['../Ontologies/materialsmine_converted.ttl', 'https://w3id.org/pmd/co/AnalysingProcess', 'Analyseprozess', 'subClassOf', 'https://w3id.org/pmd/co/Process', 'Process', 'A series of actions or operations conducing to an end\nIn PMD, a process is conducted via processing nodes and has a discernable duration as part of a workflow. A process consumes objects and parameters. A process potentially generates new objects and measurements. A process is either a transformative process or a non-transformative process with respect to objects processed via a processing node. There are primarily two types of distinguishable processes: manufacture process, analysis process. A process is a series of operations that are subordinate processes.'] Process (from ../Ontologies/materialsmine_converted.ttl)
      https://w3id.org/pmd/co/Process Process is subClassOf http://www.w3.org/ns/prov#Activity ['..

In [7]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        intersection = list(g.objects(node, OWL.intersectionOf))
        if intersection:
            components = []
            for item in g.items(intersection[0]):
                if isinstance(item, URIRef):
                    component_label = get_class_label(g, item)
                    if component_label:
                        components.append(component_label)
                elif isinstance(item, BNode):
                    restriction_label = get_complex_expression_label(g, item)
                    if restriction_label:
                        components.append(restriction_label)
            if components:
                return f"Intersection of {' and '.join(components)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

            

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written


while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)




print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            writer.writerow(row)

# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


In [2]:
# List of class names to check
initial_class_names_to_check = [ 'Compression','AmperePerJoule','nfdi','stress', 'Advertiser+content_Article', 'Tensiletest']
# initial_class_names_to_check = [ 'TermVariant']

In [1]:
# List of ontology files to process
ontology_files = [
    # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    "../Ontologies/pmdco_core.ttl",
    "../Ontologies/nfdicore_2.ttl",
    # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    # "../Ontologies/emmo.ttl",
    # "../Ontologies/owlapi.xrdf",
    # "../Ontologies/schemaorg.owl",
    # "../Ontologies/MaterialsMine.xrdf",
    # '../Ontologies/emmo.owl', ### has problem of reading file
    # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    # "../Ontologies/oboe.owl",
    "../Ontologies/fabio.ttl",
    # Add more file paths as needed
]

In [3]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

In [10]:
def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


In [11]:
# def get_complex_expression_label(g, node):
#     if (node, RDF.type, OWL.Restriction) in g:
#         prop = list(g.objects(node, OWL.onProperty))
#         val = list(g.objects(node, OWL.someValuesFrom))
#         if prop and val:
#             prop_label = get_class_label(g, prop[0])
#             val_label = get_class_label(g, val[0])
#             return f"Restriction on {prop_label} some {val_label}"
#     elif (node, RDF.type, OWL.Class) in g:
#         intersection = list(g.objects(node, OWL.intersectionOf))
#         if intersection:
#             components = []
#             for item in g.items(intersection[0]):
#                 if isinstance(item, URIRef):
#                     component_label = get_class_label(g, item)
#                     if component_label:
#                         components.append(component_label)
#                 elif isinstance(item, BNode):
#                     restriction_label = get_complex_expression_label(g, item)
#                     if restriction_label:
#                         components.append(restriction_label)
#             if components:
#                 return f"Intersection of {' and '.join(components)}"
#     return None

def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        equivalent_classes = list(g.objects(node, OWL.equivalentClass))
        if equivalent_classes:
            for equivalent_class in equivalent_classes:
                if isinstance(equivalent_class, BNode):
                    intersections = list(g.objects(equivalent_class, OWL.intersectionOf))
                    if intersections:
                        components = []
                        for intersection in g.items(intersections[0]):
                            if isinstance(intersection, URIRef):
                                component_label = get_class_label(g, intersection)
                                if component_label:
                                    components.append(component_label)
                            elif isinstance(intersection, BNode):
                                restriction_labels = []
                                for restriction_item in g.items(intersection):
                                    if isinstance(restriction_item, BNode):
                                        restriction_label = get_complex_expression_label(g, restriction_item)
                                        if restriction_label:
                                            restriction_labels.append(restriction_label)
                                if restriction_labels:
                                    components.append("Intersection of " + " and ".join(restriction_labels))
                        if components:
                            return f"Equivalent to {' and '.join(components)}"
                else:
                    return get_complex_expression_label(g, equivalent_class)
    return None


In [20]:
# def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
#     if file_path.endswith('.ttl'):
#         file_format = 'ttl'
#     elif file_path.endswith('.owl'):
#         file_format = 'xml'
#     elif file_path.endswith('.xrdf'):
#         file_format = 'xml'
#     else:
#         raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
#     g.parse(file_path, format=file_format)

#     normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

#     classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

#     data = []
#     relations = []
#     found_class_labels = set()

#     for cls in classes:
#         if isinstance(cls, URIRef):  # Check if the subject is a URI
#             labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
#             description = get_class_descriptions(g, cls)

#             for label in labels:
#                 if label is not None:
#                     normalized_label = normalize_string(label)
#                     found_class_labels.add(normalized_label)

#                     if normalized_label in normalized_class_names_to_check:
#                         # Check if already processed in this iteration
#                         if str(cls) not in processed_classes:
#                             processed_classes.add(str(cls))
#                             data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

#                         for obj in g.objects(cls, RDFS.subClassOf):
#                             if isinstance(obj, URIRef):  # Check if the object is a URI
#                                 obj_label = get_class_label(g, obj)
#                                 obj_description = get_class_descriptions(g, obj)

#                                 # Check if already processed in this iteration
#                                 if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
#                                     processed_relations.add((str(cls), 'subClassOf', str(obj)))
#                                     relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

#                         for obj in g.objects(cls, OWL.equivalentClass):
#                             if isinstance(obj, URIRef):
#                                 obj_label = get_class_label(g, obj)
#                                 obj_description = get_class_descriptions(g, obj)

#                                 # Check if already processed in this iteration
#                                 if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
#                                     processed_relations.add((str(cls), 'equivalentClass', str(obj)))
#                                     relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
#                             else:
#                                 # Handle blank nodes for equivalentClass
#                                 obj_label = get_complex_expression_label(g, obj)
#                                 obj_description = "Complex class expression"

#                                 if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
#                                     processed_relations.add((str(cls), 'equivalentClass', str(obj)))
#                                     relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

#                         for obj in g.objects(cls, DCTERMS.isPartOf):
#                             if isinstance(obj, URIRef):
#                                 obj_label = get_class_label(g, obj)
#                                 obj_description = get_class_descriptions(g, obj)

#                                 # Check if already processed in this iteration
#                                 if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
#                                     processed_relations.add((str(cls), 'isPartOf', str(obj)))
#                                     relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

#             # Check for owl:intersectionOf
#             intersections = list(g.objects(cls, OWL.intersectionOf))
#             for intersection in intersections:
#                 if isinstance(intersection, BNode):
#                     components = []
#                     for item in g.items(intersection):
#                         if isinstance(item, URIRef):
#                             component_label = get_class_label(g, item)
#                             if component_label:
#                                 components.append(component_label)
#                         elif isinstance(item, BNode):
#                             restriction_labels = []
#                             for restriction_item in g.items(item):
#                                 if isinstance(restriction_item, BNode):
#                                     restriction_label = get_complex_expression_label(g, restriction_item)
#                                     if restriction_label:
#                                         restriction_labels.append(restriction_label)
#                             if restriction_labels:
#                                 components.append("Intersection of " + " and ".join(restriction_labels))
                    
#                     if components:
#                         data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

#     return data, relations, found_class_labels


def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                                    # Handle owl:intersectionOf
                                    intersections = list(g.objects(obj, OWL.intersectionOf))
                                    if intersections:
                                        components = []
                                        for intersection in g.items(intersections[0]):
                                            if isinstance(intersection, URIRef):
                                                component_label = get_class_label(g, intersection)
                                                if component_label:
                                                    components.append(component_label)
                                            elif isinstance(intersection, BNode):
                                                restriction_labels = []
                                                for restriction_item in g.items(intersection):
                                                    if isinstance(restriction_item, BNode):
                                                        restriction_label = get_complex_expression_label(g, restriction_item)
                                                        if restriction_label:
                                                            restriction_labels.append(restriction_label)
                                                if restriction_labels:
                                                    components.append("Intersection of " + " and ".join(restriction_labels))

                                        if components:
                                            data.append([file_path, str(cls), "", f"Equivalent to {' and '.join(components)}"])

                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                                    # Handle owl:intersectionOf
                                    intersections = list(g.objects(obj, OWL.intersectionOf))
                                    if intersections:
                                        components = []
                                        for intersection in g.items(intersections[0]):
                                            if isinstance(intersection, URIRef):
                                                component_label = get_class_label(g, intersection)
                                                if component_label:
                                                    components.append(component_label)
                                            elif isinstance(intersection, BNode):
                                                restriction_labels = []
                                                for restriction_item in g.items(intersection):
                                                    if isinstance(restriction_item, BNode):
                                                        restriction_label = get_complex_expression_label(g, restriction_item)
                                                        if restriction_label:
                                                            restriction_labels.append(restriction_label)
                                                if restriction_labels:
                                                    components.append("Intersection of " + " and ".join(restriction_labels))

                                        if components:
                                            data.append([file_path, str(cls), "", f"Equivalent to {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

    return data, relations, found_class_labels


In [21]:
def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


In [22]:
def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

In [23]:
output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

In [24]:
while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

In [28]:


print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

# def save_intersection_info_to_csv(data, output_file):
#     with open(output_file, mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
#         for row in data:
#             writer.writerow(row)

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            if row[3].startswith("Intersection of"):
                writer.writerow(row)



# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'function' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/nfdicore_2.ttl
- ../Ontologies/fabio.ttl

Saving class hierarchy to CSV file:
http://semanticscience.org/resource/Function function is subClassOf http://semanticscience.org/resource/Capability capability (from ../Ontologies/materialsmine_converted.ttl)
  http://semanticscience.org/resource/Capability capability is subClassOf http://semanticscience.org/resource/RealizableEntity realizable entity (from ../Ontologies/materialsmine_converted.ttl)
    http://semanticscience.org/resource/RealizableEntity realizable entity is subClassOf http://semanticscience.org/resource/Attribute attribute (from ../Ontologies/materialsmine_converted.ttl)
      http://semanticscience.org/resource/Attribute attribute is subClass

In [26]:
# List of class names to check
# initial_class_names_to_check = [ 'Compression','AmperePerJoule','nfdi','stress', 'Advertiser+content_Article', 'Tensiletest']
initial_class_names_to_check = [ 'function']

In [27]:
# List of ontology files to process
ontology_files = [
    # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    "../Ontologies/pmdco_core.ttl",
    "../Ontologies/nfdicore_2.ttl",
    # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    # "../Ontologies/emmo.ttl",
    # "../Ontologies/owlapi.xrdf",
    # "../Ontologies/schemaorg.owl",
    # "../Ontologies/MaterialsMine.xrdf",
    # '../Ontologies/emmo.owl', ### has problem of reading file
    # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    # "../Ontologies/oboe.owl",
    "../Ontologies/fabio.ttl",
    # Add more file paths as needed
]

In [29]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        intersection = list(g.objects(node, OWL.intersectionOf))
        if intersection:
            components = []
            for item in g.items(intersection[0]):
                if isinstance(item, URIRef):
                    component_label = get_class_label(g, item)
                    if component_label:
                        components.append(component_label)
                elif isinstance(item, BNode):
                    restriction_label = get_complex_expression_label(g, item)
                    if restriction_label:
                        components.append(restriction_label)
            if components:
                return f"Intersection of {' and '.join(components)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

            

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written


while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)




print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            writer.writerow(row)

# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'function' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/nfdicore_2.ttl
- ../Ontologies/fabio.ttl

Saving class hierarchy to CSV file:
http://semanticscience.org/resource/Function function is subClassOf http://semanticscience.org/resource/Capability ['../Ontologies/materialsmine_converted.ttl', 'http://semanticscience.org/resource/Function', 'function', 'subClassOf', 'http://semanticscience.org/resource/Capability', 'capability', 'A capability is a realizable entity whose basis lies in one or more parts or qualities and reflects possibility of an entity to behave in a specified way under certain conditions or in response to a certain stimulus (trigger).'] capability (from ../Ontologies/materialsmine_converted.ttl)
  http://semanticscience.org/resource/Capabilit

In [6]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode

# Define namespaces and helper functions here (same as in your original code)...

def process_ontology(ontology_files, initial_class_names_to_check, output_hierarchy_file, class_output_file, relations_output_file):
    

    directory = '.'
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            os.remove(os.path.join(directory, filename))


    # Define namespaces (assuming these are already defined in your script)
    ex = Namespace("http://example.org/ontology/")
    sio = Namespace("http://semanticscience.org/resource/")
    skos = Namespace("http://www.w3.org/2004/02/skos/core#")
    owl = Namespace("http://www.w3.org/2002/07/owl#")
    rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    materialsmine = Namespace("http://materialsmine.org/ns/")
    bibo = Namespace("http://purl.org/ontology/bibo/")
    rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
    xml = Namespace("http://www.w3.org/XML/1998/namespace")
    foaf = Namespace("http://xmlns.com/foaf/0.1/")
    dcterms = Namespace("http://purl.org/dc/terms/")
    isPartOf = dcterms.isPartOf
    DCTERMS = Namespace("http://purl.org/dc/terms/")

    def normalize_string(s):
        s = s.lower()
        s = re.sub(r'[_\-+\s]+', '', s)
        s = s.replace('...', '')
        return s

    def get_class_label(g, cls):
        labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
        return labels[0] if labels else None

    def get_class_descriptions(g, cls):
        descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
        return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


    def get_complex_expression_label(g, node):
        if (node, RDF.type, OWL.Restriction) in g:
            prop = list(g.objects(node, OWL.onProperty))
            val = list(g.objects(node, OWL.someValuesFrom))
            if prop and val:
                prop_label = get_class_label(g, prop[0])
                val_label = get_class_label(g, val[0])
                return f"Restriction on {prop_label} some {val_label}"
        elif (node, RDF.type, OWL.Class) in g:
            intersection = list(g.objects(node, OWL.intersectionOf))
            if intersection:
                components = []
                for item in g.items(intersection[0]):
                    if isinstance(item, URIRef):
                        component_label = get_class_label(g, item)
                        if component_label:
                            components.append(component_label)
                    elif isinstance(item, BNode):
                        restriction_label = get_complex_expression_label(g, item)
                        if restriction_label:
                            components.append(restriction_label)
                if components:
                    return f"Intersection of {' and '.join(components)}"
        return None

    def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
        if file_path.endswith('.ttl'):
            file_format = 'ttl'
        elif file_path.endswith('.owl'):
            file_format = 'xml'
        elif file_path.endswith('.xrdf'):
            file_format = 'xml'
        else:
            raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
        
        g.parse(file_path, format=file_format)

        normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

        classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

        data = []
        relations = []
        found_class_labels = set()

        for cls in classes:
            if isinstance(cls, URIRef):  # Check if the subject is a URI
                labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
                description = get_class_descriptions(g, cls)

                for label in labels:
                    if label is not None:
                        normalized_label = normalize_string(label)
                        found_class_labels.add(normalized_label)

                        if normalized_label in normalized_class_names_to_check:
                            # Check if already processed in this iteration
                            if str(cls) not in processed_classes:
                                processed_classes.add(str(cls))
                                data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                            for obj in g.objects(cls, RDFS.subClassOf):
                                if isinstance(obj, URIRef):  # Check if the object is a URI
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                            for obj in g.objects(cls, OWL.equivalentClass):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                        # Check for owl:intersectionOf
                                        intersections = list(g.objects(cls, OWL.intersectionOf))
                                        for intersection in intersections:
                                            if isinstance(intersection, BNode):
                                                components = []
                                                for item in g.items(intersection):
                                                    if isinstance(item, URIRef):
                                                        component_label = get_class_label(g, item)
                                                        if component_label:
                                                            components.append(component_label)
                                                    elif isinstance(item, BNode):
                                                        restriction_labels = []
                                                        for restriction_item in g.items(item):
                                                            if isinstance(restriction_item, BNode):
                                                                restriction_label = get_complex_expression_label(g, restriction_item)
                                                                if restriction_label:
                                                                    restriction_labels.append(restriction_label)
                                                        if restriction_labels:
                                                            components.append("Intersection of " + " and ".join(restriction_labels))
                                                
                                                if components:
                                                    data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                                else:
                                    # Handle blank nodes for equivalentClass
                                    obj_label = get_complex_expression_label(g, obj)
                                    obj_description = "Complex class expression"

                                    if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                        # Check for owl:intersectionOf
                                        intersections = list(g.objects(cls, OWL.intersectionOf))
                                        for intersection in intersections:
                                            if isinstance(intersection, BNode):
                                                components = []
                                                for item in g.items(intersection):
                                                    if isinstance(item, URIRef):
                                                        component_label = get_class_label(g, item)
                                                        if component_label:
                                                            components.append(component_label)
                                                    elif isinstance(item, BNode):
                                                        restriction_labels = []
                                                        for restriction_item in g.items(item):
                                                            if isinstance(restriction_item, BNode):
                                                                restriction_label = get_complex_expression_label(g, restriction_item)
                                                                if restriction_label:
                                                                    restriction_labels.append(restriction_label)
                                                        if restriction_labels:
                                                            components.append("Intersection of " + " and ".join(restriction_labels))
                                                
                                                if components:
                                                    data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                            for obj in g.objects(cls, DCTERMS.isPartOf):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                

        return data, relations, found_class_labels


    def filter_relations(all_relations, initial_class_names_to_check):
        normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
        return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


    def print_hierarchy(class_name, relations, g, writer):
        def recursive_print(class_name, depth=0):
            for relation in relations:
                if normalize_string(relation[2]) == normalize_string(class_name):
                    subject_description = get_class_descriptions(g, URIRef(relation[1]))
                    object_description = get_class_descriptions(g, URIRef(relation[4]))
                    writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                    indent = '  ' * depth
                    print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                    recursive_print(relation[5], depth + 1)

        recursive_print(class_name)


    output_hierarchy_file = "class_hierarchy.csv"
    class_output_file = "ontology_classes.csv"
    relations_output_file = "ontology_relations.csv"

    all_data = []
    all_relations = []
    all_found_class_labels = set()

    class_names_to_check = initial_class_names_to_check

    max_iterations = 10
    iteration_count = 0
    g = Graph()
    processed_classes = set()
    processed_relations = set()
    last_class_name_written = None  # Track the last class name written


    while class_names_to_check and iteration_count < max_iterations:
        iteration_count += 1
        new_data = []
        new_relations = []
        new_found_class_labels = set()

        for ontology_file in ontology_files:
            file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
            new_data.extend(file_data)
            new_relations.extend(file_relations)
            new_found_class_labels.update(found_class_labels)

        all_data.extend(new_data)
        all_relations.extend(new_relations)
        all_found_class_labels.update(new_found_class_labels)

        class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
        class_names_to_check = [normalize_string(name) for name in class_names_to_check]

        # Filter and save class data
        filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
        with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if filtered_data:
                current_class_name = filtered_data[0][2]  # Get the class name from the first row
                if current_class_name != last_class_name_written:
                    # writer.writerow(['------'])  # Write separator
                    last_class_name_written = current_class_name
            writer.writerows(filtered_data)

        # Filter and save class relations
        filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
        with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if filtered_relations:
                current_class_name = filtered_relations[0][2]  # Get the class name from the first row
                if current_class_name != last_class_name_written:
                    # writer.writerow(['------'])  # Write separator
                    last_class_name_written = current_class_name
            writer.writerows(filtered_relations)




    print(f"Filtered class data has been saved to {class_output_file}")
    print(f"Filtered class relations have been saved to {relations_output_file}")

    print("\nInitial class names found in the output:")
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        found = False
        for label in all_found_class_labels:
            if normalized_class_name in label:
                found = True
                print(f"Class '{class_name}' found in:")
                for ontology_file in ontology_files:
                    file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                    normalized_labels = [normalize_string(l) for l in found_class_labels]
                    if normalized_class_name in normalized_labels:
                        print(f"- {ontology_file}")
                break
        if not found:
            print(f"Class '{class_name}' not found in the output.")

    print("\nSaving class hierarchy to CSV file:")
    with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
        for class_name in initial_class_names_to_check:
            normalized_class_name = normalize_string(class_name)
            print_hierarchy(normalized_class_name, all_relations, g, writer)

    print(f"Class hierarchy has been saved to {output_hierarchy_file}")

    def save_intersection_info_to_csv(data, output_file):
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
            for row in data:
                writer.writerow(row)

    # Inside your main loop where you process ontology files:
    intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
    save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

In [7]:
# List of ontology files to process
ontology_files = [
    # # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    # "../Ontologies/pmdco_core.ttl",
    # "../Ontologies/nfdicore_2.ttl",
    # # # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    # "../Ontologies/emmo.ttl",
    # # # "../Ontologies/owlapi.xrdf",
    # # "../Ontologies/schemaorg.owl",
    # # # "../Ontologies/MaterialsMine.xrdf",
    # # # '../Ontologies/emmo.owl', ### has problem of reading file
    # # # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # # # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    # # # "../Ontologies/oboe.owl",
    # "../Ontologies/fabio.ttl",
    # "../Ontologies/MatWerk.xrdf",
    # "../Ontologies/Materials_Data_Science.xrdf",
    # "../Ontologies/Materials_Data_Science.ttl",
    # "../Ontologies/ncit.owl",
    # Add more file paths as needed
]

In [8]:
# List of class names to check
# initial_class_names_to_check = [ 'Compression','AmperePerJoule','nfdi','stress', 'Advertiser+content_Article', 'Tensiletest']
initial_class_names_to_check = [ 'drug']

#### Define the output files
output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

In [9]:
process_ontology(ontology_files, initial_class_names_to_check, output_hierarchy_file, class_output_file, relations_output_file)

Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'drug' found in:


Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


- ../Ontologies/materialsmine_converted.ttl

Saving class hierarchy to CSV file:
http://semanticscience.org/resource/Drug drug is subClassOf http://semanticscience.org/resource/ChemicalSubstance ['../Ontologies/materialsmine_converted.ttl', 'http://semanticscience.org/resource/Drug', 'drug', 'subClassOf', 'http://semanticscience.org/resource/ChemicalSubstance', 'chemical substance', 'A chemical substance is a chemical entity composed of two or more weakly (non-covalently) interacting chemical entities.'] chemical substance (from ../Ontologies/materialsmine_converted.ttl)
  http://semanticscience.org/resource/ChemicalSubstance chemical substance is subClassOf http://semanticscience.org/resource/ChemicalEntity ['../Ontologies/materialsmine_converted.ttl', 'http://semanticscience.org/resource/ChemicalSubstance', 'chemical substance', 'subClassOf', 'http://semanticscience.org/resource/ChemicalEntity', 'chemical entity', 'A chemical entity is a material entity that pertains to chemistry.'] 