In [None]:
### Starting v4

In [None]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        intersection = list(g.objects(node, OWL.intersectionOf))
        if intersection:
            components = []
            for item in g.items(intersection[0]):
                if isinstance(item, URIRef):
                    component_label = get_class_label(g, item)
                    if component_label:
                        components.append(component_label)
                elif isinstance(item, BNode):
                    restriction_label = get_complex_expression_label(g, item)
                    if restriction_label:
                        components.append(restriction_label)
            if components:
                return f"Intersection of {' and '.join(components)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

            

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written


while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)




print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            writer.writerow(row)

# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

In [None]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        intersection = list(g.objects(node, OWL.intersectionOf))
        if intersection:
            components = []
            for item in g.items(intersection[0]):
                if isinstance(item, URIRef):
                    component_label = get_class_label(g, item)
                    if component_label:
                        components.append(component_label)
                elif isinstance(item, BNode):
                    restriction_label = get_complex_expression_label(g, item)
                    if restriction_label:
                        components.append(restriction_label)
            if components:
                return f"Intersection of {' and '.join(components)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

            

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written


while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)




print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            writer.writerow(row)

# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

In [None]:
# List of class names to check
initial_class_names_to_check = [ 'Compression','AmperePerJoule','nfdi','stress', 'Advertiser+content_Article', 'Tensiletest']
# initial_class_names_to_check = [ 'TermVariant']

In [None]:
# List of ontology files to process
ontology_files = [
    # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    "../Ontologies/pmdco_core.ttl",
    "../Ontologies/nfdicore_2.ttl",
    # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    # "../Ontologies/emmo.ttl",
    # "../Ontologies/owlapi.xrdf",
    # "../Ontologies/schemaorg.owl",
    # "../Ontologies/MaterialsMine.xrdf",
    # '../Ontologies/emmo.owl', ### has problem of reading file
    # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    # "../Ontologies/oboe.owl",
    "../Ontologies/fabio.ttl",
    # Add more file paths as needed
]

In [None]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

In [None]:
def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


In [None]:
# def get_complex_expression_label(g, node):
#     if (node, RDF.type, OWL.Restriction) in g:
#         prop = list(g.objects(node, OWL.onProperty))
#         val = list(g.objects(node, OWL.someValuesFrom))
#         if prop and val:
#             prop_label = get_class_label(g, prop[0])
#             val_label = get_class_label(g, val[0])
#             return f"Restriction on {prop_label} some {val_label}"
#     elif (node, RDF.type, OWL.Class) in g:
#         intersection = list(g.objects(node, OWL.intersectionOf))
#         if intersection:
#             components = []
#             for item in g.items(intersection[0]):
#                 if isinstance(item, URIRef):
#                     component_label = get_class_label(g, item)
#                     if component_label:
#                         components.append(component_label)
#                 elif isinstance(item, BNode):
#                     restriction_label = get_complex_expression_label(g, item)
#                     if restriction_label:
#                         components.append(restriction_label)
#             if components:
#                 return f"Intersection of {' and '.join(components)}"
#     return None

def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        equivalent_classes = list(g.objects(node, OWL.equivalentClass))
        if equivalent_classes:
            for equivalent_class in equivalent_classes:
                if isinstance(equivalent_class, BNode):
                    intersections = list(g.objects(equivalent_class, OWL.intersectionOf))
                    if intersections:
                        components = []
                        for intersection in g.items(intersections[0]):
                            if isinstance(intersection, URIRef):
                                component_label = get_class_label(g, intersection)
                                if component_label:
                                    components.append(component_label)
                            elif isinstance(intersection, BNode):
                                restriction_labels = []
                                for restriction_item in g.items(intersection):
                                    if isinstance(restriction_item, BNode):
                                        restriction_label = get_complex_expression_label(g, restriction_item)
                                        if restriction_label:
                                            restriction_labels.append(restriction_label)
                                if restriction_labels:
                                    components.append("Intersection of " + " and ".join(restriction_labels))
                        if components:
                            return f"Equivalent to {' and '.join(components)}"
                else:
                    return get_complex_expression_label(g, equivalent_class)
    return None


In [None]:
# def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
#     if file_path.endswith('.ttl'):
#         file_format = 'ttl'
#     elif file_path.endswith('.owl'):
#         file_format = 'xml'
#     elif file_path.endswith('.xrdf'):
#         file_format = 'xml'
#     else:
#         raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
#     g.parse(file_path, format=file_format)

#     normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

#     classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

#     data = []
#     relations = []
#     found_class_labels = set()

#     for cls in classes:
#         if isinstance(cls, URIRef):  # Check if the subject is a URI
#             labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
#             description = get_class_descriptions(g, cls)

#             for label in labels:
#                 if label is not None:
#                     normalized_label = normalize_string(label)
#                     found_class_labels.add(normalized_label)

#                     if normalized_label in normalized_class_names_to_check:
#                         # Check if already processed in this iteration
#                         if str(cls) not in processed_classes:
#                             processed_classes.add(str(cls))
#                             data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

#                         for obj in g.objects(cls, RDFS.subClassOf):
#                             if isinstance(obj, URIRef):  # Check if the object is a URI
#                                 obj_label = get_class_label(g, obj)
#                                 obj_description = get_class_descriptions(g, obj)

#                                 # Check if already processed in this iteration
#                                 if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
#                                     processed_relations.add((str(cls), 'subClassOf', str(obj)))
#                                     relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

#                         for obj in g.objects(cls, OWL.equivalentClass):
#                             if isinstance(obj, URIRef):
#                                 obj_label = get_class_label(g, obj)
#                                 obj_description = get_class_descriptions(g, obj)

#                                 # Check if already processed in this iteration
#                                 if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
#                                     processed_relations.add((str(cls), 'equivalentClass', str(obj)))
#                                     relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
#                             else:
#                                 # Handle blank nodes for equivalentClass
#                                 obj_label = get_complex_expression_label(g, obj)
#                                 obj_description = "Complex class expression"

#                                 if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
#                                     processed_relations.add((str(cls), 'equivalentClass', str(obj)))
#                                     relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

#                         for obj in g.objects(cls, DCTERMS.isPartOf):
#                             if isinstance(obj, URIRef):
#                                 obj_label = get_class_label(g, obj)
#                                 obj_description = get_class_descriptions(g, obj)

#                                 # Check if already processed in this iteration
#                                 if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
#                                     processed_relations.add((str(cls), 'isPartOf', str(obj)))
#                                     relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

#             # Check for owl:intersectionOf
#             intersections = list(g.objects(cls, OWL.intersectionOf))
#             for intersection in intersections:
#                 if isinstance(intersection, BNode):
#                     components = []
#                     for item in g.items(intersection):
#                         if isinstance(item, URIRef):
#                             component_label = get_class_label(g, item)
#                             if component_label:
#                                 components.append(component_label)
#                         elif isinstance(item, BNode):
#                             restriction_labels = []
#                             for restriction_item in g.items(item):
#                                 if isinstance(restriction_item, BNode):
#                                     restriction_label = get_complex_expression_label(g, restriction_item)
#                                     if restriction_label:
#                                         restriction_labels.append(restriction_label)
#                             if restriction_labels:
#                                 components.append("Intersection of " + " and ".join(restriction_labels))
                    
#                     if components:
#                         data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

#     return data, relations, found_class_labels


def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                                    # Handle owl:intersectionOf
                                    intersections = list(g.objects(obj, OWL.intersectionOf))
                                    if intersections:
                                        components = []
                                        for intersection in g.items(intersections[0]):
                                            if isinstance(intersection, URIRef):
                                                component_label = get_class_label(g, intersection)
                                                if component_label:
                                                    components.append(component_label)
                                            elif isinstance(intersection, BNode):
                                                restriction_labels = []
                                                for restriction_item in g.items(intersection):
                                                    if isinstance(restriction_item, BNode):
                                                        restriction_label = get_complex_expression_label(g, restriction_item)
                                                        if restriction_label:
                                                            restriction_labels.append(restriction_label)
                                                if restriction_labels:
                                                    components.append("Intersection of " + " and ".join(restriction_labels))

                                        if components:
                                            data.append([file_path, str(cls), "", f"Equivalent to {' and '.join(components)}"])

                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                                    # Handle owl:intersectionOf
                                    intersections = list(g.objects(obj, OWL.intersectionOf))
                                    if intersections:
                                        components = []
                                        for intersection in g.items(intersections[0]):
                                            if isinstance(intersection, URIRef):
                                                component_label = get_class_label(g, intersection)
                                                if component_label:
                                                    components.append(component_label)
                                            elif isinstance(intersection, BNode):
                                                restriction_labels = []
                                                for restriction_item in g.items(intersection):
                                                    if isinstance(restriction_item, BNode):
                                                        restriction_label = get_complex_expression_label(g, restriction_item)
                                                        if restriction_label:
                                                            restriction_labels.append(restriction_label)
                                                if restriction_labels:
                                                    components.append("Intersection of " + " and ".join(restriction_labels))

                                        if components:
                                            data.append([file_path, str(cls), "", f"Equivalent to {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

    return data, relations, found_class_labels


In [None]:
def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


In [None]:
def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)

In [None]:
output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written

In [None]:
while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)

In [None]:


print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

# def save_intersection_info_to_csv(data, output_file):
#     with open(output_file, mode='w', newline='', encoding='utf-8') as file:
#         writer = csv.writer(file)
#         writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
#         for row in data:
#             writer.writerow(row)

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            if row[3].startswith("Intersection of"):
                writer.writerow(row)



# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

In [1]:
# List of class names to check
# initial_class_names_to_check = [ 'Compression','AmperePerJoule','nfdi','stress', 'Advertiser+content_Article', 'Tensiletest']
initial_class_names_to_check = [ 'Strength']

In [2]:
# List of ontology files to process
ontology_files = [
    # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    "../Ontologies/pmdco_core.ttl",
    "../Ontologies/nfdicore_2.ttl",
    # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    # "../Ontologies/emmo.ttl",
    # "../Ontologies/owlapi.xrdf",
    # "../Ontologies/schemaorg.owl",
    # "../Ontologies/MaterialsMine.xrdf",
    # '../Ontologies/emmo.owl', ### has problem of reading file
    # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    "../Ontologies/oboe.owl",
    "../Ontologies/fabio.ttl",
    # Add more file paths as needed
]

In [None]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        intersection = list(g.objects(node, OWL.intersectionOf))
        if intersection:
            components = []
            for item in g.items(intersection[0]):
                if isinstance(item, URIRef):
                    component_label = get_class_label(g, item)
                    if component_label:
                        components.append(component_label)
                elif isinstance(item, BNode):
                    restriction_label = get_complex_expression_label(g, item)
                    if restriction_label:
                        components.append(restriction_label)
            if components:
                return f"Intersection of {' and '.join(components)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

            

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written


while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)




print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            writer.writerow(row)

# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

In [3]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode

# Define namespaces and helper functions here (same as in your original code)...

def process_ontology(ontology_files, initial_class_names_to_check, output_hierarchy_file, class_output_file, relations_output_file):
    

    directory = '.'
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            os.remove(os.path.join(directory, filename))


    # Define namespaces (assuming these are already defined in your script)
    ex = Namespace("http://example.org/ontology/")
    sio = Namespace("http://semanticscience.org/resource/")
    skos = Namespace("http://www.w3.org/2004/02/skos/core#")
    owl = Namespace("http://www.w3.org/2002/07/owl#")
    rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    materialsmine = Namespace("http://materialsmine.org/ns/")
    bibo = Namespace("http://purl.org/ontology/bibo/")
    rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
    xml = Namespace("http://www.w3.org/XML/1998/namespace")
    foaf = Namespace("http://xmlns.com/foaf/0.1/")
    dcterms = Namespace("http://purl.org/dc/terms/")
    isPartOf = dcterms.isPartOf
    DCTERMS = Namespace("http://purl.org/dc/terms/")

    def normalize_string(s):
        s = s.lower()
        s = re.sub(r'[_\-+\s]+', '', s)
        s = s.replace('...', '')
        return s

    def get_class_label(g, cls):
        labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
        return labels[0] if labels else None

    def get_class_descriptions(g, cls):
        descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
        return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


    def get_complex_expression_label(g, node):
        if (node, RDF.type, OWL.Restriction) in g:
            prop = list(g.objects(node, OWL.onProperty))
            val = list(g.objects(node, OWL.someValuesFrom))
            if prop and val:
                prop_label = get_class_label(g, prop[0])
                val_label = get_class_label(g, val[0])
                return f"Restriction on {prop_label} some {val_label}"
        elif (node, RDF.type, OWL.Class) in g:
            intersection = list(g.objects(node, OWL.intersectionOf))
            if intersection:
                components = []
                for item in g.items(intersection[0]):
                    if isinstance(item, URIRef):
                        component_label = get_class_label(g, item)
                        if component_label:
                            components.append(component_label)
                    elif isinstance(item, BNode):
                        restriction_label = get_complex_expression_label(g, item)
                        if restriction_label:
                            components.append(restriction_label)
                if components:
                    return f"Intersection of {' and '.join(components)}"
        return None

    def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
        if file_path.endswith('.ttl'):
            file_format = 'ttl'
        elif file_path.endswith('.owl'):
            file_format = 'xml'
        elif file_path.endswith('.xrdf'):
            file_format = 'xml'
        else:
            raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
        
        g.parse(file_path, format=file_format)

        normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

        classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

        data = []
        relations = []
        found_class_labels = set()

        for cls in classes:
            if isinstance(cls, URIRef):  # Check if the subject is a URI
                labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
                description = get_class_descriptions(g, cls)

                for label in labels:
                    if label is not None:
                        normalized_label = normalize_string(label)
                        found_class_labels.add(normalized_label)

                        if normalized_label in normalized_class_names_to_check:
                            # Check if already processed in this iteration
                            if str(cls) not in processed_classes:
                                processed_classes.add(str(cls))
                                data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                            for obj in g.objects(cls, RDFS.subClassOf):
                                if isinstance(obj, URIRef):  # Check if the object is a URI
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                            for obj in g.objects(cls, OWL.equivalentClass):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                        # Check for owl:intersectionOf
                                        intersections = list(g.objects(cls, OWL.intersectionOf))
                                        for intersection in intersections:
                                            if isinstance(intersection, BNode):
                                                components = []
                                                for item in g.items(intersection):
                                                    if isinstance(item, URIRef):
                                                        component_label = get_class_label(g, item)
                                                        if component_label:
                                                            components.append(component_label)
                                                    elif isinstance(item, BNode):
                                                        restriction_labels = []
                                                        for restriction_item in g.items(item):
                                                            if isinstance(restriction_item, BNode):
                                                                restriction_label = get_complex_expression_label(g, restriction_item)
                                                                if restriction_label:
                                                                    restriction_labels.append(restriction_label)
                                                        if restriction_labels:
                                                            components.append("Intersection of " + " and ".join(restriction_labels))
                                                
                                                if components:
                                                    data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                                else:
                                    # Handle blank nodes for equivalentClass
                                    obj_label = get_complex_expression_label(g, obj)
                                    obj_description = "Complex class expression"

                                    if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                        # Check for owl:intersectionOf
                                        intersections = list(g.objects(cls, OWL.intersectionOf))
                                        for intersection in intersections:
                                            if isinstance(intersection, BNode):
                                                components = []
                                                for item in g.items(intersection):
                                                    if isinstance(item, URIRef):
                                                        component_label = get_class_label(g, item)
                                                        if component_label:
                                                            components.append(component_label)
                                                    elif isinstance(item, BNode):
                                                        restriction_labels = []
                                                        for restriction_item in g.items(item):
                                                            if isinstance(restriction_item, BNode):
                                                                restriction_label = get_complex_expression_label(g, restriction_item)
                                                                if restriction_label:
                                                                    restriction_labels.append(restriction_label)
                                                        if restriction_labels:
                                                            components.append("Intersection of " + " and ".join(restriction_labels))
                                                
                                                if components:
                                                    data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                            for obj in g.objects(cls, DCTERMS.isPartOf):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                

        return data, relations, found_class_labels


    def filter_relations(all_relations, initial_class_names_to_check):
        normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
        return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


    def print_hierarchy(class_name, relations, g, writer):
        def recursive_print(class_name, depth=0):
            for relation in relations:
                if normalize_string(relation[2]) == normalize_string(class_name):
                    subject_description = get_class_descriptions(g, URIRef(relation[1]))
                    object_description = get_class_descriptions(g, URIRef(relation[4]))
                    writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                    indent = '  ' * depth
                    print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                    recursive_print(relation[5], depth + 1)

        recursive_print(class_name)


    output_hierarchy_file = "class_hierarchy.csv"
    class_output_file = "ontology_classes.csv"
    relations_output_file = "ontology_relations.csv"

    all_data = []
    all_relations = []
    all_found_class_labels = set()

    class_names_to_check = initial_class_names_to_check

    max_iterations = 2
    iteration_count = 0
    g = Graph()
    processed_classes = set()
    processed_relations = set()
    last_class_name_written = None  # Track the last class name written


    while class_names_to_check and iteration_count < max_iterations:
        iteration_count += 1
        new_data = []
        new_relations = []
        new_found_class_labels = set()

        for ontology_file in ontology_files:
            file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
            new_data.extend(file_data)
            new_relations.extend(file_relations)
            new_found_class_labels.update(found_class_labels)

        all_data.extend(new_data)
        all_relations.extend(new_relations)
        all_found_class_labels.update(new_found_class_labels)

        class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
        class_names_to_check = [normalize_string(name) for name in class_names_to_check]

        # Filter and save class data
        filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
        with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if filtered_data:
                current_class_name = filtered_data[0][2]  # Get the class name from the first row
                if current_class_name != last_class_name_written:
                    # writer.writerow(['------'])  # Write separator
                    last_class_name_written = current_class_name
            writer.writerows(filtered_data)

        # Filter and save class relations
        filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
        with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if filtered_relations:
                current_class_name = filtered_relations[0][2]  # Get the class name from the first row
                if current_class_name != last_class_name_written:
                    # writer.writerow(['------'])  # Write separator
                    last_class_name_written = current_class_name
            writer.writerows(filtered_relations)




    print(f"Filtered class data has been saved to {class_output_file}")
    print(f"Filtered class relations have been saved to {relations_output_file}")

    print("\nInitial class names found in the output:")
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        found = False
        for label in all_found_class_labels:
            if normalized_class_name in label:
                found = True
                print(f"Class '{class_name}' found in:")
                for ontology_file in ontology_files:
                    file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                    normalized_labels = [normalize_string(l) for l in found_class_labels]
                    if normalized_class_name in normalized_labels:
                        print(f"- {ontology_file}")
                break
        if not found:
            print(f"Class '{class_name}' not found in the output.")

    print("\nSaving class hierarchy to CSV file:")
    with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
        for class_name in initial_class_names_to_check:
            normalized_class_name = normalize_string(class_name)
            print_hierarchy(normalized_class_name, all_relations, g, writer)

    print(f"Class hierarchy has been saved to {output_hierarchy_file}")

    def save_intersection_info_to_csv(data, output_file):
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
            for row in data:
                writer.writerow(row)

    # Inside your main loop where you process ontology files:
    intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
    save_intersection_info_to_csv(intersection_data, "intersection_info.csv")

In [None]:
# List of ontology files to process
ontology_files = [
    # # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    # "../Ontologies/pmdco_core.ttl",
    # "../Ontologies/nfdicore_2.ttl",
    # # # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    # "../Ontologies/emmo.ttl",
    # # # "../Ontologies/owlapi.xrdf",
    # # "../Ontologies/schemaorg.owl",
    # # # "../Ontologies/MaterialsMine.xrdf",
    # # # '../Ontologies/emmo.owl', ### has problem of reading file
    # # # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # # # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    # # # "../Ontologies/oboe.owl",
    # "../Ontologies/fabio.ttl",
    # "../Ontologies/MatWerk.xrdf",
    # "../Ontologies/Materials_Data_Science.xrdf",
    # "../Ontologies/Materials_Data_Science.ttl",
    # "../Ontologies/ncit.owl",
    # Add more file paths as needed
]

In [6]:
# List of class names to check
# initial_class_names_to_check = [ 'Compression','AmperePerJoule','nfdi','stress', 'Advertiser+content_Article', 'Tensiletest']
initial_class_names_to_check = [ 'stiffness']
# initial_class_names_to_check = [
#     "Material",
#     "Self-Healing Material",
#     # "Polymer",
#     "Elastomer",
#     "Metal",
#     "Ceramic Matrix Composite",
#     "Cementitious Material",
#     "Repair Mechanism",
#     "Self-Sealing Phase",
#     "Self-Healing Phase",
#     "Mechanical Functionality",
#     "Mechanical Properties",
#     "Stiffness",
#     "Strength",
#     "Stimulus",
#     "Adhesive",
#     # "Repair Agent",
#     # "Particle",
#     # "Bacterial Spore",
#     # "Chemical Reaction",
#     # "Biomimetic Material System",
#     # "Self-Sealing Principle",
#     # "Self-Healing Principle",
#     # "Plant",
#     # "External Wound",
#     # "Internal Incision",
#     # "Delosperma Cooperi",
#     # "Leaf",
#     # "Numerical Model",
#     # "Analytical Model",
#     # "Polymer",
#     # "Self-Healing Polymer",
#     # "Actuator",
#     # "Multilayer Actuator",
#     # "Mechanical Metamaterial",
#     # "Unit Cell",
#     # "Crack",
#     # "Damage",
#     # "Stress",
#     # "Strain",
#     # "Fluidic Feature",
#     # "Porosity",
#     # "Permeability",
#     # "Mechanical Property",
#     # "Geometrical Parameter",
#     # "Nonlinear Finite Element Analysis"
# ]

#### Define the output files
output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

In [7]:
process_ontology(ontology_files, initial_class_names_to_check, output_hierarchy_file, class_output_file, relations_output_file)

Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'stiffness' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/nfdicore_2.ttl
- ../Ontologies/oboe.owl


Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


- ../Ontologies/fabio.ttl

Saving class hierarchy to CSV file:
https://w3id.org/pmd/co/Stiffness Stiffness is subClassOf https://w3id.org/pmd/co/ValueObject ['../Ontologies/pmdco_core.ttl', 'https://w3id.org/pmd/co/Stiffness', 'Stiffness', 'subClassOf', 'https://w3id.org/pmd/co/ValueObject', 'Value Object', 'A :ValueObject is a simple entity which represents a specific value. This value can be a numerical, textual, or a more complex data structure. If a literal value is to be specified, the :value datatype property has to be used. In cases where the value is represented by a resource (e.g. URI), the :resource object property has to be used.\n\nA value object, respectively its value, is always associated with an entity of type :Process, :ProcessingNode, or :Object (e.g. :Specimen). The value is meant to be a charactaristic of the associated entity. To express this association it is indended to use the :participant object property.\n\nA value object might also refer to a certain unit. Th

In [4]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode

def process_ontology(ontology_files, initial_class_names_to_check, output_hierarchy_file, class_output_file, relations_output_file):
    
    # Clear any existing CSV files
    directory = '.'
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            os.remove(os.path.join(directory, filename))

    # Define namespaces
    ex = Namespace("http://example.org/ontology/")
    sio = Namespace("http://semanticscience.org/resource/")
    skos = Namespace("http://www.w3.org/2004/02/skos/core#")
    owl = Namespace("http://www.w3.org/2002/07/owl#")
    rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
    materialsmine = Namespace("http://materialsmine.org/ns/")
    bibo = Namespace("http://purl.org/ontology/bibo/")
    rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
    xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
    xml = Namespace("http://www.w3.org/XML/1998/namespace")
    foaf = Namespace("http://xmlns.com/foaf/0.1/")
    dcterms = Namespace("http://purl.org/dc/terms/")
    isPartOf = dcterms.isPartOf
    DCTERMS = Namespace("http://purl.org/dc/terms/")

    def normalize_string(s):
        s = s.lower()
        s = re.sub(r'[_\-+\s]+', '', s)
        s = s.replace('...', '')
        return s

    def get_class_label(g, cls):
        labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
        return labels[0] if labels else None

    def get_class_descriptions(g, cls):
        descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
        return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None

    def get_complex_expression_label(g, node):
        if (node, RDF.type, OWL.Restriction) in g:
            prop = list(g.objects(node, OWL.onProperty))
            val = list(g.objects(node, OWL.someValuesFrom))
            if prop and val:
                prop_label = get_class_label(g, prop[0])
                val_label = get_class_label(g, val[0])
                return f"Restriction on {prop_label} some {val_label}"
        elif (node, RDF.type, OWL.Class) in g:
            intersection = list(g.objects(node, OWL.intersectionOf))
            if intersection:
                components = []
                for item in g.items(intersection[0]):
                    if isinstance(item, URIRef):
                        component_label = get_class_label(g, item)
                        if component_label:
                            components.append(component_label)
                    elif isinstance(item, BNode):
                        restriction_label = get_complex_expression_label(g, item)
                        if restriction_label:
                            components.append(restriction_label)
                if components:
                    return f"Intersection of {' and '.join(components)}"
        return None

    def load_and_collect_classes_and_relations(g, class_names_to_check):
        normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

        classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

        data = []
        relations = []
        found_class_labels = set()

        for cls in classes:
            if isinstance(cls, URIRef):
                labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
                description = get_class_descriptions(g, cls)

                for label in labels:
                    if label is not None:
                        normalized_label = normalize_string(label)
                        found_class_labels.add(normalized_label)

                        if normalized_label in normalized_class_names_to_check:
                            if str(cls) not in processed_classes:
                                processed_classes.add(str(cls))
                                data.append([str(cls), str(label), str(description) if description is not None else ""])

                            for obj in g.objects(cls, RDFS.subClassOf):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                        relations.append([str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                            for obj in g.objects(cls, OWL.equivalentClass):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                        relations.append([str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                else:
                                    obj_label = get_complex_expression_label(g, obj)
                                    obj_description = "Complex class expression"

                                    if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                        relations.append([str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                            for obj in g.objects(cls, DCTERMS.isPartOf):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                        relations.append([str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

        return data, relations, found_class_labels

    def filter_relations(all_relations, initial_class_names_to_check):
        normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
        return [relation for relation in all_relations if normalize_string(relation[1]) in normalized_initial_class_names or normalize_string(relation[4]) in normalized_initial_class_names]

    def print_hierarchy(class_name, relations, g, writer):
        def recursive_print(class_name, depth=0):
            for relation in relations:
                if normalize_string(relation[1]) == normalize_string(class_name):
                    subject_description = get_class_descriptions(g, URIRef(relation[0]))
                    object_description = get_class_descriptions(g, URIRef(relation[3]))
                    writer.writerow([relation[0], relation[1], subject_description if subject_description is not None else "", relation[2], relation[3], relation[4], object_description if object_description is not None else "", relation[5]])
                    indent = '  ' * depth
                    print(f"{indent}{relation[0]} {relation[1]} is {relation[2]} {relation[3]} {relation[4]} (from {relation[5]})")
                    recursive_print(relation[4], depth + 1)

        recursive_print(class_name)

    # Load the RDF graph once
    g = Graph()
    for ontology_file in ontology_files:
        if ontology_file.endswith('.ttl'):
            file_format = 'ttl'
        elif ontology_file.endswith('.owl'):
            file_format = 'xml'
        elif ontology_file.endswith('.xrdf'):
            file_format = 'xml'
        else:
            raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
        g.parse(ontology_file, format=file_format)

    all_data = []
    all_relations = []
    all_found_class_labels = set()

    class_names_to_check = initial_class_names_to_check

    max_iterations = 2
    iteration_count = 0
    processed_classes = set()
    processed_relations = set()
    last_class_name_written = None  # Track the last class name written

    while class_names_to_check and iteration_count < max_iterations:
        iteration_count += 1
        new_data, new_relations, new_found_class_labels = load_and_collect_classes_and_relations(g, class_names_to_check)

        all_data.extend(new_data)
        all_relations.extend(new_relations)
        all_found_class_labels.update(new_found_class_labels)

        class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
        class_names_to_check = [normalize_string(name) for name in class_names_to_check]

        # Filter and save class data
        filtered_data = [row for row in new_data if normalize_string(row[1]) in {normalize_string(name) for name in initial_class_names_to_check}]
        with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if filtered_data:
                current_class_name = filtered_data[0][1]  # Get the class name from the first row
                if current_class_name != last_class_name_written:
                    last_class_name_written = current_class_name
            writer.writerows(filtered_data)

        # Filter and save class relations
        filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
        with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if filtered_relations:
                current_class_name = filtered_relations[0][1]  # Get the class name from the first row
                if current_class_name != last_class_name_written:
                    last_class_name_written = current_class_name
            writer.writerows(filtered_relations)

    print(f"Filtered class data has been saved to {class_output_file}")
    print(f"Filtered class relations have been saved to {relations_output_file}")

    print("\nInitial class names found in the output:")
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        found = False
        for label in all_found_class_labels:
            if normalized_class_name in label:
                found = True
                print(f"Class '{class_name}' found in:")
                break
        if not found:
            print(f"Class '{class_name}' not found in the output.")

    print("\nSaving class hierarchy to CSV file:")
    with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
        for class_name in initial_class_names_to_check:
            normalized_class_name = normalize_string(class_name)
            print_hierarchy(normalized_class_name, all_relations, g, writer)

    print(f"Class hierarchy has been saved to {output_hierarchy_file}")

    def save_intersection_info_to_csv(data, output_file):
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
            for row in data:
                writer.writerow(row)

    # Save intersection information
    intersection_data = [row for row in all_data if row[2].startswith("Intersection of")]
    save_intersection_info_to_csv(intersection_data, "intersection_info.csv")


In [5]:
# List of ontology files to process
ontology_files = [
    # # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    "../Ontologies/pmdco_core.ttl",
    "../Ontologies/nfdicore_2.ttl",
    # # # # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    "../Ontologies/emmo.ttl",
    # # # # "../Ontologies/owlapi.xrdf",
    # # # "../Ontologies/schemaorg.owl",
    # # # # "../Ontologies/MaterialsMine.xrdf",
    # # # # '../Ontologies/emmo.owl', ### has problem of reading file
    # # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # # # # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    # # # # "../Ontologies/oboe.owl",
    "../Ontologies/fabio.ttl",
    # "../Ontologies/MatWerk.xrdf",
    # "../Ontologies/Materials_Data_Science.xrdf",
    # "../Ontologies/Materials_Data_Science.ttl",
    # "../Ontologies/ncit.owl",
    # Add more file paths as needed
]

In [6]:
# List of class names to check
# initial_class_names_to_check = [ 'Compression','AmperePerJoule','nfdi','stress', 'Advertiser+content_Article', 'Tensiletest']
# initial_class_names_to_check = [ 'stress']

initial_class_names_to_check = [
    "Material",
    "Self-Healing Material",
    # "Polymer",
    # "Elastomer",
    "Metal",
    "Ceramic Matrix Composite",
    "Cementitious Material",
    "Repair Mechanism",
    "Self-Sealing Phase",
    "Self-Healing Phase",
    "Mechanical Functionality",
    "Mechanical Properties",
    "Stiffness",
    "Strength",
    "Stimulus",
    "Adhesive",
    "Repair Agent",
    "Particle",
    "Bacterial Spore",
    "Chemical Reaction",
    "Biomimetic Material System",
    "Self-Sealing Principle",
    "Self-Healing Principle",
    "Plant",
    "External Wound",
    "Internal Incision",
    "Delosperma Cooperi",
    "Leaf",
    "Numerical Model",
    "Analytical Model",
    # "Self-Healing Polymer",
    "Actuator",
    "Multilayer Actuator",
    "Mechanical Metamaterial",
    "Unit Cell",
    "Crack",
    "Damage",
    "Stress",
    "Strain",
    "Fluidic Feature",
    "Porosity",
    "Permeability",
    "Mechanical Property",
    "Geometrical Parameter",
    "Nonlinear Finite Element Analysis"
]


#### Define the output files
output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

In [7]:
process_ontology(ontology_files, initial_class_names_to_check, output_hierarchy_file, class_output_file, relations_output_file)


Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'Material' found in:
Class 'Self-Healing Material' not found in the output.
Class 'Metal' found in:
Class 'Ceramic Matrix Composite' not found in the output.
Class 'Cementitious Material' not found in the output.
Class 'Repair Mechanism' not found in the output.
Class 'Self-Sealing Phase' not found in the output.
Class 'Self-Healing Phase' not found in the output.
Class 'Mechanical Functionality' not found in the output.
Class 'Mechanical Properties' not found in the output.
Class 'Stiffness' found in:
Class 'Strength' found in:
Class 'Stimulus' not found in the output.
Class 'Adhesive' not found in the output.
Class 'Repair Agent' not found in the output.
Class 'Particle' found in:
Class 'Bacterial Spore' not found in the output.
Class 'Chemical Reaction' found in:
Class 'Biomimetic Material System' not found in t

Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


https://w3id.org/pmd/co/Stiffness Stiffness is subClassOf https://w3id.org/pmd/co/ValueObject Value Object (from A :ValueObject is a simple entity which represents a specific value. This value can be a numerical, textual, or a more complex data structure. If a literal value is to be specified, the :value datatype property has to be used. In cases where the value is represented by a resource (e.g. URI), the :resource object property has to be used.

A value object, respectively its value, is always associated with an entity of type :Process, :ProcessingNode, or :Object (e.g. :Specimen). The value is meant to be a charactaristic of the associated entity. To express this association it is indended to use the :participant object property.

A value object might also refer to a certain unit. The :unit property might be used (e.g. with QUDT ontology).

Instances of a value object might be specified as a specific Parameter, namely a SetPoint (nominal value), or Measurement. With :Setpoint the 

Complex class expression does not look like a valid URI, trying to serialize this will break.


  https://w3id.org/emmo#EMMO_6f5af708_f825_4feb_a0d1_a8d813d3022b Object is equivalentClass Complex class expression  (from Complex class expression)
  https://w3id.org/pmd/co/Object Object is subClassOf http://www.w3.org/ns/prov#Entity  (from )
  http://semanticscience.org/resource/Object object is subClassOf http://semanticscience.org/resource/Entity entity (from Every thing is an entity.)
    http://semanticscience.org/resource/Entity entity is subClassOf http://www.w3.org/2002/07/owl#Thing  (from )
http://semanticscience.org/resource/ChemicalReaction chemical reaction is subClassOf http://semanticscience.org/resource/ChemicalInteraction chemical interaction (from A chemical interaction is a biochemical process in which chemical entities interact through some set of attractive forces.)
  http://semanticscience.org/resource/ChemicalInteraction chemical interaction is subClassOf http://semanticscience.org/resource/Interacting interacting (from interacting is a process characterized by

Complex class expression does not look like a valid URI, trying to serialize this will break.


http://materialsmine.org/ns/Stress Stress is subClassOf http://materialsmine.org/ns/MechanicalProperty Mechanical Property (from A materials property related to the response of a material under some external applied load.)
  http://materialsmine.org/ns/MechanicalProperty Mechanical Property is subClassOf http://semanticscience.org/resource/Quantity Amount (from A quantity is an informational entity that gives the magnitude of a property.)
    http://semanticscience.org/resource/Quantity Amount is subClassOf http://semanticscience.org/resource/MeasurementValue measurement value (from A measurement value is a quantitative description that reflects the magnitude of some attribute.)
      http://semanticscience.org/resource/MeasurementValue measurement value is subClassOf http://semanticscience.org/resource/Number number (from A number is a tensor of rank 0.)
        http://semanticscience.org/resource/Number number is subClassOf http://semanticscience.org/resource/Scalar scalar (from a sc

Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


    http://semanticscience.org/resource/Quantity Amount is equivalentClass Complex class expression  (from Complex class expression)
https://w3id.org/emmo#EMMO_d1917609_db5e_4b8a_9b76_ef1d6f860a81 Stress is subClassOf https://w3id.org/emmo#EMMO_2946d40b_24a1_47fa_8176_e3f79bb45064 ISQDerivedQuantity (from )
  https://w3id.org/emmo#EMMO_2946d40b_24a1_47fa_8176_e3f79bb45064 ISQDerivedQuantity is subClassOf https://w3id.org/emmo#EMMO_71f6ab56_342c_484b_bbe0_de86b7367cb3 DerivedQuantity (from )
  https://w3id.org/emmo#EMMO_2946d40b_24a1_47fa_8176_e3f79bb45064 ISQDerivedQuantity is subClassOf https://w3id.org/emmo#EMMO_f35cff4d_dc09_44cf_a729_22fb79e3bfb2 InternationalSystemOfQuantity (from )
    https://w3id.org/emmo#EMMO_f35cff4d_dc09_44cf_a729_22fb79e3bfb2 InternationalSystemOfQuantity is subClassOf https://w3id.org/emmo#EMMO_9c407ac0_fd4c_4178_8763_95fad9fe29ec StandardizedPhysicalQuantity (from )
      https://w3id.org/emmo#EMMO_9c407ac0_fd4c_4178_8763_95fad9fe29ec StandardizedPhysical

Complex class expression does not look like a valid URI, trying to serialize this will break.


http://materialsmine.org/ns/Strain Strain is subClassOf http://materialsmine.org/ns/MechanicalProperty Mechanical Property (from A materials property related to the response of a material under some external applied load.)
  http://materialsmine.org/ns/MechanicalProperty Mechanical Property is subClassOf http://semanticscience.org/resource/Quantity Amount (from A quantity is an informational entity that gives the magnitude of a property.)
    http://semanticscience.org/resource/Quantity Amount is subClassOf http://semanticscience.org/resource/MeasurementValue measurement value (from A measurement value is a quantitative description that reflects the magnitude of some attribute.)
      http://semanticscience.org/resource/MeasurementValue measurement value is subClassOf http://semanticscience.org/resource/Number number (from A number is a tensor of rank 0.)
        http://semanticscience.org/resource/Number number is subClassOf http://semanticscience.org/resource/Scalar scalar (from a sc

Complex class expression does not look like a valid URI, trying to serialize this will break.


    http://semanticscience.org/resource/Quantity Amount is equivalentClass Complex class expression  (from Complex class expression)
https://w3id.org/emmo#EMMO_acf636d4_9ac2_4ce3_960a_d54338e6cae3 Strain is subClassOf https://w3id.org/emmo#EMMO_faab3f84_e475_4a46_af9c_7d249f0b9aef RatioQuantity (from Quantities defined as ratios `Q=A/B` having equal dimensions in numerator and denominator are dimensionless quantities but still have a physical dimension defined as dim(A)/dim(B).

Johansson, Ingvar (2010). "Metrological thinking needs the notions of parametric quantities, units and dimensions". Metrologia. 47 (3): 219–230. doi:10.1088/0026-1394/47/3/012. ISSN 0026-1394.)
  https://w3id.org/emmo#EMMO_faab3f84_e475_4a46_af9c_7d249f0b9aef RatioQuantity is subClassOf https://w3id.org/emmo#EMMO_a66427d1_9932_4363_9ec5_7d91f2bfda1e ISQDimensionlessQuantity (from )
    https://w3id.org/emmo#EMMO_a66427d1_9932_4363_9ec5_7d91f2bfda1e ISQDimensionlessQuantity is subClassOf https://w3id.org/emmo#EM

Complex class expression does not look like a valid URI, trying to serialize this will break.


          http://semanticscience.org/resource/ChemicalEntity chemical entity is subClassOf http://semanticscience.org/resource/MaterialEntity Material Entity (from A material entity is a physical entity that is spatially extended, exists as a whole at any point in time and has mass. A material entity is a physical entity that is spatially extended, exists as a whole at any point in time and has mass.)
            http://semanticscience.org/resource/MaterialEntity Material Entity is subClassOf http://semanticscience.org/resource/Object object (from An object is an entity that is wholly identifiable at any instant of time during which it exists.)
              https://w3id.org/emmo#EMMO_6f5af708_f825_4feb_a0d1_a8d813d3022b Object is equivalentClass Complex class expression  (from Complex class expression)
              https://w3id.org/pmd/co/Object Object is subClassOf http://www.w3.org/ns/prov#Entity  (from )
              http://semanticscience.org/resource/Object object is subClassOf

Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


        http://semanticscience.org/resource/ChemicalSubstance chemical substance is equivalentClass Complex class expression  (from Complex class expression)
        https://w3id.org/emmo#EMMO_df96cbb6_b5ee_4222_8eab_b3675df24bea ChemicalSubstance is subClassOf https://w3id.org/emmo#EMMO_bc37743c_37c4_4ec7_9d58_d1aae5567352 Substance (from )
          https://w3id.org/emmo#EMMO_bc37743c_37c4_4ec7_9d58_d1aae5567352 Substance is subClassOf https://w3id.org/emmo#EMMO_57d977ab_0036_4779_b59a_e47620afdb9c CompositePhysicalObject (from )
          https://w3id.org/emmo#EMMO_bc37743c_37c4_4ec7_9d58_d1aae5567352 Substance is subClassOf https://w3id.org/emmo#EMMO_5b2222df_4da6_442f_8244_96e9e45887d1 PhysicalSubstance (from )
            https://w3id.org/emmo#EMMO_5b2222df_4da6_442f_8244_96e9e45887d1 PhysicalSubstance is subClassOf https://w3id.org/emmo#EMMO_38b579de_4331_40e0_803d_09efa298e726 PhysicalObject (from )
        https://w3id.org/emmo#EMMO_3397f270_dfc1_4500_8f6f_4d0d85ac5f71 Chemica

Complex class expression does not look like a valid URI, trying to serialize this will break.


          https://w3id.org/emmo#EMMO_47338839_6cca_4a8e_b565_3c4d5517e2c0 ChemicalEntity is subClassOf https://w3id.org/emmo#EMMO_5b2222df_4da6_442f_8244_96e9e45887d1 PhysicalSubstance (from )
            https://w3id.org/emmo#EMMO_5b2222df_4da6_442f_8244_96e9e45887d1 PhysicalSubstance is subClassOf https://w3id.org/emmo#EMMO_38b579de_4331_40e0_803d_09efa298e726 PhysicalObject (from )
          https://w3id.org/emmo#EMMO_21205421_5783_4d3e_81e5_10c5d894a88a ChemicalEntity is subClassOf https://w3id.org/emmo#EMMO_5b2222df_4da6_442f_8244_96e9e45887d1 PhysicalSubstance (from )
            https://w3id.org/emmo#EMMO_5b2222df_4da6_442f_8244_96e9e45887d1 PhysicalSubstance is subClassOf https://w3id.org/emmo#EMMO_38b579de_4331_40e0_803d_09efa298e726 PhysicalObject (from )
          https://w3id.org/emmo#EMMO_21205421_5783_4d3e_81e5_10c5d894a88a ChemicalEntity is subClassOf https://w3id.org/emmo#EMMO_8b1367d6_0133_4b56_acc1_fa8b058169e3 CompositePhysicalParticle (from )
            https://w3i

Complex class expression does not look like a valid URI, trying to serialize this will break.
Complex class expression does not look like a valid URI, trying to serialize this will break.


          http://semanticscience.org/resource/Tensor tensor is subClassOf http://semanticscience.org/resource/MathematicalEntity mathematical entity (from A mathematical entity is an information content entity that are components of a mathematical system or can be defined in mathematical terms.)
            http://semanticscience.org/resource/MathematicalEntity mathematical entity is subClassOf http://semanticscience.org/resource/InformationContentEntity information content entity (from information content entity is an object that requires some background knowledge or procedure to correctly interpret.)
              http://semanticscience.org/resource/InformationContentEntity information content entity is subClassOf http://semanticscience.org/resource/Object object (from An object is an entity that is wholly identifiable at any instant of time during which it exists.)
                https://w3id.org/emmo#EMMO_6f5af708_f825_4feb_a0d1_a8d813d3022b Object is equivalentClass Complex class

In [None]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Define namespaces (assuming these are already defined in your script)
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

def get_class_label(g, cls):
    labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
    return labels[0] if labels else None

def get_class_descriptions(g, cls):
    descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
    return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None


def get_complex_expression_label(g, node):
    if (node, RDF.type, OWL.Restriction) in g:
        prop = list(g.objects(node, OWL.onProperty))
        val = list(g.objects(node, OWL.someValuesFrom))
        if prop and val:
            prop_label = get_class_label(g, prop[0])
            val_label = get_class_label(g, val[0])
            return f"Restriction on {prop_label} some {val_label}"
    elif (node, RDF.type, OWL.Class) in g:
        intersection = list(g.objects(node, OWL.intersectionOf))
        if intersection:
            components = []
            for item in g.items(intersection[0]):
                if isinstance(item, URIRef):
                    component_label = get_class_label(g, item)
                    if component_label:
                        components.append(component_label)
                elif isinstance(item, BNode):
                    restriction_label = get_complex_expression_label(g, item)
                    if restriction_label:
                        components.append(restriction_label)
            if components:
                return f"Intersection of {' and '.join(components)}"
    return None

def load_and_collect_classes_and_relations(file_path, class_names_to_check, g, processed_classes, processed_relations):
    if file_path.endswith('.ttl'):
        file_format = 'ttl'
    elif file_path.endswith('.owl'):
        file_format = 'xml'
    elif file_path.endswith('.xrdf'):
        file_format = 'xml'
    else:
        raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
    
    g.parse(file_path, format=file_format)

    normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

    classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

    data = []
    relations = []
    found_class_labels = set()

    for cls in classes:
        if isinstance(cls, URIRef):  # Check if the subject is a URI
            labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
            description = get_class_descriptions(g, cls)

            for label in labels:
                if label is not None:
                    normalized_label = normalize_string(label)
                    found_class_labels.add(normalized_label)

                    if normalized_label in normalized_class_names_to_check:
                        # Check if already processed in this iteration
                        if str(cls) not in processed_classes:
                            processed_classes.add(str(cls))
                            data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                        for obj in g.objects(cls, RDFS.subClassOf):
                            if isinstance(obj, URIRef):  # Check if the object is a URI
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                        for obj in g.objects(cls, OWL.equivalentClass):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])                           
                            else:
                                # Handle blank nodes for equivalentClass
                                obj_label = get_complex_expression_label(g, obj)
                                obj_description = "Complex class expression"

                                if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                    # Check for owl:intersectionOf
                                    intersections = list(g.objects(cls, OWL.intersectionOf))
                                    for intersection in intersections:
                                        if isinstance(intersection, BNode):
                                            components = []
                                            for item in g.items(intersection):
                                                if isinstance(item, URIRef):
                                                    component_label = get_class_label(g, item)
                                                    if component_label:
                                                        components.append(component_label)
                                                elif isinstance(item, BNode):
                                                    restriction_labels = []
                                                    for restriction_item in g.items(item):
                                                        if isinstance(restriction_item, BNode):
                                                            restriction_label = get_complex_expression_label(g, restriction_item)
                                                            if restriction_label:
                                                                restriction_labels.append(restriction_label)
                                                    if restriction_labels:
                                                        components.append("Intersection of " + " and ".join(restriction_labels))
                                            
                                            if components:
                                                data.append([file_path, str(cls), "", f"Intersection of {' and '.join(components)}"])

                        for obj in g.objects(cls, DCTERMS.isPartOf):
                            if isinstance(obj, URIRef):
                                obj_label = get_class_label(g, obj)
                                obj_description = get_class_descriptions(g, obj)

                                # Check if already processed in this iteration
                                if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                    processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                    relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

            

    return data, relations, found_class_labels


def filter_relations(all_relations, initial_class_names_to_check):
    normalized_initial_class_names = {normalize_string(name) for name in initial_class_names_to_check}
    return [relation for relation in all_relations if normalize_string(relation[2]) in normalized_initial_class_names or normalize_string(relation[5]) in normalized_initial_class_names]


def print_hierarchy(class_name, relations, g, writer):
    def recursive_print(class_name, depth=0):
        for relation in relations:
            if normalize_string(relation[2]) == normalize_string(class_name):
                subject_description = get_class_descriptions(g, URIRef(relation[1]))
                object_description = get_class_descriptions(g, URIRef(relation[4]))
                writer.writerow([relation[1], relation[2], subject_description if subject_description is not None else "", relation[3], relation[4], relation[5], object_description if object_description is not None else "", relation[0]])
                indent = '  ' * depth
                print(f"{indent}{relation[1]} {relation[2]} is {relation[3]} {relation[4]} {relation} {relation[5]} (from {relation[0]})")
                recursive_print(relation[5], depth + 1)

    recursive_print(class_name)


output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"

all_data = []
all_relations = []
all_found_class_labels = set()

class_names_to_check = initial_class_names_to_check

max_iterations = 2
iteration_count = 0
g = Graph()
processed_classes = set()
processed_relations = set()
last_class_name_written = None  # Track the last class name written


while class_names_to_check and iteration_count < max_iterations:
    iteration_count += 1
    new_data = []
    new_relations = []
    new_found_class_labels = set()

    for ontology_file in ontology_files:
        file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g, processed_classes, processed_relations)
        new_data.extend(file_data)
        new_relations.extend(file_relations)
        new_found_class_labels.update(found_class_labels)

    all_data.extend(new_data)
    all_relations.extend(new_relations)
    all_found_class_labels.update(new_found_class_labels)

    class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
    class_names_to_check = [normalize_string(name) for name in class_names_to_check]

    # Filter and save class data
    filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
    with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_data:
            current_class_name = filtered_data[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_data)

    # Filter and save class relations
    filtered_relations = filter_relations(new_relations, initial_class_names_to_check)
    with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        if filtered_relations:
            current_class_name = filtered_relations[0][2]  # Get the class name from the first row
            if current_class_name != last_class_name_written:
                # writer.writerow(['------'])  # Write separator
                last_class_name_written = current_class_name
        writer.writerows(filtered_relations)




print(f"Filtered class data has been saved to {class_output_file}")
print(f"Filtered class relations have been saved to {relations_output_file}")

print("\nInitial class names found in the output:")
for class_name in initial_class_names_to_check:
    normalized_class_name = normalize_string(class_name)
    found = False
    for label in all_found_class_labels:
        if normalized_class_name in label:
            found = True
            print(f"Class '{class_name}' found in:")
            for ontology_file in ontology_files:
                file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g, processed_classes, processed_relations)
                normalized_labels = [normalize_string(l) for l in found_class_labels]
                if normalized_class_name in normalized_labels:
                    print(f"- {ontology_file}")
            break
    if not found:
        print(f"Class '{class_name}' not found in the output.")

print("\nSaving class hierarchy to CSV file:")
with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(["Class URI", "Class Name", "Class Description", "Relation Type", "Related Class URI", "Related Class Name", "Related Class Description", "File"])
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        print_hierarchy(normalized_class_name, all_relations, g, writer)

print(f"Class hierarchy has been saved to {output_hierarchy_file}")

def save_intersection_info_to_csv(data, output_file):
    with open(output_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
        for row in data:
            writer.writerow(row)

# Inside your main loop where you process ontology files:
intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
save_intersection_info_to_csv(intersection_data, "intersection_info.csv")