In [1]:
from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL, Namespace, Literal

# Define the Turtle data with necessary prefixes
turtle_data = """
@prefix sio: <http://semanticscience.org/resource/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix bio: <http://data.bioontology.org/metadata/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://semanticscience.org/resource/Drug> a owl:Class ;
    rdfs:subClassOf <http://semanticscience.org/resource/ChemicalSubstance>, [
        rdf:type owl:Restriction ;
        owl:onProperty <http://semanticscience.org/resource/hasCapability> ;
        owl:someValuesFrom [
            rdf:type owl:Class ;
            owl:intersectionOf (
                <http://semanticscience.org/resource/ToRegulate>
                [
                    rdf:type owl:Restriction ;
                    owl:onProperty <http://semanticscience.org/resource/inRelationTo> ;
                    owl:someValuesFrom <http://semanticscience.org/resource/BiologicalEntity> ;
                ]
            ) ;
        ] ;
    ] ;
    bio:prefixIRI "sio:Drug" ;
    dcterms:description "A drug is a chemical substance that contains one or more active ingredients that regulate one or more biological processes."@en ;
    rdfs:isDefinedBy <http://semanticscience.org/ontology/sio/v1.53/sio-subset-labels.owl> ;
    rdfs:label "drug"@en .
"""

# Create a Graph and parse the data
g = Graph()
g.parse(data=turtle_data, format="turtle")

# Define the necessary namespaces
SIO = Namespace("http://semanticscience.org/resource/")
DC = Namespace("http://purl.org/dc/terms/")
BIO = Namespace("http://data.bioontology.org/metadata/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind prefixes to the graph (optional but useful for serialization and querying)
g.bind("sio", SIO)
g.bind("dcterms", DC)
g.bind("bio", BIO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("rdf", RDF)

# Define the resource URI
drug_uri = URIRef("http://semanticscience.org/resource/Drug")

# Function to recursively print blank nodes
def print_blank_node(bnode):
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            print(f"Blank Node Predicate: {p}, Object is another Blank Node")
            print_blank_node(o)
        else:
            print(f"Blank Node Predicate: {p}, Object: {o}")

# Extract Class
for _, _, o in g.triples((drug_uri, RDF.type, None)):
    print(f"Class: {o}")

# Extract SubClass
for _, _, o in g.triples((drug_uri, RDFS.subClassOf, None)):
    if isinstance(o, BNode):
        print("SubClass is a blank node with the following properties:")
        print_blank_node(o)
    else:
        print(f"SubClass: {o}")

# Extract other information
for _, p, o in g.triples((drug_uri, None, None)):
    if p not in {RDF.type, RDFS.subClassOf}:
        print(f"Predicate: {p}, Object: {o}")



Class: http://www.w3.org/2002/07/owl#Class
SubClass: http://semanticscience.org/resource/ChemicalSubstance
SubClass is a blank node with the following properties:
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://www.w3.org/2002/07/owl#Restriction
Blank Node Predicate: http://www.w3.org/2002/07/owl#onProperty, Object: http://semanticscience.org/resource/hasCapability
Blank Node Predicate: http://www.w3.org/2002/07/owl#someValuesFrom, Object is another Blank Node
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://www.w3.org/2002/07/owl#Class
Blank Node Predicate: http://www.w3.org/2002/07/owl#intersectionOf, Object is another Blank Node
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#first, Object: http://semanticscience.org/resource/ToRegulate
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#rest, Object is another Blank Node
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syn

In [2]:
import csv
from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL, Namespace

# Define the Turtle data with necessary prefixes
turtle_data = """
@prefix sio: <http://semanticscience.org/resource/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix bio: <http://data.bioontology.org/metadata/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://semanticscience.org/resource/Drug> a owl:Class ;
    rdfs:subClassOf <http://semanticscience.org/resource/ChemicalSubstance>, [
        rdf:type owl:Restriction ;
        owl:onProperty <http://semanticscience.org/resource/hasCapability> ;
        owl:someValuesFrom [
            rdf:type owl:Class ;
            owl:intersectionOf (
                <http://semanticscience.org/resource/ToRegulate>
                [
                    rdf:type owl:Restriction ;
                    owl:onProperty <http://semanticscience.org/resource/inRelationTo> ;
                    owl:someValuesFrom <http://semanticscience.org/resource/BiologicalEntity> ;
                ]
            ) ;
        ] ;
    ] ;
    bio:prefixIRI "sio:Drug" ;
    dcterms:description "A drug is a chemical substance that contains one or more active ingredients that regulate one or more biological processes."@en ;
    rdfs:isDefinedBy <http://semanticscience.org/ontology/sio/v1.53/sio-subset-labels.owl> ;
    rdfs:label "drug"@en .
"""

# Create a Graph and parse the data
g = Graph()
g.parse(data=turtle_data, format="turtle")

# Define the necessary namespaces
SIO = Namespace("http://semanticscience.org/resource/")
DC = Namespace("http://purl.org/dc/terms/")
BIO = Namespace("http://data.bioontology.org/metadata/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind prefixes to the graph (optional but useful for serialization and querying)
g.bind("sio", SIO)
g.bind("dcterms", DC)
g.bind("bio", BIO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("rdf", RDF)

# Define the resource URI
drug_uri = URIRef("http://semanticscience.org/resource/Drug")

# Function to recursively extract blank node information
def extract_blank_node(bnode):
    blank_node_data = []
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            blank_node_data.append((p, "Blank Node"))
            blank_node_data.extend(extract_blank_node(o))
        else:
            blank_node_data.append((p, o))
    return blank_node_data

# Prepare data for CSV
csv_data = [["Subject", "Predicate", "Object"]]

# Extract Class
for _, _, o in g.triples((drug_uri, RDF.type, None)):
    csv_data.append([drug_uri, RDF.type, o])

# Extract SubClass
for _, _, o in g.triples((drug_uri, RDFS.subClassOf, None)):
    if isinstance(o, BNode):
        csv_data.append([drug_uri, RDFS.subClassOf, "Blank Node"])
        csv_data.extend(extract_blank_node(o))
    else:
        csv_data.append([drug_uri, RDFS.subClassOf, o])

# Extract other information
for _, p, o in g.triples((drug_uri, None, None)):
    if p not in {RDF.type, RDFS.subClassOf}:
        csv_data.append([drug_uri, p, o])

# Write data to CSV
with open("drug_info.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

print("Data saved to drug_info.csv")


Data saved to drug_info.csv


In [3]:
import csv
from rdflib import Graph, URIRef, BNode, RDF, RDFS, Namespace

# Define the Turtle data with necessary prefixes
turtle_data = """
@prefix sio: <http://semanticscience.org/resource/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix bio: <http://data.bioontology.org/metadata/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://semanticscience.org/resource/Drug> a owl:Class ;
    rdfs:subClassOf <http://semanticscience.org/resource/ChemicalSubstance>, [
        rdf:type owl:Restriction ;
        owl:onProperty <http://semanticscience.org/resource/hasCapability> ;
        owl:someValuesFrom [
            rdf:type owl:Class ;
            owl:intersectionOf (
                <http://semanticscience.org/resource/ToRegulate>
                [
                    rdf:type owl:Restriction ;
                    owl:onProperty <http://semanticscience.org/resource/inRelationTo> ;
                    owl:someValuesFrom <http://semanticscience.org/resource/BiologicalEntity> ;
                ]
            ) ;
        ] ;
    ] ;
    bio:prefixIRI "sio:Drug" ;
    dcterms:description "A drug is a chemical substance that contains one or more active ingredients that regulate one or more biological processes."@en ;
    rdfs:isDefinedBy <http://semanticscience.org/ontology/sio/v1.53/sio-subset-labels.owl> ;
    rdfs:label "drug"@en .
"""


# Create a Graph and parse the data
g = Graph()
g.parse(data=turtle_data, format="turtle")

# Define the necessary namespaces
SIO = Namespace("http://semanticscience.org/resource/")
DC = Namespace("http://purl.org/dc/terms/")
BIO = Namespace("http://data.bioontology.org/metadata/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind prefixes to the graph (optional but useful for serialization and querying)
g.bind("sio", SIO)
g.bind("dcterms", DC)
g.bind("bio", BIO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("rdf", RDF)

# Define the resource URI
drug_uri = URIRef("http://semanticscience.org/resource/Drug")

# Function to get label or URI parts
def get_label_and_uri(uri):
    qres = g.query(
        """
        SELECT ?label WHERE {
            ?uri rdfs:label ?label .
        }
        """,
        initBindings={'uri': uri}
    )
    for row in qres:
        return str(row.label), str(uri)
    return uri.split('/')[-1].split('#')[-1], str(uri)

# Function to recursively extract blank node information
def extract_blank_node(bnode):
    blank_node_data = []
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            pred_label, pred_uri = get_label_and_uri(p)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
            blank_node_data.extend(extract_blank_node(o))
        else:
            pred_label, pred_uri = get_label_and_uri(p)
            obj_label, obj_uri = get_label_and_uri(o)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
    return blank_node_data

# Prepare data for CSV
csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

# Extract Class
for _, _, o in g.triples((drug_uri, RDF.type, None)):
    subj_label, subj_uri = get_label_and_uri(drug_uri)
    obj_label, obj_uri = get_label_and_uri(o)
    csv_data.append([subj_label, subj_uri, "type", str(RDF.type), obj_label, obj_uri])

# Extract SubClass
for _, _, o in g.triples((drug_uri, RDFS.subClassOf, None)):
    subj_label, subj_uri = get_label_and_uri(drug_uri)
    pred_label, pred_uri = get_label_and_uri(RDFS.subClassOf)
    if isinstance(o, BNode):
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
        csv_data.extend(extract_blank_node(o))
    else:
        obj_label, obj_uri = get_label_and_uri(o)
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

# Extract other information
for _, p, o in g.triples((drug_uri, None, None)):
    if p not in {RDF.type, RDFS.subClassOf}:
        subj_label, subj_uri = get_label_and_uri(drug_uri)
        pred_label, pred_uri = get_label_and_uri(p)
        obj_label, obj_uri = get_label_and_uri(o)
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

# Write data to CSV
with open("drug_info.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

print("Data saved to drug_info.csv")


Data saved to drug_info.csv


In [4]:
import csv
from rdflib import Graph, URIRef, BNode, RDF, RDFS, Namespace

# Define the path to the Turtle file
turtle_file_path = "../Ontologies/materialsmine_converted.ttl"

# Create a Graph and parse the data from the file
g = Graph()
g.parse(turtle_file_path, format="turtle")

# Define the necessary namespaces
SIO = Namespace("http://semanticscience.org/resource/")
DC = Namespace("http://purl.org/dc/terms/")
BIO = Namespace("http://data.bioontology.org/metadata/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind prefixes to the graph (optional but useful for serialization and querying)
g.bind("sio", SIO)
g.bind("dcterms", DC)
g.bind("bio", BIO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("rdf", RDF)

# Function to get label or URI parts
def get_label_and_uri(uri):
    qres = g.query(
        """
        SELECT ?label WHERE {
            ?uri rdfs:label ?label .
        }
        """,
        initBindings={'uri': uri}
    )
    for row in qres:
        return str(row.label), str(uri)
    return uri.split('/')[-1].split('#')[-1], str(uri)

# Function to recursively extract blank node information
def extract_blank_node(bnode):
    blank_node_data = []
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            pred_label, pred_uri = get_label_and_uri(p)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
            blank_node_data.extend(extract_blank_node(o))
        else:
            pred_label, pred_uri = get_label_and_uri(p)
            obj_label, obj_uri = get_label_and_uri(o)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
    return blank_node_data

# Prepare data for CSV
csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

# Extract all triples
for subj, pred, obj in g:
    subj_label, subj_uri = get_label_and_uri(subj)
    pred_label, pred_uri = get_label_and_uri(pred)
    
    if isinstance(obj, BNode):
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
        csv_data.extend(extract_blank_node(obj))
    else:
        obj_label, obj_uri = get_label_and_uri(obj)
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

# Write data to CSV
with open("all_info.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

print("Data saved to all_info.csv")


In [None]:
#### Reading from the file ####

In [None]:
import csv
import os
import re
from rdflib import Graph, URIRef, BNode, RDF, RDFS, Namespace


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

class RDFGraphHandler:
    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        qres = self.graph.query(
            """
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initBindings={'uri': uri}
        )
        for row in qres:
            return str(row.label), str(uri)
        return uri.split('/')[-1].split('#')[-1], str(uri)

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)
            
            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    csv_data.extend(self.extract_blank_node(obj))
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(csv_data)

        print(f"Data saved to {output_csv_path}")

# Usage example
turtle_file_path = "../Ontologies/materialsmine_converted.ttl"
class_names = ["drug" , "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info.csv"

rdf_handler = RDFGraphHandler(turtle_file_path)
rdf_handler.save_to_csv(class_names, output_csv_path)


Data saved to filtered_info.csv


In [None]:
### Improving the speed of info capturing ####

In [None]:
import csv
import os
import re
from rdflib import Graph, URIRef, BNode, RDF, RDFS, Namespace


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

class RDFGraphHandler:
    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        qres = self.graph.query(
            """
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initBindings={'uri': uri}
        )
        for row in qres:
            return str(row.label), str(uri)
        return uri.split('/')[-1].split('#')[-1], str(uri)

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    
    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        label_cache = {}

        for subj, pred, obj in self.graph:
            subj_label, subj_uri = label_cache.get(subj, self.get_label_and_uri(subj))
            pred_label, pred_uri = label_cache.get(pred, self.get_label_and_uri(pred))
            normalized_subj_label = self.normalize_string(subj_label)
            
            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    csv_data.extend(self.extract_blank_node(obj))
                    # Set placeholders for obj_label and obj_uri when obj is a BNode
                    obj_label, obj_uri = "Blank Node", "Blank Node"
                else:
                    obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])
            else:
                # Handle the case where normalized_subj_label is not in normalized_class_names
                continue  # Or handle as needed

            # Ensure obj_label and obj_uri are always assigned outside of the conditional
            if not isinstance(obj, BNode):
                obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))


        with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(csv_data)

        print(f"Data saved to {output_csv_path}")



# Usage example
turtle_file_path = "../Ontologies/materialsmine_converted.ttl"
class_names = ["drug" , "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info.csv"

rdf_handler = RDFGraphHandler(turtle_file_path)
rdf_handler.save_to_csv(class_names, output_csv_path)


Data saved to filtered_info.csv


In [None]:
#### Implementing multiple input files

In [None]:
import csv
import os
import re
from rdflib import Graph, BNode, Namespace


class RDFGraphHandler:
    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        qres = self.graph.query(
            """
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initBindings={'uri': uri}
        )
        for row in qres:
            return str(row.label), str(uri)
        return uri.split('/')[-1].split('#')[-1], str(uri)

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    
    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        label_cache = {}

        for subj, pred, obj in self.graph:
            subj_label, subj_uri = label_cache.get(subj, self.get_label_and_uri(subj))
            pred_label, pred_uri = label_cache.get(pred, self.get_label_and_uri(pred))
            normalized_subj_label = self.normalize_string(subj_label)
            
            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    csv_data.extend(self.extract_blank_node(obj))
                    # Set placeholders for obj_label and obj_uri when obj is a BNode
                    obj_label, obj_uri = "Blank Node", "Blank Node"
                else:
                    obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

            # Ensure obj_label and obj_uri are always assigned outside of the conditional
            if not isinstance(obj, BNode):
                obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))

        with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(csv_data)

        print(f"Data saved to {output_csv_path}")


# Usage example
input_turtle_files = ["../Ontologies/materialsmine_converted.ttl"]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info_2.csv"

for turtle_file_path in input_turtle_files:
    rdf_handler = RDFGraphHandler(turtle_file_path)
    rdf_handler.save_to_csv(class_names, output_csv_path)


Data saved to filtered_info_2.csv


In [None]:
### Using multiprocessing ti speed up the process ###

In [None]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    directory = '.'
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            os.remove(os.path.join(directory, filename))


    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes):
        with multiprocessing.Pool() as pool:
            results = pool.map(self.extract_blank_node, bnodes)
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes)
        csv_data.extend(blank_node_data)

        with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(csv_data)

        print(f"Data saved to {output_csv_path}")

# Usage example
input_turtle_files = ["../Ontologies/materialsmine_converted.ttl", "../Ontologies/emmo.ttl"]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info.csv"

for turtle_file_path in input_turtle_files:
    rdf_handler = RDFGraphHandler(turtle_file_path)
    rdf_handler.save_to_csv(class_names, output_csv_path)


Data saved to filtered_info_5.csv
Data saved to filtered_info_5.csv


In [None]:
#### The above code work much faster! however when two or more inputs are given then for each inputs the outputs in CSV are removed! ###


In [None]:
### Here I try to solve the above issue ###

In [None]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes):
        with multiprocessing.Pool() as pool:
            results = pool.map(self.extract_blank_node, bnodes)
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def extract_data(self, class_names):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes)
        csv_data.extend(blank_node_data)
        
        return csv_data

# Usage example
input_turtle_files = ["../Ontologies/emmo.ttl" , "../Ontologies/materialsmine_converted.ttl"]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info.csv"

# Remove existing CSV file if it exists
directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Initialize CSV data with headers
all_csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

for turtle_file_path in input_turtle_files:
    rdf_handler = RDFGraphHandler(turtle_file_path)
    file_csv_data = rdf_handler.extract_data(class_names)
    all_csv_data.extend(file_csv_data)

# Write accumulated CSV data to file
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(all_csv_data)

print(f"Data saved to {output_csv_path}")



In [None]:
#### Above code down not accept OWL file; Let's fix it ####

In [None]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    def __init__(self, file_path):
        self.file_path = file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        file_format = 'turtle' if self.file_path.endswith('.ttl') else 'xml'
        self.graph.parse(self.file_path, format=file_format)

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes):
        with multiprocessing.Pool() as pool:
            results = pool.map(self.extract_blank_node, bnodes)
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def extract_data(self, class_names):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes)
        csv_data.extend(blank_node_data)
        
        return csv_data

# Usage example
input_files = [
    "../Ontologies/MatWerk.xrdf", 
    "../Ontologies/materialsmine_converted.ttl" , 
    "../Ontologies/emmo.ttl",
    # "../Ontologies/schemaorg.owl"
    ]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest', 'Electronic lab Notebook']
output_csv_path = "filtered_info.csv"

# Remove existing CSV file if it exists
directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

# Initialize CSV data with headers
all_csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

for file_path in input_files:
    rdf_handler = RDFGraphHandler(file_path)
    file_csv_data = rdf_handler.extract_data(class_names)
    all_csv_data.extend(file_csv_data)

# Write accumulated CSV data to file
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(all_csv_data)

print(f"Data saved to {output_csv_path}")


Data saved to filtered_info.csv


In [None]:
#### Let's save the input name to csv file ###

In [None]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    def __init__(self, file_path):
        self.file_path = file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        file_format = 'turtle' if self.file_path.endswith('.ttl') else 'xml'
        self.graph.parse(self.file_path, format=file_format)

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode, file_name):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o, file_name))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes, file_name):
        with multiprocessing.Pool() as pool:
            results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def extract_data(self, class_names):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes, os.path.basename(self.file_path))
        csv_data.extend(blank_node_data)
        
        return csv_data

# Usage example
input_files = [
    "../Ontologies/MatWerk.xrdf", 
    "../Ontologies/materialsmine_converted.ttl" , 
    "../Ontologies/emmo.ttl",
    # "../Ontologies/schemaorg.owl"
    ]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest', 'Electronic lab Notebook']
output_csv_path = "filtered_info.csv"

# Remove existing CSV file if it exists
directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

# Initialize CSV data with headers
all_csv_data = [["File Name", "Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

for file_path in input_files:
    rdf_handler = RDFGraphHandler(file_path)
    file_csv_data = rdf_handler.extract_data(class_names)
    all_csv_data.extend(file_csv_data)

# Write accumulated CSV data to file
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(all_csv_data)

print(f"Data saved to {output_csv_path}")


Data saved to filtered_info.csv


In [None]:
#### Use 8 cores to process ####

In [None]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    def __init__(self, file_path):
        self.file_path = file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        file_format = 'turtle' if self.file_path.endswith('.ttl') else 'xml'
        self.graph.parse(self.file_path, format=file_format)

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode, file_name):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o, file_name))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes, file_name, num_cores):
        with multiprocessing.Pool(processes=num_cores) as pool:
            results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def extract_data(self, class_names, num_cores=8):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes, os.path.basename(self.file_path), num_cores)
        csv_data.extend(blank_node_data)
        
        return csv_data

# Usage example
input_files = [
    "../Ontologies/MatWerk.xrdf", 
    "../Ontologies/materialsmine_converted.ttl" , 
    "../Ontologies/emmo.ttl",
    # "../Ontologies/schemaorg.owl"
    ]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest', 'Electronic lab Notebook']
output_csv_path = "filtered_info.csv"
num_cores = 8

# Remove existing CSV file if it exists
directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

# Initialize CSV data with headers
all_csv_data = [["File Name", "Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

for file_path in input_files:
    rdf_handler = RDFGraphHandler(file_path)
    file_csv_data = rdf_handler.extract_data(class_names, num_cores)
    all_csv_data.extend(file_csv_data)

# Write accumulated CSV data to file
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(all_csv_data)

print(f"Data saved to {output_csv_path}")


Data saved to filtered_info.csv


In [None]:
#### Using Threads ####

In [None]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

class RDFGraphHandler:

    def __init__(self, file_path):
        self.file_path = file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        file_format = 'turtle' if self.file_path.endswith('.ttl') else 'xml'
        self.graph.parse(self.file_path, format=file_format)

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode, file_name):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o, file_name))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes, file_name, num_cores, num_threads):
        if num_threads > 1:
            with ThreadPool(num_threads) as pool:
                results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        else:
            with multiprocessing.Pool(processes=num_cores) as pool:
                results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def extract_data(self, class_names, num_cores=8, num_threads=1):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes, os.path.basename(self.file_path), num_cores, num_threads)
        csv_data.extend(blank_node_data)
        
        return csv_data

# Usage example
input_files = [
    # "../Ontologies/MatWerk.xrdf", 
    "../Ontologies/materialsmine_converted.ttl" , 
    # "../Ontologies/emmo.ttl",
    # "../Ontologies/schemaorg.owl"
    ]
# class_names = ["drug", "stress", 'Ampere_Per+Joule', 'Tensiletest', 'Electronic lab Notebook']
class_names = ['stress']
output_csv_path = "filtered_info.csv"
num_cores = 4
num_threads = 2

# Remove existing CSV file if it exists
directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

# Initialize CSV data with headers
all_csv_data = [["File Name", "Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

for file_path in input_files:
    rdf_handler = RDFGraphHandler(file_path)
    file_csv_data = rdf_handler.extract_data(class_names, num_cores, num_threads)
    all_csv_data.extend(file_csv_data)

# Write accumulated CSV data to file
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(all_csv_data)

print(f"Data saved to {output_csv_path}")


Data saved to filtered_info.csv


In [None]:
### Capturing the hierarchy ###

In [None]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

class RDFGraphHandler:

    def __init__(self, file_path):
        self.file_path = file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        file_format = 'turtle' if self.file_path.endswith('.ttl') else 'xml'
        self.graph.parse(self.file_path, format=file_format)

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode, file_name):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o, file_name))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes, file_name, num_cores, num_threads):
        if num_threads > 1:
            with ThreadPool(num_threads) as pool:
                results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        else:
            with multiprocessing.Pool(processes=num_cores) as pool:
                results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def traverse_class_hierarchy(self, class_uri):
        class_info = []
        query = prepareQuery("""
            SELECT ?subclass ?subclass_label WHERE {
                ?class_uri rdfs:subClassOf+ ?subclass .
                ?subclass rdfs:label ?subclass_label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'class_uri': class_uri})
        for row in qres:
            class_info.append((str(class_uri), str(row.subclass_label), str(row.subclass)))
        return class_info

    def extract_data_with_hierarchy(self, class_names, num_cores=8, num_threads=1):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []
        class_hierarchy_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

                # Extract class hierarchy information
                class_hierarchy_data.extend(self.traverse_class_hierarchy(subj))

        blank_node_data = self.extract_blank_nodes_parallel(bnodes, os.path.basename(self.file_path), num_cores, num_threads)
        csv_data.extend(blank_node_data)
        
        return csv_data, class_hierarchy_data

# Main script to run the RDF processing and CSV writing
if __name__ == "__main__":
    # Usage example
    input_files = [
        # "../Ontologies/MatWerk.xrdf", 
        "../Ontologies/materialsmine_converted.ttl" , 
        # "../Ontologies/emmo.ttl",
        # "../Ontologies/schemaorg.owl"
        ]
    # class_names = ["drug", "stress", 'Ampere_Per+Joule', 'Tensiletest', 'Electronic lab Notebook']
    class_names = ['stress']
    output_csv_path = "filtered_info.csv"
    hierarchy_csv_path = "class_hierarchy_info.csv"
    num_cores = 4
    num_threads = 2

    # Remove existing CSV files if they exist
    for filename in [output_csv_path, hierarchy_csv_path]:
        if os.path.exists(filename):
            os.remove(filename)

    # Initialize CSV data with headers
    all_csv_data = [["File Name", "Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

    # Initialize hierarchy data with headers
    all_hierarchy_data = [["Class URI", "Superclass Label", "Superclass URI"]]

    for file_path in input_files:
        rdf_handler = RDFGraphHandler(file_path)
        file_csv_data, file_hierarchy_data = rdf_handler.extract_data_with_hierarchy(class_names, num_cores, num_threads)
        all_csv_data.extend(file_csv_data)
        all_hierarchy_data.extend(file_hierarchy_data)

    # Write accumulated CSV data to files
    with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerows(all_csv_data)

    print(f"Data saved to {output_csv_path}")

    # Write hierarchy data to a separate CSV file
    with open(hierarchy_csv_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerows(all_hierarchy_data)

    print(f"Class hierarchy data saved to {hierarchy_csv_path}")


Data saved to filtered_info.csv
Class hierarchy data saved to class_hierarchy_info.csv


In [None]:
#### Above code can capture the class hierarchy but in output it is not correctly shown! ####

In [None]:
# Usage example
input_files = [
    # "../Ontologies/MatWerk.xrdf", 
    "../Ontologies/materialsmine_converted.ttl" , 
    # "../Ontologies/emmo.ttl",
    # "../Ontologies/schemaorg.owl"
    ]
# class_names = ["drug", "stress", 'Ampere_Per+Joule', 'Tensiletest', 'Electronic lab Notebook']
class_names = ['stress']
output_csv_path = "filtered_info.csv"
hierarchy_csv_path = "class_hierarchy_info.csv"
num_cores = 4
num_threads = 2

In [None]:
### Try to solve the above issue ###

In [1]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef, RDF, RDFS
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing
from multiprocessing.dummy import Pool as ThreadPool

class RDFGraphHandler:

    def __init__(self, file_path):
        self.file_path = file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        file_format = 'turtle' if self.file_path.endswith('.ttl') else 'xml'
        self.graph.parse(self.file_path, format=file_format)

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def traverse_class_hierarchy_recursive(self, class_uri, class_label, visited):
        class_hierarchy_data = []

        query = prepareQuery("""
            SELECT ?subclass ?subclass_label WHERE {
                ?class_uri rdfs:subClassOf ?subclass .
                ?subclass rdfs:label ?subclass_label .
            }
            ORDER BY DESC(?subclass)
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'class_uri': class_uri})
        for row in qres:
            subclass_uri = row.subclass
            subclass_label = row.subclass_label

            if subclass_uri not in visited:
                visited.add(subclass_uri)
                # class_hierarchy_data.append((subclass_uri, subclass_label, class_uri, class_label))
                class_hierarchy_data.append((class_label, class_uri, subclass_label, subclass_uri))
                class_hierarchy_data.extend(self.traverse_class_hierarchy_recursive(subclass_uri, subclass_label, visited))

        return class_hierarchy_data

    def extract_class_hierarchy(self, class_names):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        hierarchy_data = []

        for class_uri in self.graph.subjects(RDFS.subClassOf):
            class_label, _ = self.get_label_and_uri(class_uri)
            if self.normalize_string(class_label) in normalized_class_names:
                visited = set()
                hierarchy_data.extend(self.traverse_class_hierarchy_recursive(class_uri, class_label, visited))

        return hierarchy_data

    def extract_data_with_hierarchy(self, class_names, num_cores=1, num_threads=0):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []
        hierarchy_data = self.extract_class_hierarchy(class_names)

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes, os.path.basename(self.file_path), num_cores, num_threads)
        csv_data.extend(blank_node_data)
        
        return csv_data, hierarchy_data

    def extract_blank_node(self, bnode, file_name):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o, file_name))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes, file_name, num_cores, num_threads):
        if num_threads > 1:
            with ThreadPool(num_threads) as pool:
                results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        else:
            with multiprocessing.Pool(processes=num_cores) as pool:
                results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data


if __name__ == "__main__":
    input_files = [
    # "../Ontologies/MatWerk.xrdf", 
    "../Ontologies/nfdicore_2.ttl", 
    # "../Ontologies/fabio.ttl", 
    "../Ontologies/pmdco_core.ttl", 
    "../Ontologies/materialsmine_converted.ttl" , 
    # "../Ontologies/emmo.ttl",
    # "../Ontologies/schemaorg.owl"
    ]
    class_names = ['stress','drug','AmpereHour+Per_Litre','Electroniclab_Notebook', 'ClampingPressure']  # Adjust class names as needed
    output_csv_path = "filtered_info.csv"
    hierarchy_csv_path = "class_hierarchy_info.csv"
    num_cores = 1
    num_threads = 1

    # Remove existing CSV files if they exist
    for filename in [output_csv_path, hierarchy_csv_path]:
        if os.path.exists(filename):
            os.remove(filename)

    # Initialize CSV data with headers
    all_csv_data = [["File Name", "Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

    # Initialize hierarchy data with headers
    all_hierarchy_data = [["Class Label", "Class URI", "is SubClass of", "Superclass URI"]]

    for file_path in input_files:
        rdf_handler = RDFGraphHandler(file_path)
        file_csv_data, file_hierarchy_data = rdf_handler.extract_data_with_hierarchy(class_names, num_cores, num_threads)
        all_csv_data.extend(file_csv_data)

        # Append hierarchy data to all_hierarchy_data
        all_hierarchy_data.extend(file_hierarchy_data)

    # Write accumulated CSV data to files
    with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerows(all_csv_data)

    print(f"Data saved to {output_csv_path}")

    # Write hierarchy data to a separate CSV file
    with open(hierarchy_csv_path, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerows(all_hierarchy_data)

    print(f"Class hierarchy data saved to {hierarchy_csv_path}")


Data saved to filtered_info.csv
Class hierarchy data saved to class_hierarchy_info.csv
