In [1]:
from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL, Namespace, Literal

# Define the Turtle data with necessary prefixes
turtle_data = """
@prefix sio: <http://semanticscience.org/resource/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix bio: <http://data.bioontology.org/metadata/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://semanticscience.org/resource/Drug> a owl:Class ;
    rdfs:subClassOf <http://semanticscience.org/resource/ChemicalSubstance>, [
        rdf:type owl:Restriction ;
        owl:onProperty <http://semanticscience.org/resource/hasCapability> ;
        owl:someValuesFrom [
            rdf:type owl:Class ;
            owl:intersectionOf (
                <http://semanticscience.org/resource/ToRegulate>
                [
                    rdf:type owl:Restriction ;
                    owl:onProperty <http://semanticscience.org/resource/inRelationTo> ;
                    owl:someValuesFrom <http://semanticscience.org/resource/BiologicalEntity> ;
                ]
            ) ;
        ] ;
    ] ;
    bio:prefixIRI "sio:Drug" ;
    dcterms:description "A drug is a chemical substance that contains one or more active ingredients that regulate one or more biological processes."@en ;
    rdfs:isDefinedBy <http://semanticscience.org/ontology/sio/v1.53/sio-subset-labels.owl> ;
    rdfs:label "drug"@en .
"""

# Create a Graph and parse the data
g = Graph()
g.parse(data=turtle_data, format="turtle")

# Define the necessary namespaces
SIO = Namespace("http://semanticscience.org/resource/")
DC = Namespace("http://purl.org/dc/terms/")
BIO = Namespace("http://data.bioontology.org/metadata/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind prefixes to the graph (optional but useful for serialization and querying)
g.bind("sio", SIO)
g.bind("dcterms", DC)
g.bind("bio", BIO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("rdf", RDF)

# Define the resource URI
drug_uri = URIRef("http://semanticscience.org/resource/Drug")

# Function to recursively print blank nodes
def print_blank_node(bnode):
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            print(f"Blank Node Predicate: {p}, Object is another Blank Node")
            print_blank_node(o)
        else:
            print(f"Blank Node Predicate: {p}, Object: {o}")

# Extract Class
for _, _, o in g.triples((drug_uri, RDF.type, None)):
    print(f"Class: {o}")

# Extract SubClass
for _, _, o in g.triples((drug_uri, RDFS.subClassOf, None)):
    if isinstance(o, BNode):
        print("SubClass is a blank node with the following properties:")
        print_blank_node(o)
    else:
        print(f"SubClass: {o}")

# Extract other information
for _, p, o in g.triples((drug_uri, None, None)):
    if p not in {RDF.type, RDFS.subClassOf}:
        print(f"Predicate: {p}, Object: {o}")



Class: http://www.w3.org/2002/07/owl#Class
SubClass: http://semanticscience.org/resource/ChemicalSubstance
SubClass is a blank node with the following properties:
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://www.w3.org/2002/07/owl#Restriction
Blank Node Predicate: http://www.w3.org/2002/07/owl#onProperty, Object: http://semanticscience.org/resource/hasCapability
Blank Node Predicate: http://www.w3.org/2002/07/owl#someValuesFrom, Object is another Blank Node
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#type, Object: http://www.w3.org/2002/07/owl#Class
Blank Node Predicate: http://www.w3.org/2002/07/owl#intersectionOf, Object is another Blank Node
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#first, Object: http://semanticscience.org/resource/ToRegulate
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syntax-ns#rest, Object is another Blank Node
Blank Node Predicate: http://www.w3.org/1999/02/22-rdf-syn

In [2]:
import csv
from rdflib import Graph, URIRef, BNode, RDF, RDFS, OWL, Namespace

# Define the Turtle data with necessary prefixes
turtle_data = """
@prefix sio: <http://semanticscience.org/resource/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix bio: <http://data.bioontology.org/metadata/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://semanticscience.org/resource/Drug> a owl:Class ;
    rdfs:subClassOf <http://semanticscience.org/resource/ChemicalSubstance>, [
        rdf:type owl:Restriction ;
        owl:onProperty <http://semanticscience.org/resource/hasCapability> ;
        owl:someValuesFrom [
            rdf:type owl:Class ;
            owl:intersectionOf (
                <http://semanticscience.org/resource/ToRegulate>
                [
                    rdf:type owl:Restriction ;
                    owl:onProperty <http://semanticscience.org/resource/inRelationTo> ;
                    owl:someValuesFrom <http://semanticscience.org/resource/BiologicalEntity> ;
                ]
            ) ;
        ] ;
    ] ;
    bio:prefixIRI "sio:Drug" ;
    dcterms:description "A drug is a chemical substance that contains one or more active ingredients that regulate one or more biological processes."@en ;
    rdfs:isDefinedBy <http://semanticscience.org/ontology/sio/v1.53/sio-subset-labels.owl> ;
    rdfs:label "drug"@en .
"""

# Create a Graph and parse the data
g = Graph()
g.parse(data=turtle_data, format="turtle")

# Define the necessary namespaces
SIO = Namespace("http://semanticscience.org/resource/")
DC = Namespace("http://purl.org/dc/terms/")
BIO = Namespace("http://data.bioontology.org/metadata/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind prefixes to the graph (optional but useful for serialization and querying)
g.bind("sio", SIO)
g.bind("dcterms", DC)
g.bind("bio", BIO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("rdf", RDF)

# Define the resource URI
drug_uri = URIRef("http://semanticscience.org/resource/Drug")

# Function to recursively extract blank node information
def extract_blank_node(bnode):
    blank_node_data = []
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            blank_node_data.append((p, "Blank Node"))
            blank_node_data.extend(extract_blank_node(o))
        else:
            blank_node_data.append((p, o))
    return blank_node_data

# Prepare data for CSV
csv_data = [["Subject", "Predicate", "Object"]]

# Extract Class
for _, _, o in g.triples((drug_uri, RDF.type, None)):
    csv_data.append([drug_uri, RDF.type, o])

# Extract SubClass
for _, _, o in g.triples((drug_uri, RDFS.subClassOf, None)):
    if isinstance(o, BNode):
        csv_data.append([drug_uri, RDFS.subClassOf, "Blank Node"])
        csv_data.extend(extract_blank_node(o))
    else:
        csv_data.append([drug_uri, RDFS.subClassOf, o])

# Extract other information
for _, p, o in g.triples((drug_uri, None, None)):
    if p not in {RDF.type, RDFS.subClassOf}:
        csv_data.append([drug_uri, p, o])

# Write data to CSV
with open("drug_info.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

print("Data saved to drug_info.csv")


Data saved to drug_info.csv


In [1]:
import csv
from rdflib import Graph, URIRef, BNode, RDF, RDFS, Namespace

# Define the Turtle data with necessary prefixes
turtle_data = """
@prefix sio: <http://semanticscience.org/resource/> .
@prefix dcterms: <http://purl.org/dc/terms/> .
@prefix bio: <http://data.bioontology.org/metadata/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .

<http://semanticscience.org/resource/Drug> a owl:Class ;
    rdfs:subClassOf <http://semanticscience.org/resource/ChemicalSubstance>, [
        rdf:type owl:Restriction ;
        owl:onProperty <http://semanticscience.org/resource/hasCapability> ;
        owl:someValuesFrom [
            rdf:type owl:Class ;
            owl:intersectionOf (
                <http://semanticscience.org/resource/ToRegulate>
                [
                    rdf:type owl:Restriction ;
                    owl:onProperty <http://semanticscience.org/resource/inRelationTo> ;
                    owl:someValuesFrom <http://semanticscience.org/resource/BiologicalEntity> ;
                ]
            ) ;
        ] ;
    ] ;
    bio:prefixIRI "sio:Drug" ;
    dcterms:description "A drug is a chemical substance that contains one or more active ingredients that regulate one or more biological processes."@en ;
    rdfs:isDefinedBy <http://semanticscience.org/ontology/sio/v1.53/sio-subset-labels.owl> ;
    rdfs:label "drug"@en .
"""


# Create a Graph and parse the data
g = Graph()
g.parse(data=turtle_data, format="turtle")

# Define the necessary namespaces
SIO = Namespace("http://semanticscience.org/resource/")
DC = Namespace("http://purl.org/dc/terms/")
BIO = Namespace("http://data.bioontology.org/metadata/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind prefixes to the graph (optional but useful for serialization and querying)
g.bind("sio", SIO)
g.bind("dcterms", DC)
g.bind("bio", BIO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("rdf", RDF)

# Define the resource URI
drug_uri = URIRef("http://semanticscience.org/resource/Drug")

# Function to get label or URI parts
def get_label_and_uri(uri):
    qres = g.query(
        """
        SELECT ?label WHERE {
            ?uri rdfs:label ?label .
        }
        """,
        initBindings={'uri': uri}
    )
    for row in qres:
        return str(row.label), str(uri)
    return uri.split('/')[-1].split('#')[-1], str(uri)

# Function to recursively extract blank node information
def extract_blank_node(bnode):
    blank_node_data = []
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            pred_label, pred_uri = get_label_and_uri(p)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
            blank_node_data.extend(extract_blank_node(o))
        else:
            pred_label, pred_uri = get_label_and_uri(p)
            obj_label, obj_uri = get_label_and_uri(o)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
    return blank_node_data

# Prepare data for CSV
csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

# Extract Class
for _, _, o in g.triples((drug_uri, RDF.type, None)):
    subj_label, subj_uri = get_label_and_uri(drug_uri)
    obj_label, obj_uri = get_label_and_uri(o)
    csv_data.append([subj_label, subj_uri, "type", str(RDF.type), obj_label, obj_uri])

# Extract SubClass
for _, _, o in g.triples((drug_uri, RDFS.subClassOf, None)):
    subj_label, subj_uri = get_label_and_uri(drug_uri)
    pred_label, pred_uri = get_label_and_uri(RDFS.subClassOf)
    if isinstance(o, BNode):
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
        csv_data.extend(extract_blank_node(o))
    else:
        obj_label, obj_uri = get_label_and_uri(o)
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

# Extract other information
for _, p, o in g.triples((drug_uri, None, None)):
    if p not in {RDF.type, RDFS.subClassOf}:
        subj_label, subj_uri = get_label_and_uri(drug_uri)
        pred_label, pred_uri = get_label_and_uri(p)
        obj_label, obj_uri = get_label_and_uri(o)
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

# Write data to CSV
with open("drug_info.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

print("Data saved to drug_info.csv")


Data saved to drug_info.csv


In [4]:
import csv
from rdflib import Graph, URIRef, BNode, RDF, RDFS, Namespace

# Define the path to the Turtle file
turtle_file_path = "../Ontologies/materialsmine_converted.ttl"

# Create a Graph and parse the data from the file
g = Graph()
g.parse(turtle_file_path, format="turtle")

# Define the necessary namespaces
SIO = Namespace("http://semanticscience.org/resource/")
DC = Namespace("http://purl.org/dc/terms/")
BIO = Namespace("http://data.bioontology.org/metadata/")
OWL = Namespace("http://www.w3.org/2002/07/owl#")
RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")

# Bind prefixes to the graph (optional but useful for serialization and querying)
g.bind("sio", SIO)
g.bind("dcterms", DC)
g.bind("bio", BIO)
g.bind("owl", OWL)
g.bind("rdfs", RDFS)
g.bind("rdf", RDF)

# Function to get label or URI parts
def get_label_and_uri(uri):
    qres = g.query(
        """
        SELECT ?label WHERE {
            ?uri rdfs:label ?label .
        }
        """,
        initBindings={'uri': uri}
    )
    for row in qres:
        return str(row.label), str(uri)
    return uri.split('/')[-1].split('#')[-1], str(uri)

# Function to recursively extract blank node information
def extract_blank_node(bnode):
    blank_node_data = []
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            pred_label, pred_uri = get_label_and_uri(p)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
            blank_node_data.extend(extract_blank_node(o))
        else:
            pred_label, pred_uri = get_label_and_uri(p)
            obj_label, obj_uri = get_label_and_uri(o)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
    return blank_node_data

# Prepare data for CSV
csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

# Extract all triples
for subj, pred, obj in g:
    subj_label, subj_uri = get_label_and_uri(subj)
    pred_label, pred_uri = get_label_and_uri(pred)
    
    if isinstance(obj, BNode):
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
        csv_data.extend(extract_blank_node(obj))
    else:
        obj_label, obj_uri = get_label_and_uri(obj)
        csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

# Write data to CSV
with open("all_info.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(csv_data)

print("Data saved to all_info.csv")


Data saved to all_info.csv


In [None]:
#### Reading from the file ####

In [3]:
import csv
import os
import re
from rdflib import Graph, URIRef, BNode, RDF, RDFS, Namespace


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

class RDFGraphHandler:
    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        qres = self.graph.query(
            """
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initBindings={'uri': uri}
        )
        for row in qres:
            return str(row.label), str(uri)
        return uri.split('/')[-1].split('#')[-1], str(uri)

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)
            
            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    csv_data.extend(self.extract_blank_node(obj))
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(csv_data)

        print(f"Data saved to {output_csv_path}")

# Usage example
turtle_file_path = "../Ontologies/materialsmine_converted.ttl"
class_names = ["drug" , "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info.csv"

rdf_handler = RDFGraphHandler(turtle_file_path)
rdf_handler.save_to_csv(class_names, output_csv_path)


Data saved to filtered_info.csv


In [4]:
### Improving the speed of info capturing ####

In [1]:
import csv
import os
import re
from rdflib import Graph, URIRef, BNode, RDF, RDFS, Namespace


directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

class RDFGraphHandler:
    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        qres = self.graph.query(
            """
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initBindings={'uri': uri}
        )
        for row in qres:
            return str(row.label), str(uri)
        return uri.split('/')[-1].split('#')[-1], str(uri)

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    
    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        label_cache = {}

        for subj, pred, obj in self.graph:
            subj_label, subj_uri = label_cache.get(subj, self.get_label_and_uri(subj))
            pred_label, pred_uri = label_cache.get(pred, self.get_label_and_uri(pred))
            normalized_subj_label = self.normalize_string(subj_label)
            
            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    csv_data.extend(self.extract_blank_node(obj))
                    # Set placeholders for obj_label and obj_uri when obj is a BNode
                    obj_label, obj_uri = "Blank Node", "Blank Node"
                else:
                    obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])
            else:
                # Handle the case where normalized_subj_label is not in normalized_class_names
                continue  # Or handle as needed

            # Ensure obj_label and obj_uri are always assigned outside of the conditional
            if not isinstance(obj, BNode):
                obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))


        with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(csv_data)

        print(f"Data saved to {output_csv_path}")



# Usage example
turtle_file_path = "../Ontologies/materialsmine_converted.ttl"
class_names = ["drug" , "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info.csv"

rdf_handler = RDFGraphHandler(turtle_file_path)
rdf_handler.save_to_csv(class_names, output_csv_path)


Data saved to filtered_info.csv


In [None]:
#### Implementing multiple input files

In [1]:
import csv
import os
import re
from rdflib import Graph, BNode, Namespace


class RDFGraphHandler:
    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        qres = self.graph.query(
            """
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initBindings={'uri': uri}
        )
        for row in qres:
            return str(row.label), str(uri)
        return uri.split('/')[-1].split('#')[-1], str(uri)

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    
    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        label_cache = {}

        for subj, pred, obj in self.graph:
            subj_label, subj_uri = label_cache.get(subj, self.get_label_and_uri(subj))
            pred_label, pred_uri = label_cache.get(pred, self.get_label_and_uri(pred))
            normalized_subj_label = self.normalize_string(subj_label)
            
            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    csv_data.extend(self.extract_blank_node(obj))
                    # Set placeholders for obj_label and obj_uri when obj is a BNode
                    obj_label, obj_uri = "Blank Node", "Blank Node"
                else:
                    obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

            # Ensure obj_label and obj_uri are always assigned outside of the conditional
            if not isinstance(obj, BNode):
                obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))

        with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(csv_data)

        print(f"Data saved to {output_csv_path}")


# Usage example
input_turtle_files = ["../Ontologies/materialsmine_converted.ttl"]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info_2.csv"

for turtle_file_path in input_turtle_files:
    rdf_handler = RDFGraphHandler(turtle_file_path)
    rdf_handler.save_to_csv(class_names, output_csv_path)


Data saved to filtered_info_2.csv


In [2]:
### Using multiprocessing ti speed up the process ###

In [3]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    directory = '.'
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            os.remove(os.path.join(directory, filename))


    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes):
        with multiprocessing.Pool() as pool:
            results = pool.map(self.extract_blank_node, bnodes)
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes)
        csv_data.extend(blank_node_data)

        with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerows(csv_data)

        print(f"Data saved to {output_csv_path}")

# Usage example
input_turtle_files = ["../Ontologies/materialsmine_converted.ttl", "../Ontologies/emmo.ttl"]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info.csv"

for turtle_file_path in input_turtle_files:
    rdf_handler = RDFGraphHandler(turtle_file_path)
    rdf_handler.save_to_csv(class_names, output_csv_path)


Data saved to filtered_info_5.csv
Data saved to filtered_info_5.csv


In [4]:
#### The above code work much faster! however when two or more inputs are given then for each inputs the outputs in CSV are removed! ###


In [5]:
### Here I try to solve the above issue ###

In [6]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes):
        with multiprocessing.Pool() as pool:
            results = pool.map(self.extract_blank_node, bnodes)
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def extract_data(self, class_names):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes)
        csv_data.extend(blank_node_data)
        
        return csv_data

# Usage example
input_turtle_files = ["../Ontologies/emmo.ttl" , "../Ontologies/materialsmine_converted.ttl"]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest']
output_csv_path = "filtered_info.csv"

# Remove existing CSV file if it exists
directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))


# Initialize CSV data with headers
all_csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

for turtle_file_path in input_turtle_files:
    rdf_handler = RDFGraphHandler(turtle_file_path)
    file_csv_data = rdf_handler.extract_data(class_names)
    all_csv_data.extend(file_csv_data)

# Write accumulated CSV data to file
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(all_csv_data)

print(f"Data saved to {output_csv_path}")



In [None]:
#### Above code down not accept OWL file; Let's fix it ####

In [4]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    def __init__(self, file_path):
        self.file_path = file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        file_format = 'turtle' if self.file_path.endswith('.ttl') else 'xml'
        self.graph.parse(self.file_path, format=file_format)

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes):
        with multiprocessing.Pool() as pool:
            results = pool.map(self.extract_blank_node, bnodes)
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def extract_data(self, class_names):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes)
        csv_data.extend(blank_node_data)
        
        return csv_data

# Usage example
input_files = [
    "../Ontologies/MatWerk.xrdf", 
    "../Ontologies/materialsmine_converted.ttl" , 
    "../Ontologies/emmo.ttl",
    # "../Ontologies/schemaorg.owl"
    ]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest', 'Electronic lab Notebook']
output_csv_path = "filtered_info.csv"

# Remove existing CSV file if it exists
directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

# Initialize CSV data with headers
all_csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

for file_path in input_files:
    rdf_handler = RDFGraphHandler(file_path)
    file_csv_data = rdf_handler.extract_data(class_names)
    all_csv_data.extend(file_csv_data)

# Write accumulated CSV data to file
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(all_csv_data)

print(f"Data saved to {output_csv_path}")


Data saved to filtered_info.csv


In [5]:
#### Let's save the input name to csv file ###

In [6]:
import csv
import os
import re
from rdflib import Graph, BNode, URIRef
from rdflib.plugins.sparql import prepareQuery
from rdflib import Namespace
import multiprocessing

class RDFGraphHandler:

    def __init__(self, file_path):
        self.file_path = file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.label_cache = {}
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        file_format = 'turtle' if self.file_path.endswith('.ttl') else 'xml'
        self.graph.parse(self.file_path, format=file_format)

    def get_label_and_uri(self, uri):
        if uri in self.label_cache:
            return self.label_cache[uri]

        query = prepareQuery("""
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initNs={"rdfs": self.RDFS}
        )
        qres = self.graph.query(query, initBindings={'uri': uri})
        for row in qres:
            result = (str(row.label), str(uri))
            self.label_cache[uri] = result
            return result
        
        result = (uri.split('/')[-1].split('#')[-1], str(uri))
        self.label_cache[uri] = result
        return result

    def extract_blank_node(self, bnode, file_name):
        blank_node_data = []
        for p, o in self.graph.predicate_objects(bnode):
            if isinstance(o, BNode):
                pred_label, pred_uri = self.get_label_and_uri(p)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                blank_node_data.extend(self.extract_blank_node(o, file_name))
            else:
                pred_label, pred_uri = self.get_label_and_uri(p)
                obj_label, obj_uri = self.get_label_and_uri(o)
                blank_node_data.append((file_name, "Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
        return blank_node_data

    def extract_blank_nodes_parallel(self, bnodes, file_name):
        with multiprocessing.Pool() as pool:
            results = pool.starmap(self.extract_blank_node, [(bnode, file_name) for bnode in bnodes])
        blank_node_data = [item for sublist in results for item in sublist]
        return blank_node_data

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def extract_data(self, class_names):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = []

        bnodes = []
        for subj, pred, obj in self.graph:
            subj_label, subj_uri = self.get_label_and_uri(subj)
            pred_label, pred_uri = self.get_label_and_uri(pred)
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    bnodes.append(obj)
                else:
                    obj_label, obj_uri = self.get_label_and_uri(obj)
                    csv_data.append([os.path.basename(self.file_path), subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        blank_node_data = self.extract_blank_nodes_parallel(bnodes, os.path.basename(self.file_path))
        csv_data.extend(blank_node_data)
        
        return csv_data

# Usage example
input_files = [
    "../Ontologies/MatWerk.xrdf", 
    "../Ontologies/materialsmine_converted.ttl" , 
    "../Ontologies/emmo.ttl",
    # "../Ontologies/schemaorg.owl"
    ]
class_names = ["drug", "stress", 'AmperePerJoule', 'Tensiletest', 'Electronic lab Notebook']
output_csv_path = "filtered_info.csv"

# Remove existing CSV file if it exists
directory = '.'
for filename in os.listdir(directory):
    if filename.endswith(".csv"):
        os.remove(os.path.join(directory, filename))

# Initialize CSV data with headers
all_csv_data = [["File Name", "Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

for file_path in input_files:
    rdf_handler = RDFGraphHandler(file_path)
    file_csv_data = rdf_handler.extract_data(class_names)
    all_csv_data.extend(file_csv_data)

# Write accumulated CSV data to file
with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.writer(file)
    writer.writerows(all_csv_data)

print(f"Data saved to {output_csv_path}")


Data saved to filtered_info.csv


In [18]:
#### Using RDF lib parsing to improvment ####

In [None]:
#### Using multiprocessing ####

In [15]:
import csv
import os
import re
import multiprocessing
from multiprocessing import Manager, Pool
from rdflib import Graph, BNode, Namespace

class RDFGraphHandler:
    def __init__(self, turtle_file_path):
        self.turtle_file_path = turtle_file_path
        self.graph = Graph()
        self.SIO = Namespace("http://semanticscience.org/resource/")
        self.DC = Namespace("http://purl.org/dc/terms/")
        self.BIO = Namespace("http://data.bioontology.org/metadata/")
        self.OWL = Namespace("http://www.w3.org/2002/07/owl#")
        self.RDFS = Namespace("http://www.w3.org/2000/01/rdf-schema#")
        self.RDF = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
        self.bind_namespaces()
        self.parse_graph()

    def bind_namespaces(self):
        self.graph.bind("sio", self.SIO)
        self.graph.bind("dcterms", self.DC)
        self.graph.bind("bio", self.BIO)
        self.graph.bind("owl", self.OWL)
        self.graph.bind("rdfs", self.RDFS)
        self.graph.bind("rdf", self.RDF)

    def parse_graph(self):
        self.graph.parse(self.turtle_file_path, format="turtle")

    def get_label_and_uri(self, uri):
        qres = self.graph.query(
            """
            SELECT ?label WHERE {
                ?uri rdfs:label ?label .
            }
            """,
            initBindings={'uri': uri}
        )
        for row in qres:
            return str(row.label), str(uri)
        return uri.split('/')[-1].split('#')[-1], str(uri)

    def extract_blank_node(self, bnode, label_cache):
        def process_node(bnode):
            blank_node_data = []
            for p, o in self.graph.predicate_objects(bnode):
                if isinstance(o, BNode):
                    pred_label, pred_uri = self.get_label_and_uri(p)
                    blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
                    blank_node_data.extend(self.extract_blank_node(o, label_cache))
                else:
                    pred_label, pred_uri = self.get_label_and_uri(p)
                    obj_label, obj_uri = self.get_label_and_uri(o)
                    blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
            return blank_node_data
        
        return process_node(bnode)

    def normalize_string(self, s):
        return re.sub(r'[^a-z0-9]', '', s.lower())

    def process_chunk(self, chunk, normalized_class_names, label_cache):
        chunk_data = []
        for subj, pred, obj in chunk:
            subj_label, subj_uri = label_cache.get(subj, self.get_label_and_uri(subj))
            pred_label, pred_uri = label_cache.get(pred, self.get_label_and_uri(pred))
            normalized_subj_label = self.normalize_string(subj_label)

            if normalized_subj_label in normalized_class_names:
                if isinstance(obj, BNode):
                    chunk_data.append([subj_label, subj_uri, pred_label, pred_uri, "Blank Node", "Blank Node"])
                    chunk_data.extend(self.extract_blank_node(obj, label_cache))
                else:
                    obj_label, obj_uri = label_cache.get(obj, self.get_label_and_uri(obj))
                    chunk_data.append([subj_label, subj_uri, pred_label, pred_uri, obj_label, obj_uri])

        return chunk_data

    def save_to_csv(self, class_names, output_csv_path):
        normalized_class_names = {self.normalize_string(name) for name in class_names}
        csv_data = [["Subject", "Subject URI", "Predicate", "Predicate URI", "Object", "Object URI"]]

        manager = Manager()
        label_cache = manager.dict()

        # Determine chunk size (you can adjust this as needed)
        chunk_size = 1000
        chunks = [list(self.graph[i:i + chunk_size]) for i in range(0, len(self.graph), chunk_size)]

        # Use multiprocessing Pool to parallelize processing of chunks
        with Pool() as pool:
            chunk_results = pool.starmap(self.process_chunk, [(chunk, normalized_class_names, label_cache) for chunk in chunks])

        for result in chunk_results:
            csv_data.extend(result)

        # Write data to CSV file
        try:
            with open(output_csv_path, mode="w", newline="", encoding="utf-8") as file:
                writer = csv.writer(file)
                writer.writerows(csv_data)
            print(f"Data saved to {output_csv_path}")
        except Exception as e:
            print(f"Error saving data to {output_csv_path}: {e}")

# Usage example


In [16]:
# Usage example
if __name__ == "__main__":
    turtle_file_path = "../Ontologies/materialsmine_converted.ttl"
    class_names = ["drug", "stress", "AmperePerJoule", "Tensiletest"]
    output_csv_path = "filtered_info_opti.csv"

    rdf_handler = RDFGraphHandler(turtle_file_path)
    rdf_handler.save_to_csv(class_names, output_csv_path)

Data saved to filtered_info_opti.csv


In [19]:
### The above code does not print anyting in CSV file! ###

In [None]:
#### Try to combine the above way to previous one on v4 ####

In [1]:
import csv
import re
import os
from rdflib import Graph, RDF, RDFS, OWL, SKOS, Namespace, URIRef, Literal, BNode

# Define namespaces (assuming these are already defined in your script)...
ex = Namespace("http://example.org/ontology/")
sio = Namespace("http://semanticscience.org/resource/")
skos = Namespace("http://www.w3.org/2004/02/skos/core#")
owl = Namespace("http://www.w3.org/2002/07/owl#")
rdfs = Namespace("http://www.w3.org/2000/01/rdf-schema#")
materialsmine = Namespace("http://materialsmine.org/ns/")
bibo = Namespace("http://purl.org/ontology/bibo/")
rdf = Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#")
xsd = Namespace("http://www.w3.org/2001/XMLSchema#")
xml = Namespace("http://www.w3.org/XML/1998/namespace")
foaf = Namespace("http://xmlns.com/foaf/0.1/")
dcterms = Namespace("http://purl.org/dc/terms/")
isPartOf = dcterms.isPartOf
DCTERMS = Namespace("http://purl.org/dc/terms/")

def normalize_string(s):
    s = s.lower()
    s = re.sub(r'[_\-+\s]+', '', s)
    s = s.replace('...', '')
    return s

# def get_label_and_uri(g, uri):
#     qres = g.query(
#         """
#         SELECT ?label WHERE {
#             ?uri rdfs:label ?label .
#         }
#         """,
#         initBindings={'uri': uri}
#     )
#     for row in qres:
#         return str(row.label), str(uri)
#     return uri.split('/')[-1].split('#')[-1], str(uri)

def extract_blank_node(g, bnode):
    blank_node_data = []
    for p, o in g.predicate_objects(bnode):
        if isinstance(o, BNode):
            pred_label, pred_uri = get_label_and_uri(g, p)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, "Blank Node", pred_uri))
            blank_node_data.extend(extract_blank_node(g, o))
        else:
            pred_label, pred_uri = get_label_and_uri(g, p)
            obj_label, obj_uri = get_label_and_uri(g, o)
            blank_node_data.append(("Blank Node", "Blank Node", pred_label, obj_label, pred_uri, obj_uri))
    return blank_node_data

def process_ontology(ontology_files, initial_class_names_to_check, output_hierarchy_file, class_output_file, relations_output_file):
    directory = '.'
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            os.remove(os.path.join(directory, filename))

    g = Graph()
    processed_classes = set()
    processed_relations = set()

    def load_and_collect_classes_and_relations(file_path, class_names_to_check, g):
        if file_path.endswith('.ttl'):
            file_format = 'ttl'
        elif file_path.endswith('.owl'):
            file_format = 'xml'
        elif file_path.endswith('.xrdf'):
            file_format = 'xml'
        else:
            raise ValueError("Unsupported file format. Only .ttl and .owl files are supported.")
        
        g.parse(file_path, format=file_format)

        normalized_class_names_to_check = {normalize_string(name) for name in class_names_to_check}

        classes = set(g.subjects(RDF.type, OWL.Class)).union(g.subjects(RDF.type, RDFS.Class))

        data = []
        relations = []
        found_class_labels = set()

        for cls in classes:
            if isinstance(cls, URIRef):  # Check if the subject is a URI
                labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
                description = get_class_descriptions(g, cls)

                for label in labels:
                    if label is not None:
                        normalized_label = normalize_string(label)
                        found_class_labels.add(normalized_label)

                        if normalized_label in normalized_class_names_to_check:
                            # Check if already processed in this iteration
                            if str(cls) not in processed_classes:
                                processed_classes.add(str(cls))
                                data.append([file_path, str(cls), str(label), str(description) if description is not None else ""])

                            for obj in g.objects(cls, RDFS.subClassOf):
                                if isinstance(obj, URIRef):  # Check if the object is a URI
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'subClassOf', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'subClassOf', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'subClassOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                            for obj in g.objects(cls, OWL.equivalentClass):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'equivalentClass', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])
                                else:
                                    # Handle blank nodes for equivalentClass
                                    obj_label = get_complex_expression_label(g, obj)
                                    obj_description = "Complex class expression"

                                    if (str(cls), 'equivalentClass', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'equivalentClass', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'equivalentClass', "Complex class expression", str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

                            for obj in g.objects(cls, DCTERMS.isPartOf):
                                if isinstance(obj, URIRef):
                                    obj_label = get_class_label(g, obj)
                                    obj_description = get_class_descriptions(g, obj)

                                    # Check if already processed in this iteration
                                    if (str(cls), 'isPartOf', str(obj)) not in processed_relations:
                                        processed_relations.add((str(cls), 'isPartOf', str(obj)))
                                        relations.append([file_path, str(cls), str(label), 'isPartOf', str(obj), str(obj_label) if obj_label is not None else "", str(obj_description) if obj_description is not None else ""])

        return data, relations, found_class_labels

    def get_class_label(g, cls):
        labels = list(g.objects(cls, SKOS.altLabel)) + list(g.objects(cls, SKOS.prefLabel)) + list(g.objects(cls, RDFS.label))
        return labels[0] if labels else None

    def get_class_descriptions(g, cls):
        descriptions = list(g.objects(cls, DCTERMS.description)) + list(g.objects(cls, SKOS.definition)) + list(g.objects(cls, RDFS.comment))
        return " ".join([str(desc) for desc in descriptions if desc is not None]) if descriptions else None

    def get_complex_expression_label(g, node):
        if (node, RDF.type, OWL.Restriction) in g:
            prop = list(g.objects(node, OWL.onProperty))
            val = list(g.objects(node, OWL.someValuesFrom))
            if prop and val:
                prop_label = get_class_label(g, prop[0])
                val_label = get_class_label(g, val[0])
                return f"Restriction on {prop_label} some {val_label}"
        elif (node, RDF.type, OWL.Class) in g:
            intersection = list(g.objects(node, OWL.intersectionOf))
            if intersection:
                components = []
                for item in g.items(intersection[0]):
                    if isinstance(item, URIRef):
                        component_label = get_class_label(g, item)
                        if component_label:
                            components.append(component_label)
                    elif isinstance(item, BNode):
                        restriction_label = get_complex_expression_label(g, item)
                        if restriction_label:
                            components.append(restriction_label)
                if components:
                    return f"Intersection of {' and '.join(components)}"
        return None

    g = Graph()
    all_data = []
    all_relations = []
    all_found_class_labels = set()
    max_iterations = 2
    iteration_count = 0

    class_names_to_check = initial_class_names_to_check

    while class_names_to_check and iteration_count < max_iterations:
        iteration_count += 1
        new_data = []
        new_relations = []
        new_found_class_labels = set()

        for ontology_file in ontology_files:
            file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, class_names_to_check, g)
            new_data.extend(file_data)
            new_relations.extend(file_relations)
            new_found_class_labels.update(found_class_labels)

        all_data.extend(new_data)
        all_relations.extend(new_relations)
        all_found_class_labels.update(new_found_class_labels)

        class_names_to_check = {str(label) for label in new_found_class_labels} - {normalize_string(name) for name in initial_class_names_to_check}
        class_names_to_check = [normalize_string(name) for name in class_names_to_check]

        # Filter and save class data
        filtered_data = [row for row in new_data if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
        with open(class_output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if filtered_data:
                current_class_name = filtered_data[0][2]  # Get the class name from the first row
                writer.writerows(filtered_data)

        # Filter and save class relations
        filtered_relations = [row for row in new_relations if normalize_string(row[2]) in {normalize_string(name) for name in initial_class_names_to_check}]
        with open(relations_output_file, mode='a', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            if filtered_relations:
                current_class_name = filtered_relations[0][2]  # Get the class name from the first row
                writer.writerows(filtered_relations)

    # Save all found classes and relations
    with open(output_hierarchy_file, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["File", "Subject Class URI", "Subject Class Name", "Predicate", "Object Class URI", "Object Class Name"])
        for relation in all_relations:
            writer.writerow(relation)

    print(f"Filtered class data has been saved to {class_output_file}")
    print(f"Filtered class relations have been saved to {relations_output_file}")

    print("\nInitial class names found in the output:")
    for class_name in initial_class_names_to_check:
        normalized_class_name = normalize_string(class_name)
        found = False
        for label in all_found_class_labels:
            if normalized_class_name in label:
                found = True
                print(f"Class '{class_name}' found in:")
                for ontology_file in ontology_files:
                    file_data, file_relations, found_class_labels = load_and_collect_classes_and_relations(ontology_file, initial_class_names_to_check, g)
                    normalized_labels = [normalize_string(l) for l in found_class_labels]
                    if normalized_class_name in normalized_labels:
                        print(f"- {ontology_file}")
                break
        if not found:
            print(f"Class '{class_name}' not found in the output.")

    print(f"\nClass hierarchy has been saved to {output_hierarchy_file}")

    def save_intersection_info_to_csv(data, output_file):
        with open(output_file, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["File", "Class URI", "Class Name", "Intersection Description"])
            writer.writerows(data)

    # Inside your main loop where you process ontology files:
    intersection_data = [row for row in all_data if row[3].startswith("Intersection of")]
    save_intersection_info_to_csv(intersection_data, "intersection_info.csv")


# Example usage
ontology_files = [
    # "../Ontologies/materialsmine.ttl", ### is not complete!
    "../Ontologies/materialsmine_converted.ttl",
    "../Ontologies/pmdco_core.ttl",
    # "../Ontologies/nfdicore_2.ttl",
    # "../Ontologies/bfo.owl", #### using this ---->  long time to proccess!
    "../Ontologies/emmo.ttl",
    # "../Ontologies/owlapi.xrdf",
    # "../Ontologies/schemaorg.owl",
    # "../Ontologies/MaterialsMine.xrdf",
    # '../Ontologies/emmo.owl', ### has problem of reading file
    # "../Ontologies/Physical_Activity_Ontology_V2.owl",
    # "../Ontologies/Physical_Activity_Ontology_V2.xrdf",
    # "../Ontologies/oboe.owl",
    # "../Ontologies/fabio.ttl",
]


initial_class_names_to_check = ["drug" , "stress", 'AmperePerJoule', 'Tensiletest']


# ontology_files = ["../Ontologies/materialsmine_converted.ttl"]
# initial_class_names_to_check = ["Drug"]



output_hierarchy_file = "class_hierarchy.csv"
class_output_file = "ontology_classes.csv"
relations_output_file = "ontology_relations.csv"



In [2]:
process_ontology(ontology_files, initial_class_names_to_check, output_hierarchy_file, class_output_file, relations_output_file)


Filtered class data has been saved to ontology_classes.csv
Filtered class relations have been saved to ontology_relations.csv

Initial class names found in the output:
Class 'drug' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/emmo.ttl
Class 'stress' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/emmo.ttl
Class 'AmperePerJoule' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/emmo.ttl
Class 'Tensiletest' found in:
- ../Ontologies/materialsmine_converted.ttl
- ../Ontologies/pmdco_core.ttl
- ../Ontologies/emmo.ttl

Class hierarchy has been saved to class_hierarchy.csv


In [3]:
#### Above code does not capture class hierarchy correctly #####