In [1]:
import pandas as pds
import rdflib
from rdflib import URIRef, BNode, Literal, Graph, Namespace, RDF, RDFS, OWL
from rdflib.plugins.sparql.processor import SPARQLResult
from SPARQLWrapper import SPARQLWrapper
from pandasql import sqldf
from typing import Optional, List, Any
pysqldf = lambda q: sqldf(q, globals())

## load data

In [2]:
patients = pds.read_csv('../data/patients.csv')
providers = pds.read_csv('../data/providers.csv')
procedures = pds.read_csv('../data/procedures.csv')

test with simple sql query

In [3]:
pysqldf('select * from procedures')

Unnamed: 0,proc_id,proc_date,patient_id,provider_id,tooth_num,proc_code
0,1,2020-01-01,1001,1,1,d2300
1,2,2020-02-01,1002,2,2,d2400
2,3,2020-03-01,1003,3,3,d2500
3,4,2020-04-01,1004,1,4,d2600
4,5,2020-05-01,1005,2,5,d2700
5,6,2020-06-01,1006,3,6,d2800


## helper functions
* add_spo: shortcut for add subject, predicate, object URIRefs to graph
* add_spo: shortcut for add subject, predicate URIRefs and literal value to graph
* sparql_results_to_df: converts sparql results into a dataframe
* sparql_query_to_df: queries the graph and returns the results as a dataframe
* add_table_metadata_to_graph: adds instances of tables and fields to graph
* add_enums_to_graph: adds instances of enumerated values to graph

In [4]:
def add_spo(graph: Graph, subj: Any, predicate: Any, obj: Any) -> Graph:
    graph.add((URIRef(subj), URIRef(predicate), URIRef(obj)))
    return graph

In [5]:
def add_spv(graph: Graph, subj: Any, predicate: Any, val: Any) -> Graph:
    graph.add((URIRef(subj), URIRef(predicate), Literal(val)))
    return graph

In [6]:
def sparql_results_to_df(results: SPARQLResult, graph=Optional[Graph]) -> pds.DataFrame:
    def set_value(x):
        if x is None:
            return None
        elif graph is not None:
            for n in graph.namespaces():
                # each is a tuple of form (<prefix>, URIRef(...))
                # e.g., ('dc', rdflib.term.URIRef('http://purl.org/dc/elements/1.1/'))
                if str(x).startswith(str(n[1])):
                    # replace uri with prefix
                    return str(x).replace(n[1], n[0])
                
            # if it makes it here, no replacements occurred
            return x.toPython()
        else:
            return x.toPython()

    return \
        pds.DataFrame(
            data=([set_value(x) for x in row] for row in results),
            columns=[str(x) for x in results.vars]
        )

In [7]:
def sparql_query_to_df(query: str, graph: Graph, use_ns=True) -> pds.DataFrame:
    results = graph.query(query)
    if use_ns:
        return sparql_results_to_df(results, graph)
    else:
        return sparql_results_to_df(results, None)

In [8]:
def add_table_metadata_to_graph(table: pds.DataFrame, 
                                table_name: str, 
                                graph: Graph, 
                                table_ns: Namespace, 
                                field_ns: Namespace, 
                                property_ns: Namespace) -> Graph:
    # add table instance to graph
    table_uri = table_ns[f'/{table_name}']
    graph = add_spo(graph, table_uri, RDF.type, table_ns)
    graph = add_spv(graph, table_uri, RDFS.label, table_name)
        
    # add each of the tables fields to graph as instances of fields
    for field_name in table.columns:
        field_name = f'{table_name}.{field_name}' # prepend table name to field name
        uri = URIRef(field_ns[f'/{field_name}'])
        graph = add_spo(graph, uri, RDF.type, field_ns)
        grpah = add_spo(graph, uri, property_ns.member_of, table_uri)
        graph = add_spv(graph, uri, RDFS.label, field_name)
    
    return graph

In [9]:
def add_enums_to_graph(enums: List, 
                       table_name: str, 
                       field_name: str, 
                       graph: Graph, 
                       enum_ns: Namespace, 
                       base_ns: Namespace) -> Graph:

    field_name = f'{table_name}.{field_name}' # prepend table name to field name
    for enum in enums:
        # build uri
        uri = URIRef(enum_ns[f'/{field_name}#{enum}'])

        # add instances
        # Note: the literal value is added to the graph as well
        graph = add_spo(graph, uri, RDF.type, enum_ns)
        graph = add_spv(graph, uri, base_ns.has_value, enum)
        graph = add_spv(graph, uri, RDFS.label, f'{field_name} {enum}')

        # enums constrain values in fields, so add this informaton the graph
        field = base_ns[f'/field/{field_name}']
        graph = add_spo(graph, uri, base_ns.defines_values_in, field)
        
    return graph

## load ontology

In [10]:
g = Graph()
g.parse('../ontology/data-field-punning.ttl', format='ttl')
g.bind(":", Namespace("https://data-field-punning.owl/")) # you can also use g.namespace_manager.bind(...)
g.bind("field:", Namespace("https://data-field-punning.owl/field/"))
g.bind("table:", Namespace("https://data-field-punning.owl/table/"))
g.bind("enum:", Namespace("https://data-field-punning.owl/enumerated_value/"))

add some namespaces to use as shortcuts

In [11]:
ns = Namespace("https://data-field-punning.owl/")
field_ns = Namespace(ns.field)
table_ns = Namespace(ns.table)
enum_ns = Namespace(ns.enumerated_value)

test simple sparql query

In [12]:
q = """
select ?cls ?cls_label where {
  ?cls a owl:Class
  optional {?cls rdfs:label ?cls_label}
}
"""
sparql_query_to_df(q, g).head() # note: I only display the first 5 results

Unnamed: 0,cls,cls_label
0,:canine,canine
1,:crown_restoration,crown restoration
2,:data_value,data value
3,:dentist,dentist
4,:entity,


## add table and field instances to graph

In [13]:
g = add_table_metadata_to_graph(patients, 'patients', g, table_ns, field_ns, ns)
g = add_table_metadata_to_graph(providers, 'providers', g, table_ns, field_ns, ns)
g = add_table_metadata_to_graph(procedures, 'procedures', g, table_ns, field_ns, ns)

query to check that instaces where added

In [14]:
q = """
prefix : <https://data-field-punning.owl/>
select ?field ?type ?field_name ?table ?table_name where {
  ?field a :field;
         rdfs:label ?field_name;
         rdf:type ?type;
         :member_of ?table .
  ?table rdfs:label ?table_name .
}
"""
sparql_query_to_df(q, g)

Unnamed: 0,field,type,field_name,table,table_name
0,:field/patients.patient_id,:field,patients.patient_id,:table/patients,patients
1,:field/patients.name,:field,patients.name,:table/patients,patients
2,:field/patients.gender,:field,patients.gender,:table/patients,patients
3,:field/patients.dob,:field,patients.dob,:table/patients,patients
4,:field/providers.provider_id,:field,providers.provider_id,:table/providers,providers
5,:field/providers.name,:field,providers.name,:table/providers,providers
6,:field/procedures.proc_id,:field,procedures.proc_id,:table/procedures,procedures
7,:field/procedures.proc_date,:field,procedures.proc_date,:table/procedures,procedures
8,:field/procedures.patient_id,:field,procedures.patient_id,:table/procedures,procedures
9,:field/procedures.provider_id,:field,procedures.provider_id,:table/procedures,procedures


## add enumerated values
The values in `patients.gender` and `procedures.proc_code` are enums. Let's add them to ontology shema.  
Note: For demonstration purposes, I've made enums url safe. In a real-world scenario, the enums would need to be url encoded.

In [15]:
genders = list(pysqldf("select distinct gender from patients")['gender'])
proc_codes = list(pysqldf("select distinct proc_code from procedures")['proc_code'])

In [16]:
g = add_enums_to_graph(genders, 'patients', 'gender', g, enum_ns, ns)
g = add_enums_to_graph(proc_codes, 'procedures', 'proc_code', g, enum_ns, ns)

query the enums added to graph

In [17]:
q = """
prefix : <https://data-field-punning.owl/>
select ?enum ?label ?value ?defines where {
  ?enum a :enumerated_value;
    rdfs:label ?label;
    :has_value ?value;
    :defines_values_in ?defines .
}
"""
sparql_query_to_df(q, g)

Unnamed: 0,enum,label,value,defines
0,:enumerated_value/patients.gender#M,patients.gender M,M,:/field/patients.gender
1,:enumerated_value/patients.gender#F,patients.gender F,F,:/field/patients.gender
2,:enumerated_value/procedures.proc_code#d2300,procedures.proc_code d2300,d2300,:/field/procedures.proc_code
3,:enumerated_value/procedures.proc_code#d2400,procedures.proc_code d2400,d2400,:/field/procedures.proc_code
4,:enumerated_value/procedures.proc_code#d2500,procedures.proc_code d2500,d2500,:/field/procedures.proc_code
5,:enumerated_value/procedures.proc_code#d2600,procedures.proc_code d2600,d2600,:/field/procedures.proc_code
6,:enumerated_value/procedures.proc_code#d2700,procedures.proc_code d2700,d2700,:/field/procedures.proc_code
7,:enumerated_value/procedures.proc_code#d2800,procedures.proc_code d2800,d2800,:/field/procedures.proc_code


## add what the data represents
The data in the tables represent things in the world. We need to connect the data to their representations.  
I created a simple mapping between IRIs and the classes represented by the them. This could also be done using a `robot` template or `SSSOM` mapping file.  
Some of mappings are at the field level. For example, the patient_id field represents a patient in general. Other mappings are a the level of enumaterated values. For example, the value "F" in the patient.gender field represents a female.  
**Note**: This mapping involves punning the classes as inviduals b/c the represents object property holds between individuals.

In [18]:
for idx, iri, entity in pds.read_csv('../data/data_representations.csv').itertuples():
    uri = URIRef(iri)
    entity_uri = URIRef(entity)
    g.add((uri, ns.represents, entity_uri))

In [19]:
q = """
prefix : <https://data-field-punning.owl/>
select ?uri ?label ?represents where {
    ?uri :represents ?represents .
    optional {
      ?uri rdfs:label ?label
    }
}
"""
sparql_query_to_df(q, g)

Unnamed: 0,uri,label,represents
0,:field/patients.patient_id,patients.patient_id,:patient
1,:field/procedures.patient_id,procedures.patient_id,:patient
2,:field/providers.provider_id,providers.provider_id,:dentist
3,:field/procedures.provider_id,procedures.provider_id,:dentist
4,:field/procedures.proc_id,procedures.proc_id,:procedure
5,:field/procedures.proc_code,procedures.proc_code,:procedure
6,:field/procedures.tooth_num,procedures.tooth_num,:tooth
7,:enumerated_value/patients.gender#M,patients.gender M,:male_patient
8,:enumerated_value/patients.gender#F,patients.gender F,:female_patient
9,:enumerated_value/procedures.proc_code#d2300,procedures.proc_code d2300,:crown_restoration


In [20]:
q = """
select patients.patient_id as Person, 'patient_id' as field_name from patients
union
select providers.provider_id as Person, 'provider_id' as field_name from providers
"""
pysqldf(q)

Unnamed: 0,Person,field_name
0,1,provider_id
1,2,provider_id
2,3,provider_id
3,1001,patient_id
4,1002,patient_id
5,1003,patient_id
6,1004,patient_id
7,1005,patient_id
8,1006,patient_id


In [28]:
q = """
prefix : <https://data-field-punning.owl/>
select distinct ?field_name ?cls_name where {
    ?cls rdfs:subClassOf :person;
         rdfs:label ?cls_label .
         
    ?field a :field;
        :represents ?cls;
        rdfs:label ?field_name .
    bind(replace(?cls_label, " ", "_") as ?cls_name)
}
"""
sparql_query_to_df(q, g)

Unnamed: 0,field_name,cls_name
0,providers.provider_id,dentist
1,procedures.provider_id,dentist
2,patients.patient_id,patient
3,procedures.patient_id,patient


In [23]:
list(g.namespaces())

[('brick', rdflib.term.URIRef('https://brickschema.org/schema/Brick#')),
 ('csvw', rdflib.term.URIRef('http://www.w3.org/ns/csvw#')),
 ('dc', rdflib.term.URIRef('http://purl.org/dc/elements/1.1/')),
 ('dcat', rdflib.term.URIRef('http://www.w3.org/ns/dcat#')),
 ('dcmitype', rdflib.term.URIRef('http://purl.org/dc/dcmitype/')),
 ('dcterms', rdflib.term.URIRef('http://purl.org/dc/terms/')),
 ('dcam', rdflib.term.URIRef('http://purl.org/dc/dcam/')),
 ('doap', rdflib.term.URIRef('http://usefulinc.com/ns/doap#')),
 ('foaf', rdflib.term.URIRef('http://xmlns.com/foaf/0.1/')),
 ('odrl', rdflib.term.URIRef('http://www.w3.org/ns/odrl/2/')),
 ('org', rdflib.term.URIRef('http://www.w3.org/ns/org#')),
 ('owl', rdflib.term.URIRef('http://www.w3.org/2002/07/owl#')),
 ('prof', rdflib.term.URIRef('http://www.w3.org/ns/dx/prof/')),
 ('prov', rdflib.term.URIRef('http://www.w3.org/ns/prov#')),
 ('qb', rdflib.term.URIRef('http://purl.org/linked-data/cube#')),
 ('rdf', rdflib.term.URIRef('http://www.w3.org/19