In [1]:
import pandas as pds
import rdflib
import itertools
from rdflib import URIRef, BNode, Literal, Graph, Namespace, RDF, RDFS, OWL
from rdflib.plugins.sparql.processor import SPARQLResult
from SPARQLWrapper import SPARQLWrapper
from pandasql import sqldf
from typing import Optional, List, Any
pysqldf = lambda q: sqldf(q, globals())

## load data

In [2]:
patients = pds.read_csv('../data/patients.csv')
providers = pds.read_csv('../data/providers.csv')
procedures = pds.read_csv('../data/procedures.csv')

test with simple sql query

In [3]:
pysqldf('select * from procedures')

Unnamed: 0,proc_id,proc_date,patient_id,provider_id,tooth_num,proc_code
0,1,2020-01-01,1001,1,1,d2300
1,2,2020-02-01,1002,2,2,d2400
2,3,2020-03-01,1003,3,3,d2500
3,4,2020-04-01,1004,1,4,d2600
4,5,2020-05-01,1005,2,5,d2700
5,6,2020-06-01,1006,3,6,d2800


## helper functions
* **add_spo**: shortcut for adding subject, predicate, object URIRefs to graph
* **add_spo**: shortcut for adding subject, predicate URIRefs and literal value to graph
* **sparql_results_to_df**: converts sparql results into a dataframe
* **sparql_query_to_df**: queries the graph and returns the results as a dataframe
* **add_table_metadata_to_graph**: adds instances of tables and fields to graph
* **add_enums_to_graph**: adds instances of enumerated values to graph
* **dataframe_to_sql**: creates an sql query based on values in a dataframe (with specific columns)
* **add_dataframe_to_graph**: adds a simple rdf transformation of a dataframe into the graph

In [4]:
def add_spo(graph: Graph, subj: Any, predicate: Any, obj: Any) -> Graph:
    """shortcut for adding subject, predicate, object URIRefs to graph"""
    
    graph.add((URIRef(subj), URIRef(predicate), URIRef(obj)))
    return graph

In [5]:
def add_spv(graph: Graph, subj: Any, predicate: Any, val: Any) -> Graph:
    """shortcut for adding subject, predicate URIRefs and literal value to graph"""
    
    graph.add((URIRef(subj), URIRef(predicate), Literal(val)))
    return graph

In [6]:
def sparql_results_to_df(results: SPARQLResult, graph=Optional[Graph]) -> pds.DataFrame:
    """converts sparql results into a dataframe"""
    
    def set_value(x):
        if x is None:
            return None
        elif graph is not None:
            for n in graph.namespaces():
                # each is a tuple of form (<prefix>, URIRef(...))
                # e.g., ('dc', rdflib.term.URIRef('http://purl.org/dc/elements/1.1/'))
                if str(x).startswith(str(n[1])):
                    # replace uri with prefix
                    return str(x).replace(n[1], n[0])
                
            # if it makes it here, no replacements occurred
            return x.toPython()
        else:
            return x.toPython()

    return \
        pds.DataFrame(
            data=([set_value(x) for x in row] for row in results),
            columns=[str(x) for x in results.vars]
        )

In [7]:
def sparql_query_to_df(query: str, graph: Graph, use_ns=True) -> pds.DataFrame:
    """queries the graph and returns the results as a dataframe"""
    
    results = graph.query(query)
    if use_ns:
        return sparql_results_to_df(results, graph)
    else:
        return sparql_results_to_df(results, None)

In [8]:
def add_table_metadata_to_graph(table: pds.DataFrame, 
                                table_name: str, 
                                graph: Graph, 
                                table_ns: Namespace, 
                                field_ns: Namespace, 
                                property_ns: Namespace) -> Graph:
    """adds instances of tables and fields to graph"""
    
    # add table instance to graph
    table_uri = table_ns[f'/{table_name}']
    graph = add_spo(graph, table_uri, RDF.type, table_ns)
    graph = add_spv(graph, table_uri, RDFS.label, table_name)
        
    # add each of the tables fields to graph as instances and subclass of fields
    for field_name in table.columns:
        field_name = f'{table_name}.{field_name}' # prepend table name to field name
        uri = URIRef(field_ns[f'/{field_name}'])
        graph = add_spo(graph, uri, RDF.type, field_ns)
        grpah = add_spo(graph, uri, property_ns.member_of, table_uri)
        graph = add_spv(graph, uri, RDFS.label, field_name)
        
        # *pun* the field as an owl class
        # note: field_ns[:-1] removes the last "/" from the uri
        graph = add_spo(graph, uri, RDF.type, OWL.Class)
        graph = add_spo(graph, uri, RDFS.subClassOf, field_ns[:-1])
    
    return graph

In [9]:
def add_enums_to_graph(enums: List, 
                       table_name: str, 
                       field_name: str, 
                       graph: Graph, 
                       enum_ns: Namespace, 
                       base_ns: Namespace) -> Graph:
    """adds instances of enumerated values to graph"""
    
    field_name = f'{table_name}.{field_name}' # prepend table name to field name
    for enum in enums:
        # build uri
        uri = URIRef(enum_ns[f'/{field_name}#{enum}'])

        # add instances
        # Note: the literal value is added to the graph as well
        graph = add_spo(graph, uri, RDF.type, enum_ns)
        graph = add_spv(graph, uri, base_ns.has_value, enum)
        graph = add_spv(graph, uri, RDFS.label, f'{field_name} {enum}')

        # enums constrain values in fields, so add this informaton the graph
        field = base_ns[f'field/{field_name}']
        graph = add_spo(graph, uri, base_ns.defines_values_in, field)
        
    return graph

In [10]:
def dataframe_to_sql(df: pds.DataFrame) -> str:
    """creates an sql query based on values in a dataframe (with specific columns)"""
    # N.B.: This need to be refactored!!!
    
    # dict to hold info about the table and class the field represents
    # note: check for enum_value column
    if 'enum_value' in df.columns:
        columns = ['field_name', 'table_name', 'cls_names', 'enum_value']
    else:
        columns = ['field_name', 'table_name', 'cls_names']
        
    field_info_dict = \
        df[columns].drop_duplicates().set_index('field_name').to_dict(orient='index')
    
    
    # gather all table and field names in dataframe
    all_tables = set(field_df['table_name'])
    all_fields = set(field_df['field_name'])
    
    # dict to hold whether the table occurs in a 'from' clause or 'join' clause
    table_clause_dict = {'from': '', 'joins': []}
    for i, tbl in enumerate(all_tables):
        if i == 0:
            table_clause_dict['from'] = tbl
        else:
            table_clause_dict['joins'].append(tbl)
    
    # dict to hold just those tables that occur in 'join' clause
    # this dict just help build the sql query
    table_join_dict = \
        {tbl:{'joins': []} for tbl in all_tables if tbl != table_clause_dict['from']}
    
    # iterate over each class in the dataframe
    # b/c each field is part of the same group we know the reprenst the same type of thing
    # so, we can relate the fields in the join clause
    for group_name, group_df in df.groupby('cls_names'):
        # all fields in the same group represent the same type of thing
        # find the pairwise combinations of how each field relates to one another
        field_pairs = \
            list(itertools.combinations(group_df['field_name'], 2))
        
        # now for each pair add it to the appropriate join table
        for left_field, right_field in field_pairs:
            # fetch table names of fields
            left_table = field_info_dict[left_field]['table_name']
            right_table = field_info_dict[right_field]['table_name']
            
            # add pair as a join condition (e.g. patients.patient_id = procedures.patient_id)
            if left_table != table_clause_dict['from']:
                table_join_dict[left_table]['joins'].append((left_field, right_field))
            elif right_table != table_clause_dict['from']:
                table_join_dict[right_table]['joins'].append((left_field, right_field))

    # finally, build sql query
    sql = ""
    
    # SELECT ...
    # form the select list for the sql
    select_fields = \
        [f"  {field} as [{field} ({value['cls_names']})]\n" 
        for field, value in field_info_dict.items()]
    
    select_fields = []
    for field, value in field_info_dict.items():
        # the "as [{field}" is needed to guarantee that the whole
        # field name (i.e., <table>.<field name>) is used in the results
        if 'cls_names' in value.keys():
            select_fields.append(f"  {field} as [{field} ({value['cls_names']})] \n")
        else:
            select_fields.append(f"  {field} as [{field}]\n)")
                    
    sql = sql + f"select \n{'  ,'.join(select_fields)} \n"
    
    # FROM ... JOIN ...
    # add tables that data is retrieved from (note: there must be a 'from' clause)
    if len(table_clause_dict['from']) > 0:
        sql = sql + f"from {table_clause_dict['from']} \n"
        
        #  collect joined tables
        for tbl in table_clause_dict['joins']:
            join_fields = table_join_dict[tbl]['joins']
            if len(join_fields) > 0:
                sql = sql + f"inner join {tbl} on \n"
                
                # put an '=' between each pair of fields
                field_pairs = [f"{' = '.join(field_pair)}\n" for field_pair in join_fields]
                sql = sql + f"  {'  and '.join(field_pairs)}"
    
    # WHERE ...
    # check if enum value used for filter
    if 'enum_value' in df.columns:
        # 
        print_where = True
        for idx, field_name, enum_value in df[['field_name', 'enum_value']].itertuples():
            if print_where and enum_value is not None and len(enum_value) > 0:
                sql = sql + f"where {field_name} = '{enum_value}' \n"
                print_where = False
            elif enum_value is not None and len(enum_value) > 0:
                sql = sql + f"and {field_name} = '{enum_value}' \n"
                
    # return sql query
    return sql


In [11]:
def add_dataframe_to_graph(graph: Graph, df: pds.DataFrame, table_name, field_ns: Namespace, base_ns: Namespace) -> Graph:
    """adds a simple rdf transformation of a dataframe into the graph"""
    
    for row in df.itertuples(index=None):
        # add row as blank node to table
        row_uri = BNode() 
        table_uri = base_ns[f'table/{table_name}']
        graph = add_spo(graph, row_uri, RDF.type, base_ns.row)
        graph = add_spo(graph, row_uri, base_ns.member_of, table_uri)
        
        # for each field in row
        for field_name, value in row._asdict().items():
            # add instance of field value using blank node
            field_value_uri = BNode()
            graph = add_spo(graph, field_value_uri, RDF.type, base_ns.field_value)
            graph = add_spv(graph, field_value_uri, base_ns.has_value, value)
                        
            # relate row to field value using *punned* field name
            field_uri = field_ns[f'/{table_name}.{field_name}']
            graph = add_spo(graph, field_uri, RDF.type, OWL.ObjectProperty)
            graph = add_spo(graph, row_uri, field_uri, field_value_uri)
            
            # relate field value to field and field to row via 'member of'
            field_instance_uri = BNode()
            graph = add_spo(graph, field_instance_uri, RDF.type, field_uri)
            graph = add_spo(graph, field_instance_uri, base_ns.member_of, row_uri)
            graph = add_spo(graph, field_value_uri, base_ns.member_of, field_instance_uri)


            
    return graph

## load ontology

In [12]:
g = Graph()
g.parse('../ontology/data-field-punning.ttl', format='ttl')
g.bind(":", Namespace("https://data-field-punning.owl/")) # you can also use g.namespace_manager.bind(...)
g.bind("field:", Namespace("https://data-field-punning.owl/field/"))
g.bind("table:", Namespace("https://data-field-punning.owl/table/"))
g.bind("enum:", Namespace("https://data-field-punning.owl/enumerated_value/"))

add some namespaces to use as shortcuts

In [13]:
ns = Namespace("https://data-field-punning.owl/")
field_ns = Namespace(ns.field)
table_ns = Namespace(ns.table)
enum_ns = Namespace(ns.enumerated_value)

test simple sparql query

In [14]:
q = """
select ?cls ?cls_label where {
  ?cls a owl:Class
  optional {?cls rdfs:label ?cls_label}
}
"""
sparql_query_to_df(q, g).head() # note: I only display the first 5 results

Unnamed: 0,cls,cls_label
0,:canine,canine
1,:crown_restoration,crown restoration
2,:data_value,data value
3,:dentist,dentist
4,:entity,


## add table and field instances to graph

In [15]:
g = add_table_metadata_to_graph(patients, 'patients', g, table_ns, field_ns, ns)
g = add_table_metadata_to_graph(providers, 'providers', g, table_ns, field_ns, ns)
g = add_table_metadata_to_graph(procedures, 'procedures', g, table_ns, field_ns, ns)

query to check that instaces where added

In [16]:
q = """
prefix : <https://data-field-punning.owl/>
select ?field ?type ?field_name ?table ?table_name where {
  ?field a :field;
         rdfs:label ?field_name;
         rdf:type ?type;
         :member_of ?table .
  ?table rdfs:label ?table_name .
}
"""
sparql_query_to_df(q, g)

Unnamed: 0,field,type,field_name,table,table_name
0,:field/patients.patient_id,:field,patients.patient_id,:table/patients,patients
1,:field/patients.patient_id,owlClass,patients.patient_id,:table/patients,patients
2,:field/patients.name,:field,patients.name,:table/patients,patients
3,:field/patients.name,owlClass,patients.name,:table/patients,patients
4,:field/patients.gender,:field,patients.gender,:table/patients,patients
5,:field/patients.gender,owlClass,patients.gender,:table/patients,patients
6,:field/patients.dob,:field,patients.dob,:table/patients,patients
7,:field/patients.dob,owlClass,patients.dob,:table/patients,patients
8,:field/patients.provider_id,:field,patients.provider_id,:table/patients,patients
9,:field/patients.provider_id,owlClass,patients.provider_id,:table/patients,patients


## add enumerated values
The values in `patients.gender` and `procedures.proc_code` are enums. Let's add them to ontology shema.  
Note: For demonstration purposes, I've made enums url safe. In a real-world scenario, the enums would need to be url encoded.

In [17]:
genders = list(pysqldf("select distinct gender from patients")['gender'])
proc_codes = list(pysqldf("select distinct proc_code from procedures")['proc_code'])

In [18]:
g = add_enums_to_graph(genders, 'patients', 'gender', g, enum_ns, ns)
g = add_enums_to_graph(proc_codes, 'procedures', 'proc_code', g, enum_ns, ns)

query the enums added to graph

In [19]:
q = """
prefix : <https://data-field-punning.owl/>
select ?enum ?label ?value ?defines where {
  ?enum a :enumerated_value;
    rdfs:label ?label;
    :has_value ?value;
    :defines_values_in ?defines .
}
"""
sparql_query_to_df(q, g)

Unnamed: 0,enum,label,value,defines
0,:enumerated_value/patients.gender#M,patients.gender M,M,:field/patients.gender
1,:enumerated_value/patients.gender#F,patients.gender F,F,:field/patients.gender
2,:enumerated_value/procedures.proc_code#d2300,procedures.proc_code d2300,d2300,:field/procedures.proc_code
3,:enumerated_value/procedures.proc_code#d2400,procedures.proc_code d2400,d2400,:field/procedures.proc_code
4,:enumerated_value/procedures.proc_code#d2500,procedures.proc_code d2500,d2500,:field/procedures.proc_code
5,:enumerated_value/procedures.proc_code#d2600,procedures.proc_code d2600,d2600,:field/procedures.proc_code
6,:enumerated_value/procedures.proc_code#d2700,procedures.proc_code d2700,d2700,:field/procedures.proc_code
7,:enumerated_value/procedures.proc_code#d2800,procedures.proc_code d2800,d2800,:field/procedures.proc_code


## add what the data represents
The data in the tables represent things in the world. We need to connect the data to their representations.  
I created a simple mapping between IRIs and the classes represented by the them. This could also be done using a `robot` template or `SSSOM` mapping file.  
Some of mappings are at the field level. For example, the patient_id field represents a patient in general. Other mappings are a the level of enumaterated values. For example, the value "F" in the patient.gender field represents a female.  
**Note**: This mapping involves punning the classes as inviduals b/c the represents object property holds between individuals.

In [20]:
for idx, iri, entity in pds.read_csv('../data/data_representations.csv').itertuples():
    uri = URIRef(iri)
    entity_uri = URIRef(entity)
    g.add((uri, ns.represents, entity_uri))

In [21]:
q = """
prefix : <https://data-field-punning.owl/>
select ?uri ?label ?represents where {
    ?uri :represents ?represents .
    optional {
      ?uri rdfs:label ?label
    }
}
"""
sparql_query_to_df(q, g)

Unnamed: 0,uri,label,represents
0,:field/patients.patient_id,patients.patient_id,:patient
1,:field/procedures.patient_id,procedures.patient_id,:patient
2,:field/patients.provider_id,patients.provider_id,:dentist
3,:field/providers.provider_id,providers.provider_id,:dentist
4,:field/procedures.provider_id,procedures.provider_id,:dentist
5,:field/procedures.proc_id,procedures.proc_id,:procedure
6,:field/procedures.proc_code,procedures.proc_code,:procedure
7,:field/procedures.tooth_num,procedures.tooth_num,:tooth
8,:enumerated_value/patients.gender#M,patients.gender M,:male_patient
9,:enumerated_value/patients.gender#F,patients.gender F,:female_patient


## use representations to form sql queries

Find every field name that represents a `person`.  
**note**: This finds all fields that represent a subclass of `person`.

In [22]:
q = """
prefix : <https://data-field-punning.owl/>
select distinct ?table_name ?field_name (group_concat(?cls_name) as ?cls_names) where {
    ?cls rdfs:subClassOf :person;
         rdfs:label ?cls_label .
         
    ?field a :field;
        :represents ?cls;
        rdfs:label ?field_name;
        :member_of [a :table; rdfs:label ?table_name] .
        
    bind(replace(?cls_label, " ", "_") as ?cls_name)
}
group by ?table_name ?field_name
order by ?field_name
"""
field_df = sparql_query_to_df(q, g)
field_df

Unnamed: 0,table_name,field_name,cls_names
0,patients,patients.patient_id,patient
1,patients,patients.provider_id,dentist
2,procedures,procedures.patient_id,patient
3,procedures,procedures.provider_id,dentist
4,providers,providers.provider_id,dentist


### build SQL query

In [23]:
q = dataframe_to_sql(field_df)
print(q)

select 
  patients.patient_id as [patients.patient_id (patient)] 
  ,  patients.provider_id as [patients.provider_id (dentist)] 
  ,  procedures.patient_id as [procedures.patient_id (patient)] 
  ,  procedures.provider_id as [procedures.provider_id (dentist)] 
  ,  providers.provider_id as [providers.provider_id (dentist)] 
 
from providers 
inner join procedures on 
  procedures.provider_id = providers.provider_id
inner join patients on 
  patients.provider_id = procedures.provider_id
  and patients.provider_id = providers.provider_id
  and patients.patient_id = procedures.patient_id



In [24]:
sqldf(q)

Unnamed: 0,patients.patient_id (patient),patients.provider_id (dentist),procedures.patient_id (patient),procedures.provider_id (dentist),providers.provider_id (dentist)
0,1001,1,1001,1,1
1,1004,1,1004,1,1
2,1002,2,1002,2,2
3,1005,2,1005,2,2
4,1003,3,1003,3,3
5,1006,3,1006,3,3


## use enumerated values to filter data

Final procedures that were rooth canals.  

In [25]:
q = """
prefix : <https://data-field-punning.owl/> 
select distinct ?table_name ?field_name ?cls_names ?enum_value
where {

    ?field a :field;
        :represents ?cls;
        rdfs:label ?field_name;
        :member_of [a :table; rdfs:label ?table_name] . 
    
    optional {
    ?enum a :enumerated_value;
        :has_value ?enum_value;
        :defines_values_in ?field;
        :represents :root_canal .
    }
    
    ?cls rdfs:label ?cls_names .
    filter(?cls = :procedure || ?cls = :patient)
}
"""
field_df = sparql_query_to_df(q, g)
field_df

Unnamed: 0,table_name,field_name,cls_names,enum_value
0,patients,patients.patient_id,patient,
1,procedures,procedures.proc_id,procedure,
2,procedures,procedures.patient_id,patient,
3,procedures,procedures.proc_code,procedure,d2600


## build sql query

In [26]:
q = dataframe_to_sql(field_df)
print(q)

select 
  patients.patient_id as [patients.patient_id (patient)] 
  ,  procedures.proc_id as [procedures.proc_id (procedure)] 
  ,  procedures.patient_id as [procedures.patient_id (patient)] 
  ,  procedures.proc_code as [procedures.proc_code (procedure)] 
 
from procedures 
inner join patients on 
  patients.patient_id = procedures.patient_id
where procedures.proc_code = 'd2600' 



In [27]:
sqldf(q)

Unnamed: 0,patients.patient_id (patient),procedures.proc_id (procedure),procedures.patient_id (patient),procedures.proc_code (procedure)
0,1004,4,1004,d2600


## add relations between fields
By adding relations between the fields, we can query for how entities represented by the data in the fields are related.  
For demonstration purposes, the relations are added to the graph directly. However, this information can also be in an external table.  
**To Do**: Write code to turn results into a a sql query.

In [28]:
g = add_spo(g, field_ns['/procedures.tooth_num'], ns['part_of'], field_ns['/procedures.patient_id'])
g = add_spo(g, field_ns['/procedures.tooth_num'], ns['participates_in'], field_ns['/procedures.proc_code'])

Find the fields whose data reprents the entities that a `tooth` is `part of` or `participates in`.  
**note**: The query searches for the field that *represents* a `tooth`, not the field itself.

In [29]:
q = """
prefix : <https://data-field-punning.owl/>
select distinct ?subj_field ?predicate ?obj_field where {
  ?subj 
      rdfs:label ?subj_field;
      :represents :tooth .
      
  {
    bind(:part_of as ?pred)
    ?pred rdfs:label ?predicate .
    
    ?subj ?pred ?obj .
    ?obj rdfs:label ?obj_field .
  } union {
    bind(:participates_in as ?pred)
    ?pred rdfs:label ?predicate .
    
    ?subj ?pred ?obj .
    ?obj rdfs:label ?obj_field .
  }
    
}
"""
# field_df = sparql_query_to_df(q, g)
# field_df
sparql_query_to_df(q, g)

Unnamed: 0,subj_field,predicate,obj_field
0,procedures.tooth_num,participates in,procedures.proc_code
1,procedures.tooth_num,part of,procedures.patient_id


## create a simple translation of the data into RDF

In [30]:
g = add_dataframe_to_graph(g, patients, 'patients', field_ns, ns)
g = add_dataframe_to_graph(g, procedures, 'procedures', field_ns, ns)
g = add_dataframe_to_graph(g, providers, 'providers', field_ns, ns)

Query to see if insantace data was added using both:
* punned field names
* instances of field values that members of field instances

In [31]:
q = """
prefix : <https://data-field-punning.owl/>
select ?row ?field_name ?value where {
  ?field a :field;
      rdfs:label ?field_name .
      
  ?row a :row; 
      ?field [:has_value ?value] .
} limit 5
"""
sparql_query_to_df(q, g)

Unnamed: 0,row,field_name,value
0,N57818d1f022c409fbd785c37085af39f,patients.patient_id,1001
1,N44df6a0e15104ea4b87727242bf99d77,patients.patient_id,1002
2,Ndbd4ffd8e872404b9d2b541d64732712,patients.patient_id,1003
3,N4bf5093582d4479ba03db9ea43eeb35c,patients.patient_id,1004
4,Nccf54dff4e2f4a81868b5751c93add19,patients.patient_id,1005


In [32]:
q = """
prefix : <https://data-field-punning.owl/>
select ?row ?field_name ?value where {
  ?row a :row .
  
  ?field rdfs:subClassOf :field;
      rdfs:label ?field_name .
      
  ?field_i a ?field;
      :member_of ?row.
      
  ?field_value_i a :field_value;
      :member_of ?field_i;
      :has_value ?value .


} limit 5
"""
sparql_query_to_df(q, g)

Unnamed: 0,row,field_name,value
0,N57818d1f022c409fbd785c37085af39f,patients.patient_id,1001
1,N44df6a0e15104ea4b87727242bf99d77,patients.patient_id,1002
2,Ndbd4ffd8e872404b9d2b541d64732712,patients.patient_id,1003
3,N4bf5093582d4479ba03db9ea43eeb35c,patients.patient_id,1004
4,Nccf54dff4e2f4a81868b5751c93add19,patients.patient_id,1005


## query for teeth are are part of patient
Above we related the `tooth_num` and `patient_id` fields like so:
* `procedures.tooth_num` `part of` `procedures.patient_id`

In [67]:
q = """
base <https://data-field-punning.owl/>
prefix : <https://data-field-punning.owl/>
select ?patient ?tooth where {  
  ?tooth_field_uri :represents :tooth .
  ?patient_field_uri :represents :patient .
  
  ?tooth_field_uri
      rdf:type :field;
      :part_of ?patient_field_uri .
  
  ?row a :row;
      ?tooth_field_uri [:has_value ?tooth];
      ?patient_field_uri [:has_value ?patient] .
        
} limit 5
"""
sparql_query_to_df(q, g)

Unnamed: 0,patient,tooth
0,1001,1
1,1002,2
2,1003,3
3,1004,4
4,1005,5


## legacy --------------------------------

In [34]:
def match_joins(table_names, table_join_dict, table_field_pairs):
    if 0 == len(table_names):
        return table_join_dict
    
    table_name = table_names[0]
    print(f'\n--- {table_name} ---\n')
    joins = set()
    for left, right, in table_field_pairs:
        field_pair = (left[1], right[1])
        if table_name in (left[0], right[0]):
            # joins.add(field_pair)
            # table_join_dict[table_name]['joins'].add(field_pair)
            table_join_dict[table_name]['joins'].add((left, right))
            joins.add((left, right))
            
        # if left[0] == table_name:
        #     table_join_dict[table_name]['joins'].add(field_pair)
        #     # table_field_pairs.remove((left, right))
        #     print('pairs', table_field_pairs)
        #     print('left', field_pair)
        # elif right[0] == table_name:
        #     table_join_dict[table_name]['joins'].add(field_pair)
        #     # table_field_pairs.remove((left, right))
        #     print('pairs', table_field_pairs)
        #     print('right', field_pair)
        # else:
        #     print('pairs', table_field_pairs)
        #     print('no match', field_pair)
    print('joins', joins)
    for j in joins:
        table_field_pairs.remove(j)
        
    return match_joins(table_names[1:], table_join_dict, table_field_pairs)