In [21]:
import yaml
import rdflib
import pandas as pd
import re

In [14]:
# Read the YAML mapping file; later cells use its 'namespaces' and 'mappings' entries.
with open("map.yml", mode="r") as f:
    raw_yaml = f.read()

data = yaml.safe_load(raw_yaml)

In [15]:
# Graph that collects all triples, plus a prefix -> rdflib.Namespace lookup
# used when building predicates from "<prefix>.<term>" mapping keys.
g = rdflib.Graph()
ns_lookup = {}

for entry in data['namespaces']:
    prefix, uri = entry['name'], entry['uri']
    ns_lookup[prefix] = rdflib.Namespace(uri)
    # Register the prefix on the graph's namespace manager as well.
    g.namespace_manager.bind(prefix=prefix, namespace=uri)


In [50]:
def _looks_like_uri(cell):
    """Return True when a CSV cell value is a string that starts with 'http'."""
    return isinstance(cell, str) and cell.startswith('http')


def csv_to_graph(mappings, filename, graph):
    """Add triples to ``graph`` for every row of a CSV file.

    Parameters
    ----------
    mappings : dict or None
        Maps ``"<prefix>.<term>"`` keys to a CSV column name or a list of
        column names. The optional ``'uriref'`` entry names the column whose
        value becomes the subject URI of each row; without it every row gets
        a fresh blank node. Keys containing ``'uriref'`` are never treated as
        predicates. A falsy ``mappings`` makes the call a no-op.
    filename : str
        Path of the CSV file, read with ``pd.read_csv``.
    graph
        Object whose ``add((subject, predicate, object))`` method receives
        each triple (an ``rdflib.Graph`` in this notebook).

    Notes
    -----
    The prefix of each key is resolved through the module-level
    ``ns_lookup``. Cells that pandas reports as missing are skipped. A cell
    whose *value* starts with ``http`` is stored as a ``URIRef``; anything
    else is stored as a ``Literal``.
    """
    if not mappings:
        return

    # Loop-invariant: the column (if any) that holds each row's subject URI.
    subject_column = mappings.get('uriref')

    for row in pd.read_csv(filename).to_dict(orient="records"):
        if subject_column:
            identifier = rdflib.URIRef(row.get(subject_column))
        else:
            identifier = rdflib.BNode()

        for key, value in mappings.items():
            if 'uriref' in key:
                continue
            ns, ns_attr = key.split(".")
            # A mapping value may be a single column name or a list of them.
            columns = value if isinstance(value, list) else [value]
            for column in columns:
                cell = row[column]
                if pd.isna(cell):
                    continue
                predicate = ns_lookup[ns][ns_attr]
                # Decide URI vs. literal from the cell's content, not from
                # the column name.
                if _looks_like_uri(cell):
                    graph.add((identifier, predicate, rdflib.URIRef(cell)))
                else:
                    graph.add((identifier, predicate, rdflib.Literal(cell)))

# Convert every CSV named in the mapping file: entry "<name>" -> data/<name>.csv.
for data_file, mappings in data['mappings'].items():
    filename = f"data/{data_file}.csv"
    csv_to_graph(mappings, filename, g)


In [46]:
# Write the accumulated graph to output/triple.txt using the 'ntriples' serializer.
# NOTE(review): assumes the output/ directory already exists — confirm.
g.serialize(destination='output/triple.txt', format='ntriples')

In [47]:
# Proof of concept for the URI test: check whether a string starts with 'http'.
uri = 'http://someurl.com'
x = re.search('^http', uri)
if x is not None:
    # Print the value that was tested; the name `string` is not defined in this cell.
    print(uri)
else:
    print('That is not a URI')

http://someurl.com


In [49]:
# Counter-example for the URI test: a string that does not start with 'http'.
not_uri = 'I am Sam. Sam I am.'
x = re.search('^http', not_uri)
if x is not None:
    # Print the value that was tested; the name `string` is not defined in this cell.
    print(not_uri)
else:
    print('That is not a URI')

That is not a URI
