In [None]:
import urllib.parse
from rdflib import Graph, Literal, Namespace, URIRef, BNode
from rdflib.namespace import RDF, RDFS, FOAF, OWL, XSD, DC, DCTERMS 
import pandas as pd

In [None]:
# Shared state for the whole notebook: a single graph that every cell appends
# to, and a running counter that the per-dataset cells reset before use.
g = Graph()
count = 0

# IRI prefix under which the resources of this dataset are minted.
base_uri = "http://www.sanitasicilia.it/resource/"

# Ontology (vocabulary) namespace and resource namespace.
sso = Namespace("http://www.sanitasicilia.it/ontology/")
ssr = Namespace("http://www.sanitasicilia.it/resource/")

# Register the prefixes on the graph.
g.bind("sso", sso)
g.bind("ssr", ssr)


In [None]:
def urify(ns, testo):
    """Build an IRI string by appending a URL-safe form of `testo` to `ns`.

    Spaces become underscores, apostrophes are dropped, and the result is
    percent-encoded with ``urllib.parse.quote`` before being appended.
    """
    # One-pass character mapping: space -> underscore, apostrophe -> removed.
    cleanup = {ord(" "): "_", ord("'"): None}
    slug = testo.translate(cleanup)
    return ns + urllib.parse.quote(slug)

In [None]:
import re
from SPARQLWrapper import SPARQLWrapper, JSON

def _escapeSparqlString(text):
    """Escape `text` for embedding inside a double-quoted SPARQL string literal."""
    # Backslashes first, otherwise the backslash added for quotes is doubled.
    return str(text).replace("\\", "\\\\").replace('"', '\\"')


def _firstDbpediaBinding(comune, link_keyword=None):
    """Query DBpedia for a Sicilian resource whose Italian label starts with `comune`.

    Args:
        comune: Municipality name used as a case-insensitive label prefix.
        link_keyword: When given, the query additionally requires an
            ``owl:sameAs`` target whose IRI matches this keyword and returns it
            as ``?link``; otherwise only ``?res`` / ``?label`` are selected.

    Returns:
        The first result binding (a dict as produced by SPARQLWrapper's JSON
        conversion), or ``None`` when the query returns no rows.

    Raises:
        Whatever ``SPARQLWrapper`` raises on network or endpoint errors; they
        are not caught here.
    """
    # NOTE(review): the label filter is a prefix match, so a name that is a
    # prefix of another label can resolve to a different resource -- confirm
    # this is acceptable for the dataset.
    pattern = _escapeSparqlString(comune)

    if link_keyword is None:
        projection = "?res ?label"
        link_clause = ""
    else:
        projection = "?res ?link"
        # STR() makes the REGEX argument a string, as SPARQL 1.1 requires.
        link_clause = f"""
            ?res owl:sameAs ?link .
            FILTER REGEX(STR(?link), '{link_keyword}', 'i') ."""

    query = f"""
        SELECT {projection}
        WHERE {{
            ?res dbo:region dbr:Sicily .
            ?res rdfs:label ?label .
            FILTER(LANG(?label)="it") .
            FILTER REGEX(?label, "^{pattern}", 'i') .{link_clause}
        }}
        LIMIT 1
    """

    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    bindings = sparql.query().convert()["results"]["bindings"]
    return bindings[0] if bindings else None


def interlinkToDbpedia(res, comune):
    """Add ``res owl:sameAs <DBpedia resource>`` to the graph when a match exists."""
    binding = _firstDbpediaBinding(comune)
    if binding is not None:
        g.add((res, OWL.sameAs, URIRef(binding["res"]["value"])))


def interlinkToWikidata(res, comune):
    """Add ``res owl:sameAs <link>`` for the DBpedia match's Wikidata sameAs link."""
    binding = _firstDbpediaBinding(comune, link_keyword="wikidata")
    if binding is not None:
        g.add((res, OWL.sameAs, URIRef(binding["link"]["value"])))


def interlinkToGeonames(res, comune):
    """Add ``res owl:sameAs <link>`` for the DBpedia match's GeoNames sameAs link."""
    binding = _firstDbpediaBinding(comune, link_keyword="geonames")
    if binding is not None:
        g.add((res, OWL.sameAs, URIRef(binding["link"]["value"])))
# Municipalities (Comuni)
def addTriples(row):
    """Emit the triples for one municipality row and interlink it.

    Columns are read by position: 0 = name, 1 = total population,
    2 = province, 3 = province acronym.

    Args:
        row: One row of the municipalities DataFrame (a pandas Series).
    """
    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    name = row.iloc[0]

    res = URIRef(urify(base_uri, name))
    g.add((res, RDF.type, sso.Comune))
    g.add((res, sso.hasName, Literal(name, datatype=XSD.string)))
    g.add((res, sso.hasTotalPopulation, Literal(row.iloc[1], datatype=XSD.integer)))
    g.add((res, sso.hasProvince, Literal(row.iloc[2], datatype=XSD.string)))
    g.add((res, sso.hasProvinceAcr, Literal(row.iloc[3], datatype=XSD.string)))

    # Interlinking: rewrite "vowel + apostrophe" as the accented vowel and
    # title-case the name before using it as the label prefix in the lookups.
    comune = name
    for plain, accented in (("a'", "à"), ("o'", "ò"), ("e'", "è"), ("i'", "ì"), ("u'", "ù")):
        comune = comune.replace(plain, accented)
    comune = comune.title()

    interlinkToDbpedia(res, comune)
    interlinkToWikidata(res, comune)
    interlinkToGeonames(res, comune)
    
    


# Load the municipalities table, then emit triples (and interlinks) row by row.
comuni_df = pd.read_csv("../datasets/csv/completed/comuni_sicilia.csv")

comuni_df.apply(addTriples, axis=1)


In [None]:
# Pharmacies (Farmacie)

# Reset the running counter used to mint the resource codes of this cell.
count = 0
# Explicit dtype: the default dtype of an empty Series depends on the pandas
# version (and older versions warn about it).
row_prev = pd.Series(dtype=object)

def addBNodes(res, row):
    """Attach a blank-node site description for one pharmacy row to `res`.

    Columns are read by position: 0 = address, 3 = CAP, 4 = municipality
    name, 7 = tipology, 8 = latitude, 9 = longitude.

    Args:
        res: Subject resource that receives the ``sso:hasSite`` link.
        row: One row of the pharmacies DataFrame (a pandas Series).
    """
    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    bn = BNode()

    g.add((res, sso.hasSite, bn))
    g.add((bn, sso.hasAddress, Literal(row.iloc[0], datatype=XSD.string)))

    # A CAP equal to 0 is skipped.
    cap = row.iloc[3]
    if cap != 0:
        g.add((bn, sso.hasCap, Literal(cap, datatype=XSD.integer)))

    # Link the site to a municipality IRI built from the title-cased name.
    g.add((bn, sso.isIn, URIRef(urify(base_uri, row.iloc[4].title()))))
    g.add((bn, sso.tipology, Literal(row.iloc[7], datatype=XSD.string)))

    # Coordinates are emitted only when both values are non-zero.
    latitude, longitude = row.iloc[8], row.iloc[9]
    if latitude != 0 and longitude != 0:
        g.add((bn, sso.hasLatitude, Literal(latitude, datatype=XSD.decimal)))
        g.add((bn, sso.hasLongitude, Literal(longitude, datatype=XSD.decimal)))

def addTriples(row):
    """Emit the triples for one pharmacy row under a freshly minted code.

    The resource code is ``F`` followed by the zero-padded global ``count``,
    which is incremented after each row. Columns read by position:
    1 = name, 2 = VAT number.

    Args:
        row: One row of the pharmacies DataFrame (a pandas Series).
    """
    global count

    code = f"F{count:04}"
    res = URIRef(urify(base_uri, code))

    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    g.add((res, RDF.type, sso.Farmacia))
    g.add((res, sso.hasName, Literal(row.iloc[1], datatype=XSD.string)))
    g.add((res, sso.hasVatNumber, Literal(row.iloc[2], datatype=XSD.integer)))

    # Reuse the resource built above instead of rebuilding the same IRI.
    addBNodes(res, row)
    count += 1


# Load the pharmacies table and emit triples for every row.
farmacie_df = pd.read_csv("../datasets/csv/completed/farmacie.csv")
farmacie_df.apply(addTriples, axis=1)

# NOTE(review): `g` is the graph shared by all cells, so this file holds
# everything added so far, not only pharmacies. The destination is also
# relative to the working directory, unlike the "../datasets/rdf/" paths used
# by the other serialize calls -- confirm both are intended.
g.serialize(destination="farmacie.ttl", format="turtle")

In [None]:
# Parapharmacies (Parafarmacie)
# Reset the running counter and the "previous row" marker used by this cell.
count = 0
# Explicit dtype: the default dtype of an empty Series depends on the pandas
# version (and older versions warn about it).
row_prev = pd.Series(dtype=object)

def addBNodes(res, row):
    """Attach a blank-node site description for one parapharmacy row to `res`.

    Columns are read by position: 1 = address, 3 = CAP, 4 = municipality
    name, 7 = latitude, 8 = longitude.

    Args:
        res: Subject resource that receives the ``sso:hasSite`` link.
        row: One row of the parapharmacies DataFrame (a pandas Series).
    """
    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    bn = BNode()

    g.add((res, sso.hasSite, bn))
    g.add((bn, sso.hasAddress, Literal(row.iloc[1], datatype=XSD.string)))

    # A CAP equal to 0 is skipped.
    cap = row.iloc[3]
    if cap != 0:
        g.add((bn, sso.hasCap, Literal(cap, datatype=XSD.integer)))

    # Link the site to a municipality IRI built from the title-cased name.
    g.add((bn, sso.isIn, URIRef(urify(base_uri, row.iloc[4].title()))))

    # Coordinates are emitted only when both values are non-zero.
    latitude, longitude = row.iloc[7], row.iloc[8]
    if latitude != 0 and longitude != 0:
        g.add((bn, sso.hasLatitude, Literal(latitude, datatype=XSD.decimal)))
        g.add((bn, sso.hasLongitude, Literal(longitude, datatype=XSD.decimal)))

def addTriples(row):
    """Emit the triples for one parapharmacy row.

    When the row has the same name (column 0) as the last row that minted a
    resource, only an extra site is attached to that resource. Otherwise a new
    ``PF`` + zero-padded ``count`` resource is created, the row is remembered
    in ``row_prev`` and ``count`` is incremented. Column 2 is emitted as the
    VAT number.

    Args:
        row: One row of the parapharmacies DataFrame (a pandas Series).
    """
    global count, row_prev

    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    if not row_prev.empty and row_prev.iloc[0] == row.iloc[0]:
        # Same name as the previous resource: `count - 1` is the code that
        # resource was given.
        code = f"PF{count - 1:04}"
        addBNodes(URIRef(urify(base_uri, code)), row)
        return

    code = f"PF{count:04}"
    row_prev = pd.Series(row)

    res = URIRef(urify(base_uri, code))
    g.add((res, RDF.type, sso.Parafarmacia))
    g.add((res, sso.hasName, Literal(row.iloc[0], datatype=XSD.string)))
    g.add((res, sso.hasVatNumber, Literal(row.iloc[2], datatype=XSD.integer)))

    addBNodes(res, row)
    count += 1

# Load the parapharmacies table.
parafarmacie_df = pd.read_csv("../datasets/csv/completed/parafarmacie.csv")

# Coordinates use a decimal comma and '-' as a placeholder: convert them to
# floats, mapping the placeholder to 0.0.
parafarmacie_df["LATITUDINE"] = parafarmacie_df["LATITUDINE"].apply(
    lambda raw: 0.0 if raw == '-' else float(raw.replace(',', '.'))
)
parafarmacie_df["LONGITUDINE"] = parafarmacie_df["LONGITUDINE"].apply(
    lambda raw: 0.0 if raw == '-' else float(raw.replace(',', '.'))
)
# The '-' placeholder in the VAT column becomes 0; other values are kept.
parafarmacie_df["PARTITAIVA"] = parafarmacie_df["PARTITAIVA"].apply(
    lambda raw: 0 if raw == '-' else raw
)

# Emit triples for every row, then write the graph accumulated so far.
parafarmacie_df.apply(addTriples, axis=1)

g.serialize(destination="../datasets/rdf/parafarmacie.ttl", format="turtle")

In [None]:
# Private healthcare facilities (Strutture sanitarie private)
# Reset the running counter and the "previous row" marker used by this cell.
count = 0
# Explicit dtype: the default dtype of an empty Series depends on the pandas
# version (and older versions warn about it).
row_prev = pd.Series(dtype=object)

def addBNodes(res, row):
    """Attach a blank-node site description for one private-facility row to `res`.

    Columns are read by position: 0 = CAP, 1 = latitude, 2 = longitude,
    5 = municipality name, 6 = address, 8 = business type, 9 = site type.

    Args:
        res: Subject resource that receives the ``sso:hasSite`` link.
        row: One row of the private-facilities DataFrame (a pandas Series).
    """
    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    bn = BNode()

    g.add((res, sso.hasSite, bn))
    g.add((bn, sso.hasAddress, Literal(row.iloc[6], datatype=XSD.string)))

    # A CAP equal to 0 is skipped.
    cap = row.iloc[0]
    if cap != 0:
        g.add((bn, sso.hasCap, Literal(cap, datatype=XSD.integer)))

    # Link the site to a municipality IRI built from the title-cased name.
    g.add((bn, sso.isIn, URIRef(urify(base_uri, row.iloc[5].title()))))

    # Coordinates are emitted only when both values are non-zero.
    latitude, longitude = row.iloc[1], row.iloc[2]
    if latitude != 0 and longitude != 0:
        g.add((bn, sso.hasLatitude, Literal(latitude, datatype=XSD.decimal)))
        g.add((bn, sso.hasLongitude, Literal(longitude, datatype=XSD.decimal)))

    g.add((bn, sso.businessType, Literal(row.iloc[8], datatype=XSD.string)))
    g.add((bn, sso.hasSiteType, Literal(row.iloc[9], datatype=XSD.string)))

def addTriples(row):
    """Emit the triples for one private-facility row.

    When the row has the same name (column 7) as the last row that minted a
    resource, only an extra site is attached to that resource. Otherwise a new
    ``SPVT`` + zero-padded ``count`` resource is created, the row is
    remembered in ``row_prev`` and ``count`` is incremented. Column 3 is
    emitted as the reference ASP.

    Args:
        row: One row of the private-facilities DataFrame (a pandas Series).
    """
    global count, row_prev

    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    if not row_prev.empty and row_prev.iloc[7] == row.iloc[7]:
        # Same name as the previous resource: `count - 1` is the code that
        # resource was given.
        code = f"SPVT{count - 1:04}"
        addBNodes(URIRef(urify(base_uri, code)), row)
        return

    code = f"SPVT{count:04}"
    row_prev = pd.Series(row)

    res = URIRef(urify(base_uri, code))
    g.add((res, RDF.type, sso.StrutturaPrivata))
    g.add((res, sso.hasName, Literal(row.iloc[7], datatype=XSD.string)))
    g.add((res, sso.hasReferenceAsp, Literal(row.iloc[3], datatype=XSD.string)))

    addBNodes(res, row)
    count += 1

# Load the private-facilities table.
private_df = pd.read_csv("../datasets/csv/completed/private.csv")

# Force the city column to strings before the per-row processing.
citta_as_text = private_df["Citta\'"].astype(str)
private_df["Citta\'"] = citta_as_text

private_df.apply(addTriples, axis=1)


In [None]:
#Strutture sanitarie pubbliche
import math

# Reset the running counter and the "previous row" marker used by this cell.
count = 0
# Explicit dtype: the default dtype of an empty Series depends on the pandas
# version (and older versions warn about it).
row_prev = pd.Series(dtype=object)

def addBNodes(res, row):
    """Attach a blank-node site description for one public-facility row to `res`.

    Columns are read by position: 0 = municipality name, 1 = latitude,
    2 = longitude, 3 = CAP, 4 = address, 7 = business type, 8 = site type,
    9 = recovery-structure name, 10 = working months, 12 = opening days,
    13 = weekly hours, 15 = e-mail, 16 = website, 17/18 = phone prefix/number,
    19/20 = fax prefix/number.

    Args:
        res: Subject resource that receives the ``sso:hasSite`` link.
        row: One row of the public-facilities DataFrame (a pandas Series).
    """
    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    bn = BNode()

    g.add((res, sso.hasSite, bn))

    # Link to the municipality only when a name is present.
    comune = row.iloc[0]
    if comune != '':
        g.add((bn, sso.isIn, URIRef(urify(base_uri, comune.title()))))

    # Coordinates are emitted only when both values are non-zero.
    latitude, longitude = row.iloc[1], row.iloc[2]
    if latitude != 0 and longitude != 0:
        g.add((bn, sso.hasLatitude, Literal(latitude, datatype=XSD.decimal)))
        g.add((bn, sso.hasLongitude, Literal(longitude, datatype=XSD.decimal)))

    g.add((bn, sso.hasCap, Literal(row.iloc[3], datatype=XSD.integer)))
    g.add((bn, sso.hasAddress, Literal(row.iloc[4], datatype=XSD.string)))
    g.add((bn, sso.businessType, Literal(row.iloc[7], datatype=XSD.string)))
    g.add((bn, sso.hasSiteType, Literal(row.iloc[8], datatype=XSD.string)))

    # NOTE(review): the name previously came from column 10, which is also
    # emitted below as the integer workingMonths, while the presence test is
    # on column 9. Column 9 is now used for the name -- confirm against the
    # CSV header.
    recovery_name = row.iloc[9]
    if recovery_name != '':
        g.add((bn, sso.isRecoveryStructure, Literal(True, datatype=XSD.boolean)))
        g.add((bn, sso.hasRecoveryStructureName, Literal(recovery_name, datatype=XSD.string)))
    else:
        g.add((bn, sso.isRecoveryStructure, Literal(False, datatype=XSD.boolean)))

    g.add((bn, sso.workingMonths, Literal(row.iloc[10], datatype=XSD.integer)))
    g.add((bn, sso.openingDays, Literal(row.iloc[12], datatype=XSD.integer)))
    g.add((bn, sso.weeklyHours, Literal(row.iloc[13], datatype=XSD.integer)))
    g.add((bn, sso.hasEmail, Literal(row.iloc[15], datatype=XSD.string)))

    website = row.iloc[16]
    if website != '':
        g.add((bn, sso.hasWebsite, Literal(website, datatype=XSD.string)))

    # Phone/fax values are skipped when NaN. Present values are cast to int so
    # a float such as 95.0 is not written with the lexical form "95.0" under
    # xsd:integer.
    contact_columns = (
        (17, sso.hasPhonePrefix),
        (18, sso.hasPhoneNumber),
        (19, sso.hasFaxPrefix),
        (20, sso.hasFaxNumber),
    )
    for position, predicate in contact_columns:
        value = row.iloc[position]
        if not math.isnan(value):
            g.add((bn, predicate, Literal(int(value), datatype=XSD.integer)))

def addTriples(row):
    """Emit the triples for one public-facility row.

    When the row has the same name (column 5) and the same VAT number
    (column 14) as the last row that minted a resource, only an extra site is
    attached to that resource. Otherwise a new ``SPUB`` + zero-padded
    ``count`` resource is created, the row is remembered in ``row_prev`` and
    ``count`` is incremented.

    Args:
        row: One row of the public-facilities DataFrame (a pandas Series).
    """
    global count, row_prev

    # .iloc is used because integer keys on a label-indexed Series are a
    # deprecated positional fallback in pandas.
    same_as_previous = (
        not row_prev.empty
        and row_prev.iloc[5] == row.iloc[5]
        and row_prev.iloc[14] == row.iloc[14]
    )
    if same_as_previous:
        # `count - 1` is the code the previous resource was given.
        code = f"SPUB{count - 1:04}"
        addBNodes(URIRef(urify(base_uri, code)), row)
        return

    row_prev = pd.Series(row)
    code = f"SPUB{count:04}"
    res = URIRef(urify(base_uri, code))
    g.add((res, RDF.type, sso.StrutturaPubblica))
    g.add((res, sso.hasName, Literal(row.iloc[5], datatype=XSD.string)))
    g.add((res, sso.hasVatNumber, Literal(row.iloc[14], datatype=XSD.integer)))

    addBNodes(res, row)
    count += 1



# Load the public-facilities table.
pubbliche_df = pd.read_csv("../datasets/csv/completed/pubbliche.csv")

# Missing cells must become '' (not the string 'nan'): casting NaN with
# astype(str) yields 'nan', which would pass the `!= ''` presence checks done
# while building the site nodes. Fill first, then cast.
pubbliche_df["Comune"] = pubbliche_df["Comune"].fillna("").astype(str)
pubbliche_df["Denominazione struttura di ricovero"] = (
    pubbliche_df["Denominazione struttura di ricovero"].fillna("").astype(str)
)
pubbliche_df["Sito web"] = pubbliche_df["Sito web"].fillna("").astype(str)

pubbliche_df.apply(addTriples, axis=1)



In [None]:
# Write the graph accumulated by all the cells above as Turtle.
g.serialize(format='turtle', destination='../datasets/rdf/sanitasicilia.ttl')