In [1]:
import csv
import pandas as pd
from owlready2 import *
import owlrl
from lookup import DBpediaLookup
from stringcmp import isub

from rdflib import Graph
from rdflib import URIRef, BNode, Literal
from rdflib import Namespace
from rdflib.namespace import OWL, RDF, RDFS, FOAF, XSD




In [2]:
# using_builtin_libary("worldcities-free-100.csv")
# Load the world-cities CSV into a DataFrame.
# By default pandas treats strings such as "NA", "NULL" or "nan" as missing
# values, which would silently turn a legitimate code like the ISO-2 value "NA"
# into NaN. Only genuinely empty cells are treated as missing here.
df = pd.read_csv(
    "worldcities-free-100.csv",
    sep=',',
    quotechar='"',
    escapechar="\\",
    keep_default_na=False,
    na_values=[''],
)

In [18]:
# Build an RDF graph with one ex:City and one ex:Country resource per CSV row
# and write it to 'lab3_data_graph.ttl'.
g = Graph()

namespace_str = 'http://www.semanticweb.org/zacharias.detorakis/ontologies/2021/2/lab#'
ex = Namespace(namespace_str)
g.bind("ex", ex)

# (predicate, CSV column) pairs that become literal-valued triples.
# Predicate spellings are kept exactly as before so existing queries still match.
city_literal_columns = (
    (ex.name, 'city'),
    (ex.cityAscii, 'city_ascii'),
    (ex.adminName, 'admin_name'),
    (ex.lattitude, 'lat'),
    (ex.longitude, 'lng'),
    (ex.population, 'population'),
    (ex.capitalStatus, 'capital'),
)
country_literal_columns = (
    (ex.iso2Code, 'iso2'),
    (ex.iso3_code, 'iso3'),
    (ex.name, 'country'),
)

for index, row in df.iterrows():

    # Create the URIs for the cities and countries
    # NOTE(review): a city and a country with the same name map to the same URI.
    city = URIRef(namespace_str + row['city_ascii'].replace(" ", "_"))
    country = URIRef(namespace_str + row['country'].replace(" ", "_"))
    g.add((city, RDF.type, ex.City))
    g.add((country, RDF.type, ex.Country))
    g.add((city, ex.hasCountry, country))

    # Emit a literal only when the cell has a value: an empty CSV cell is read
    # as NaN and would otherwise end up in the graph as a "nan" literal.
    for predicate, column in city_literal_columns:
        if pd.notna(row[column]):
            g.add((city, predicate, Literal(row[column])))

    for predicate, column in country_literal_columns:
        if pd.notna(row[column]):
            g.add((country, predicate, Literal(row[column])))

    # Only 'primary' capitals are linked back from the country.
    if row['capital'] == 'primary':
        g.add((country, ex.hasCapital, city))

g.serialize(destination='lab3_data_graph.ttl', format='ttl')

In [20]:
# Write the current contents of `g` to 'lab3_data_graph_inferred.ttl'.
# NOTE(review): this cell was executed as In [20] but is placed before the
# reasoning cell (In [19]) in the notebook, so a top-to-bottom run reaches this
# line before the OWL 2 RL expansion has been applied to `g`.
g.serialize(destination='lab3_data_graph_inferred.ttl', format='ttl')

In [19]:
# Load the ontology into the data graph, materialise the OWL 2 RL closure and
# report the triple count after each step.
print(f"Triples after loading the CSV: '{len(g)}'.")
g.parse("lab6_rdflib.ttl", format="ttl")
print(f"Triples after ontology loading: '{len(g)}'.")
owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=True, datatype_axioms=False).expand(g)
# Persist the expanded graph here, right after inference, so the "inferred"
# file really contains the inferred triples when the notebook is run top to bottom.
g.serialize(destination='lab3_data_graph_inferred.ttl', format='ttl')
print(f"Triples after OWL 2 RL reasoning: '{len(g)}'.")


Triples after loading the CSV: '1015'.
Triples after ontology loading: '1070'.
Triples after OWL 2 RL reasoning: '5150'.


In [25]:
# Start from an empty graph and load the Turtle file produced in task 3.
# (Alternative input: "lab3_data_graph-full.ttl".)
g = Graph()
g.parse("lab3_data_graph.ttl", format="ttl")

# Cities whose capital status is "primary" and whose population exceeds
# 5,000,000, together with their country name, ordered by country name.
capital_query = """SELECT ?name ?capital ?population ?country_name where {
      ?city rdf:type ex:City;
              ex:name ?name;
              ex:population ?population;
              ex:capitalStatus ?capital;
              ex:hasCountry ?country.
      ?country ex:name ?country_name.
              
      FILTER (STR(?capital)="primary").
      FILTER (?population>5000000).
      
    }
    ORDER BY ?country_name
    """
qres = g.query(capital_query)
print(len(qres))

# CSV-style listing: text columns are single-quoted, the population is not.
print("'country_name','city_name','capital','population'")
for row in qres:
    # Each row exposes the matched RDF terms under the SELECT variable names.
    country_name = str(row.country_name)
    city_name = str(row.name)
    capital = str(row.capital)
    population = str(row.population)
    print(f"'{country_name}','{city_name}','{capital}',{population}")

24
'country_name','city_name','capital','population'
'Angola','Luanda','primary',8417000
'Argentina','Buenos Aires','primary',16157000
'Bangladesh','Dhaka','primary',15443000
'Chile','Santiago','primary',7007000
'China','Beijing','primary',19433000
'Colombia','Bogotá','primary',9464000
'Congo (Kinshasa)','Kinshasa','primary',13528000
'Egypt','Cairo','primary',19372000
'France','Paris','primary',11020000
'Indonesia','Jakarta','primary',34540000
'Iran','Tehran','primary',13633000
'Japan','Tokyo','primary',37977000
'Korea, South','Seoul','primary',21794000
'Malaysia','Kuala Lumpur','primary',8285000
'Mexico','Mexico City','primary',20996000
'Peru','Lima','primary',9848000
'Philippines','Manila','primary',23088000
'Russia','Moscow','primary',17125000
'Saudi Arabia','Riyadh','primary',6881000
'Sudan','Khartoum','primary',7282000
'Tanzania','Dar es Salaam','primary',6698000
'Thailand','Bangkok','primary',17066000
'United Kingdom','London','primary',10979000
'Vietnam','Hanoi','primary',778500

In [6]:
# Number of result rows held in `qres` (the SPARQL query result).
len(qres)

24

In [3]:
def getExternalKGURI(name):
    '''
    Return the identifier of the DBpedia entity whose label is lexically
    closest to ``name``.

    Up to 5 candidates are requested from the DBpedia lookup and scored with
    ``isub(name, label)``; the first candidate with the highest score wins.
    An empty string is returned when no candidate scores above -1 (which
    includes the case of no candidates at all).

    This is an approximate match: no context is used to disambiguate.
    '''
    lookup = DBpediaLookup()

    best_uri = ''
    best_score = -1
    for candidate in lookup.getKGEntities(name, 5):
        score = isub(name, candidate.label)
        # Strict comparison: on ties the earlier candidate is kept.
        if score > best_score:
            best_uri, best_score = candidate.ident, score

    return best_uri

In [8]:
# Build the same city/country graph as for 'lab3_data_graph.ttl', but reuse the
# DBpedia URI of a city or country whenever the lookup finds one, and write the
# result to 'lab3_data_graph_dpo.ttl'.
g = Graph()

# Use the '#' namespace, matching the cell that builds 'lab3_data_graph.ttl',
# so both graphs refer to the same vocabulary terms (ex:City, ex:name, ...).
# NOTE(review): this cell previously ended the namespace with '/'; confirm '#'
# is what the ontology uses.
namespace_str = 'http://www.semanticweb.org/zacharias.detorakis/ontologies/2021/2/lab#'
ex = Namespace(namespace_str)
g.bind("ex", ex)

dpo = Namespace("http://dbpedia.org/resource/")
g.bind("dpo", dpo)

# (predicate, CSV column) pairs that become literal-valued triples.
# 'cityAscii' replaces the former 'citySscii' typo.
city_literal_columns = (
    (ex.name, 'city'),
    (ex.cityAscii, 'city_ascii'),
    (ex.adminName, 'admin_name'),
    (ex.lattitude, 'lat'),
    (ex.longitude, 'lng'),
    (ex.population, 'population'),
    (ex.capitalStatus, 'capital'),
)
country_literal_columns = (
    (ex.iso2Code, 'iso2'),
    (ex.iso3_code, 'iso3'),
    (ex.name, 'country'),
)

# Many rows share a country; resolve each country name against DBpedia once.
country_uri_by_name = {}

for index, row in df.iterrows():

    # City URI: DBpedia identifier when the lookup returns one, local URI otherwise.
    city = URIRef(namespace_str + row['city_ascii'].replace(" ", "_"))
    externalUri = getExternalKGURI(row['city_ascii'])
    if externalUri != "":
        city = URIRef(externalUri)

    # Country URI: same rule, cached per country name.
    country_name = row['country']
    if country_name not in country_uri_by_name:
        country = URIRef(namespace_str + country_name.replace(" ", "_"))
        externalUri = getExternalKGURI(country_name)
        if externalUri != "":
            country = URIRef(externalUri)
        country_uri_by_name[country_name] = country
    country = country_uri_by_name[country_name]

    g.add((city, RDF.type, ex.City))
    g.add((country, RDF.type, ex.Country))
    g.add((city, ex.hasCountry, country))

    # Emit a literal only when the cell has a value: an empty CSV cell is read
    # as NaN and would otherwise end up in the graph as a "nan" literal.
    for predicate, column in city_literal_columns:
        if pd.notna(row[column]):
            g.add((city, predicate, Literal(row[column])))

    for predicate, column in country_literal_columns:
        if pd.notna(row[column]):
            g.add((country, predicate, Literal(row[column])))

    # Only 'primary' capitals are linked back from the country.
    if row['capital'] == 'primary':
        g.add((country, ex.hasCapital, city))

g.serialize(destination='lab3_data_graph_dpo.ttl', format='ttl')