### Project: INM713-coursework
**Author**: Zac Detorakis (zacharias.detorakis@city.ac.uk)

**Version**: 1.0<br>
**Date**: 15/03/2021

In [43]:
# Import Libraries
from rdflib import Graph
from rdflib import URIRef, BNode, Literal
from rdflib import Namespace
from rdflib.namespace import OWL, RDF, RDFS, FOAF, XSD
# from rdflib.util import guess_format
import pandas as pd
import math
from SPARQLWrapper import SPARQLWrapper, JSON
from stringcmp import isub
from lookup import DBpediaLookup
import yake
import re
# import csv

import owlrl


# Instanciate the class

In [44]:
class FinalCoursework(object):
    """
    This will contain the solution for the coursework for the SW&KGT
    """
    
    def __init__(self, input_csv):
        
        # The file containing the data to load in the KG
        self.file = input_csv
        
        #Intialise the KG
        self.g = Graph()
        
        #setup the ontology IRI used for the courseworkontology..
        self.zdetor_ns_str= "https://www.city.ac.uk/ds/inm713/zacharias_detorakis/"
        #Special namspaces class to create directly URIRefs in python.           
        self.zdetor = Namespace(self.zdetor_ns_str)
        #Prefixes for the serialization
        self.g.bind("zdetor", self.zdetor)
        
        self.classStringToURI = dict()
        
        
        #read the raw data into a dataframe
#         self.df = pd.read_csv(filepath_or_buffer = self.file, sep=',', quotechar='"',escapechar="\\")
        self.df = []
    def is_nan(self, x):
        return (x != x)

if __name__ == '__main__':
    input_csv = "INM713_coursework_data_pizza_8358_1_reduced.csv"
#     input_csv = "INM713_coursework_data_pizza_8358_1_reduced - small.csv"
    
    solution = FinalCoursework(input_csv)
    solution.df = pd.read_csv(filepath_or_buffer = solution.file, sep=',', quotechar='"',escapechar="\\")
    
        

# 2. DATA PRE-PROCESSING
## 2.1 Restaurants

First we inspect the restaurants and we see that there are more addresses than restaurants. Based on our ontology a restaurant can only be in one address therefore we need to idenditify those restaurant and create a new URI for them.

In [3]:
print(f"Number of unique restaurant names: {len(solution.df.groupby(['name']))}")
print(f"Number of unique address lines: {len(solution.df.groupby(['name','address']))}")

Number of unique restaurant names: 933
Number of unique address lines: 989


In [4]:
def is_nan(self, x):
        return (x != x)

def findNewRestaurantName(original_value,mapping_dict):
    """
    A function used to map a given value to a new one as defined in the mapping dictionary
    ...

    Attributes
    ----------
    original_value : str
        the original value to be mapped. In this case this will be the concatenated field (name+address)
    mapping_dict : dict
        this is a dictionary where the 'key' is the original value and the 'value' is the new value it maps to
    """
    try:
        return mapping_dict[original_value]
    except:
        return 'invalid'

def createNewRestaurantNames(original_df):
    """
    A function used to create a new name for restaurants that share the same name but based on the addresses seem to be different. The new name will be created as a new 'restaurant_name' column in the df 
    ...

    Attributes
    ----------
    original_df : dataframe
        the original dataframe to be updated with the new_name column
    """
    
    # group the restaurant by name and filter out any restaurant with exactly one address
    df_group = pd.DataFrame(original_df.groupby('name')['address'].nunique())
    df_dup_restaurants = df_group[df_group.address>1]

    # next we create a new version of the df to add the new name column
    new_df = solution.df
    # for now populate the column with a concatenation of name and address
    new_df['restaurant_name']=new_df.apply(lambda x:'%s_%s' % (x['name'],x['address']),axis=1)

    # next we create a termporary dataframe to store the name and concatenated column. drop the duplicates (from the multiple menu items) and sort the df
    temp = new_df[['name', 'restaurant_name']].drop_duplicates()
    temp.sort_values(by='name',inplace=True)

    # finally create a dictionary mappin the concatenated field to a new restaurant name by 
    # - adding a sequence number at the end of the duplicates or 
    # - reusing the existing name for restaurants that appear only once
    prev_name = ''
    incr = 1
    new_name_dict = {}
    for index, row in temp.iterrows():
        if row['name'] == prev_name:
            incr += 1
        else:
            incr = 1

        if row['name'] in str(df_dup_restaurants.index):
            new_name_dict[row['restaurant_name']]= row['name'] + '___' + str(incr)
            prev_name = row['name']
        else:
            new_name_dict[row['restaurant_name']]= row['name']

    # Finally apply the function to map the concatenated field to the new name
    new_df['restaurant_name'] = new_df['restaurant_name'].apply(lambda x: findNewRestaurantName(x,new_name_dict))
    # new_df.to_csv("temp.csv")
    return new_df

In [5]:
solution.df = createNewRestaurantNames(solution.df)
solution.df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description,restaurant_name
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Bianca Pizza,22.5,USD,,Little Pizza Paradise
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Cheese Pizza,18.95,USD,,Little Pizza Paradise
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,,The Brentwood
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,,The Brentwood
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes",The Brentwood


## 2.2 States

First of all we need to perform some exploratory data analysis before we start loading the data to ensure the data consistency and perhaps perform a bit of data cleansing.
We will start with the address part as that is the easiest one to profile.

Given that the addresses are unique per restaurant and then repeated for all menu items of that restaurant we will get a subsection of the df to better assess the extent of problematic records that need fixing. As per our ontology, the address properties are:
* address (i.e. representing the address line)
* city
* country
* state and
* postcode

In [6]:
def convertPostCodeStringToPostCodes(post_code):

    #create a new empty list for the post codes
    post_code_list = []
    
    #separate the post codes by commas first
    separate_val = re.split(r'[,( ]\s*', post_code.replace("- ", "-").replace("– ","–")) 
    for val in separate_val:

        #the within each value we check if we have a range
        if ((val.find('–')>0) or (val.find('-')>0)) :
            pc_range = re.split(r'[-–]\s*', val)
#             print(pc_range)
            #if we do have a range then we create all the post codes in that range and append them to the list
            try:
                for pc in range(int(pc_range[0]),int(pc_range[1])+1):
                #we append the post codes as strings and if need be we add leading zeros to make the string 5 characters long
                    post_code_list.append(str(pc).zfill(5))
            except:
                pass
                    
        else:
            try:
                int(val)
                post_code_list.append(val)
            except:
                pass
    return post_code_list

def createPostCode2StateMap():
    
    
    #first we create a list of all the post codes and the cities from DBpedia
    endpoint_url = "http://dbpedia.org/sparql"

    sparqlw = SPARQLWrapper(endpoint_url)
    sparqlw.setReturnFormat(JSON)

    query = """
    SELECT ?city ?state str(?cityName) ?iso2StateCode ?postCode
    WHERE {
        ?state  dct:subject dbc:States_of_the_United_States;
                dbo:postalCode ?iso2StateCode.
        ?city   a dbo:City;
                dbo:subdivision ?state;
                rdfs:label ?cityName;
                dbo:postalCode ?postCode.

    FILTER (?iso2StateCode != "").
    FILTER langMatches( lang(?cityName), "en" )
    }
    """
    sparqlw.setQuery(query)
    results = sparqlw.query().convert()
    
    #next we create a temporary dictionary with the post code string and the state code. However the post code in this dictionary is sometimes a range or a combination of post codes and ranges
    temp_post_dict = dict()
    for i in results['results']['bindings']:
        temp_post_dict[i['postCode']['value']] = i['iso2StateCode']['value']
    
    # Finally we reprocess the temp_post_dict post codes so we have extract all the post codes from the ranges and still map them to the same state
    postcode_to_state = dict()
    for post_code in temp_post_dict.keys():
        for pc in convertPostCodeStringToPostCodes(post_code):
            postcode_to_state[pc] = temp_post_dict[post_code]
    
    
    # next we create a dictionary with cities and states        
    city_to_state = dict()
    for row in results['results']['bindings']:
        city_to_state[row['callret-2']['value']] = row['iso2StateCode']['value']
    
    return postcode_to_state, city_to_state

def findStateByPostCodeCity(postcode, city, postcode_to_state, city_to_state):
    """
    A function used to map a given post code or city in the US to the respective state. Exact matches are expected instead of lexical similarity
    ...

    Attributes
    ----------
    postcode : str
        the post code to be mapped
    city : str
        the city to be mapped. Unlike the post code the city can be matched if it exists as a substring in the dictionary
    """
    
    try:
        return postcode_to_state[postcode]
    except:
        try:
            lst = [value for key, value in city_to_state.items() if city.lower() in key.lower()]
            return max(set(lst), key=lst.count)
        except:
            pass
            

postcode_to_state, city_to_state = createPostCode2StateMap()

solution.df['state_code'] = solution.df.apply(lambda x: x.state if len(x.state)==2 else findStateByPostCodeCity(x.postcode,x.city, postcode_to_state, city_to_state),axis=1)

In [7]:
solution.df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description,restaurant_name,state_code
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Bianca Pizza,22.5,USD,,Little Pizza Paradise,OR
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Cheese Pizza,18.95,USD,,Little Pizza Paradise,OR
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,,The Brentwood,CA
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,,The Brentwood,CA
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes",The Brentwood,CA


## 2.3 Addresses
We need to add one more column to the dataframe by concatenating the address column with the state column in order to generate unique values to use for the generation of the address class URI

In [8]:
solution.df['address_id'] = solution.df.apply(lambda x: x.state+'_'+x.address,axis=1)
solution.df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description,restaurant_name,state_code,address_id
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Bianca Pizza,22.5,USD,,Little Pizza Paradise,OR,OR_Cascade Village Mall Across From Target
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Cheese Pizza,18.95,USD,,Little Pizza Paradise,OR,OR_Cascade Village Mall Across From Target
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,,The Brentwood,CA,Brentwood_148 S Barrington Ave
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,,The Brentwood,CA,Brentwood_148 S Barrington Ave
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes",The Brentwood,CA,Brentwood_148 S Barrington Ave


## 2.4 Pizza
We need to add one more column to the dataframe by concatenating the menu item and the restaurant name in order to create a unique URI for pizzas that are served at different restaurants (i.e. if a pizza is served at 2 different restaurants has the same name we need to create to instances instead of one)

In [9]:
solution.df['pizza_name'] = solution.df.apply(lambda x: x.restaurant_name+'_'+str(x['menu item']),axis=1)
solution.df.head()

Unnamed: 0,name,address,city,country,postcode,state,categories,menu item,item value,currency,item description,restaurant_name,state_code,address_id,pizza_name
0,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Bianca Pizza,22.5,USD,,Little Pizza Paradise,OR,OR_Cascade Village Mall Across From Target,Little Pizza Paradise_Bianca Pizza
1,Little Pizza Paradise,Cascade Village Mall Across From Target,Bend,US,97701,OR,Pizza Place,Cheese Pizza,18.95,USD,,Little Pizza Paradise,OR,OR_Cascade Village Mall Across From Target,Little Pizza Paradise_Cheese Pizza
2,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Margherita",12.0,USD,,The Brentwood,CA,Brentwood_148 S Barrington Ave,"The Brentwood_Pizza, Margherita"
3,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Mushroom",13.0,USD,,The Brentwood,CA,Brentwood_148 S Barrington Ave,"The Brentwood_Pizza, Mushroom"
4,The Brentwood,148 S Barrington Ave,Los Angeles,US,90049,Brentwood,"American Restaurant,Bar,Bakery","Pizza, Puttenesca",13.0,USD,"Olives, onions, capers, tomatoes",The Brentwood,CA,Brentwood_148 S Barrington Ave,"The Brentwood_Pizza, Puttenesca"


# CREATE TRIPLES

In [17]:
def getExternalKGURI(name):
    '''
    Approximate solution: We get the entity with highest lexical similarity
    The use of context may be necessary in some cases        
    '''
    
    dbpedia = DBpediaLookup()
    entities = dbpedia.getKGEntities(name, 5)
    #print("Entities from DBPedia:")
    current_sim = -1
    current_uri=''
    for ent in entities:           
        isub_score = isub(name, ent.label) 
        if current_sim < isub_score:
            current_uri = ent.ident
            current_sim = isub_score

#     print(current_uri)
    return current_uri

def loadStateISO2UrisFromDBPedia(classStringToURI):
    endpoint_url = "http://dbpedia.org/sparql"

    sparqlw = SPARQLWrapper(endpoint_url)
    sparqlw.setReturnFormat(JSON)
    
    #Create the sparql query to get the URIs for the states from the iso2 state code
    state_iso2_query = """
    SELECT ?state ?iso2StateCode
    WHERE {
        ?state  dct:subject dbc:States_of_the_United_States;
                dbo:postalCode ?iso2StateCode.
    FILTER (?iso2StateCode != "").
    }
    """
    
    sparqlw.setQuery(state_iso2_query)
    state_results = sparqlw.query().convert()

    # create a new key for the states and...
    classStringToURI['state_code'] = dict()

    # the the code to URI mappings
    for state in state_results['results']['bindings']:
        classStringToURI['state_code'][state['iso2StateCode']['value'].lower()] = state['state']['value']

def mappingToCreateTypeTriple(df, subject_column, class_type, use_external_uri, classStringToURI):
    
    # First we create a new key for the class with a value of another dictionary
    if subject_column != 'state_code':
        classStringToURI[subject_column] = dict()

    # Then we iterate through the rows in the subject column and either reuse an existing URI of contruct it from scratch
    for subject in df[subject_column]:
        
        #We use the subject_column value to create the fresh URI if this if the first time we see that value. 
        # If we've seen the value before we do not do anything since we've already added the tripple to the graph
        #################################### CHECK FOR NULL VALUES
        try:
            if subject.lower() not in classStringToURI[subject_column]:
                if use_external_uri:
                    entity_uri =  getExternalKGURI(subject.lower())
                else:
                    entity_uri = solution.zdetor_ns_str + subject.replace(" ", "_").replace("'","_").replace("&","_").replace("|","_")
                classStringToURI[subject_column][subject.lower()] = entity_uri
            else:
                entity_uri = classStringToURI[subject_column][subject.lower()]
                
            #Add the tripple to the KG
            solution.g.add((URIRef(entity_uri), RDF.type, class_type))
        except:
            pass

        
def mappingToCreateLiteralTriple(df, subject_column, object_column, predicate, datatype, classStringToURI):

    for subject, lit_value in zip(df[subject_column], df[object_column]):

        # check if the value is empty and if it is do not create the litteral value
        if solution.is_nan(lit_value) or lit_value==None or lit_value=="":
            pass

        else:
            try:
                #Uri as already created
                entity_uri=classStringToURI[subject_column][subject.lower()]

                #Literal
                lit = Literal(lit_value, datatype=datatype)

                #New triple
                solution.g.add((URIRef(entity_uri), predicate, lit))
            except:
                pass


def mappingToCreateObjectTriple(df, subject_column, object_column, predicate, classStringToURI):

    for subject, object in zip(df[subject_column], df[object_column]):
#         print(subject, object)

        if solution.is_nan(object) or object==None or object=="":
            pass

        else:
            #Uri as already created
            subject_uri=classStringToURI[subject_column][subject.lower()]
            object_uri=classStringToURI[object_column][object.lower()]

            #New triple
            solution.g.add((URIRef(subject_uri), predicate, URIRef(object_uri)))

## Create Classes

In [11]:
# print(solution.g.serialize(format="turtle").decode("utf-8"))
# solution.classStringToURI

In [12]:
# def CovertCSVToRDF(solution.df):
# this is an empty dictionary of the format {'class':{'string': uri}}

print('CREATING CLASSES:')

if 'restaurant_name' in solution.df:
    mappingToCreateTypeTriple(solution.df,'restaurant_name',solution.zdetor.Restaurant, False, solution.classStringToURI)
print('Restaurants complete')

solution.g.bind("dpo", Namespace("http://dbpedia.org/resource/"))
if 'city' in solution.df:
    mappingToCreateTypeTriple(solution.df,'city',solution.zdetor.City, True, solution.classStringToURI)
print('Cities complete')
    
if 'country' in solution.df:
    mappingToCreateTypeTriple(solution.df,'country',solution.zdetor.Country, True, solution.classStringToURI)
print('Countries complete')

loadStateISO2UrisFromDBPedia(solution.classStringToURI)
if 'state_code' in solution.df:
    mappingToCreateTypeTriple(solution.df, 'state_code',solution.zdetor.State, True, solution.classStringToURI)
print('States complete')
    
if 'address_id' in solution.df:
    mappingToCreateTypeTriple(solution.df,'address_id',solution.zdetor.Address, False, solution.classStringToURI)
print('Addresses complete')

if 'pizza_name' in solution.df:
    mappingToCreateTypeTriple(solution.df,'pizza_name',solution.zdetor.Pizza, False, solution.classStringToURI)
    mappingToCreateTypeTriple(solution.df,'pizza_name',solution.zdetor.MenuItem, False, solution.classStringToURI)
print('Pizzas complete')

CREATING CLASSES:
Restaurants complete
Cities complete
Countries complete
States complete
Addresses complete
Pizzas complete


## Create Object Properties

In [13]:
if 'name' in solution.df:
    mappingToCreateObjectTriple(solution.df, 'restaurant_name','address_id',solution.zdetor.hasAddress, solution.classStringToURI)
    if 'pizza_name' in solution.df:
        mappingToCreateObjectTriple(solution.df, 'restaurant_name','pizza_name',solution.zdetor.hasMenuItem, solution.classStringToURI)

if 'address_id' in solution.df:
    mappingToCreateObjectTriple(solution.df, 'address_id','city',solution.zdetor.hasCity, solution.classStringToURI)
    mappingToCreateObjectTriple(solution.df, 'address_id','state_code',solution.zdetor.hasState, solution.classStringToURI)
    mappingToCreateObjectTriple(solution.df, 'address_id','country',solution.zdetor.hasCountry, solution.classStringToURI)

## Create Literals

In [18]:
if 'name' in solution.df:
    mappingToCreateLiteralTriple(solution.df, 'restaurant_name','name',solution.zdetor.restaurantName, XSD.string, solution.classStringToURI)
    
if 'city' in solution.df:
    mappingToCreateLiteralTriple(solution.df, 'city','city',solution.zdetor.name, XSD.string, solution.classStringToURI)
print('Cities complete')
    
if 'country' in solution.df:
    mappingToCreateLiteralTriple(solution.df, 'country','country',solution.zdetor.name, XSD.string, solution.classStringToURI)
print('Countries complete')

if 'state_code' in solution.df:
    mappingToCreateLiteralTriple(solution.df, 'state_code','state_code',solution.zdetor.name, XSD.string, solution.classStringToURI)
print('States complete')
    
if 'address_id' in solution.df:
    mappingToCreateLiteralTriple(solution.df, 'address_id','address',solution.zdetor.addressLine, XSD.string, solution.classStringToURI)
    mappingToCreateLiteralTriple(solution.df, 'address_id','postcode',solution.zdetor.postCode, XSD.string, solution.classStringToURI)
    
if 'pizza_name' in solution.df:
    mappingToCreateLiteralTriple(solution.df, 'pizza_name','item value',solution.zdetor.price, XSD.float, solution.classStringToURI)
    mappingToCreateLiteralTriple(solution.df, 'pizza_name','currency',solution.zdetor.currency, XSD.string, solution.classStringToURI)
    mappingToCreateLiteralTriple(solution.df, 'pizza_name','menu item',solution.zdetor.name, XSD.string, solution.classStringToURI)
    mappingToCreateLiteralTriple(solution.df, 'pizza_name','item description',solution.zdetor.description, XSD.string, solution.classStringToURI)

Cities complete
Countries complete
States complete


# Categories

In [19]:
# concatenate all the values from the categories column to create the string to perform the NLP on

def createListOfFrequentTerms(df, column, max_ngram_size = 2, numOfKeywords = 40):
    concat_string = ''

    for val in df[column]:
        if solution.is_nan(val) or val==None or val=="":
            pass
        else:
            concat_string = concat_string + ', ' + str(val)

    kw_extractor = yake.KeywordExtractor()

    # text = """spaCy is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython. The library is published under the MIT license and its main developers are Matthew Honnibal and Ines Montani, the founders of the software company Explosion."""
    text = concat_string
    language = "en"
    deduplication_threshold = 0.9
    custom_kw_extractor = yake.KeywordExtractor(lan=language, n=max_ngram_size, dedupLim=deduplication_threshold, top=numOfKeywords, features=None)
    keywords = custom_kw_extractor.extract_keywords(text)

    sorted_list = []
    for kw in keywords:
        sorted_list.append(kw[0])
        print(kw)
    return sorted_list

# frequent_restaurant_categories = createListOfFrequentTerms(solution.df, 'item description', numOfKeywords = 10)

# frequent_pizza_classes = createListOfFrequentTerms(solution.df, 'menu item', numOfKeywords = 20)

# frequent_toppings = createListOfFrequentTerms(solution.df, 'item description', max_ngram_size=2, numOfKeywords = 40)

In [20]:
#Method from owlready
from owlready2 import *

def getClasses(onto):        
        return onto.classes()
    
def getOntoClassesByTerm(urionto, prefix, parent_class):

    # load the ontology
    onto = get_ontology(urionto).load()
    
    # get all ontology classes
    entities = list(getClasses(onto))

    
    #create a dictionary with subclasses of the parent_class. Assuming the name of the parent class is there as a suffix in the subclass
    classes = dict()
    for entity in entities:
        #expectein the name of the parent class to appear in the subclass name but NOT immediately after the prefix
        if str(entity).find(parent_class)>len(prefix)+1:
            classes[str(entity).replace(prefix+".","").replace(parent_class,"").lower()] = str(entity).replace('zdetor.',solution.zdetor_ns_str)
    return classes

categories = getOntoClassesByTerm('zdetor.owl','zdetor','Restaurant')
pizzas = getOntoClassesByTerm('zdetor.owl','zdetor','Pizza')
toppings = getOntoClassesByTerm('zdetor.owl','zdetor','Topping')



In [21]:
toppings

{'basil': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/BasilTopping',
 'pizza': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/PizzaTopping',
 'bbq': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/BbqTopping',
 'feta': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/FetaTopping',
 'greenpeppers': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/GreenPeppersTopping',
 'mozzarella': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/MozzarellaTopping',
 'mushrooms': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/MushroomsTopping',
 'onion': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/OnionTopping',
 'parmezan': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/ParmezanTopping',
 'pepperoni': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/PepperoniTopping',
 'redpeppers': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/RedPeppersTopping',
 'ricotta': 'https://www.city.ac.uk/ds/inm713/zacharias_detorakis/RicottaTopping'

In [22]:
def mappingToCreateObjectProperty(df, subject_column, object_column, object_dict, classStringToURI, predicate = RDF.type):

    for subject, object in zip(df[subject_column], df[object_column]):

        if solution.is_nan(object) or object==None or object=="":
            pass

        else:
            separate_val = set(re.split(r'[,( ]\s*', object.lower()))
#             print(subject, object, separate_val)
            for val in separate_val:
                try:
#                     print(val)
                    subject_uri=classStringToURI[subject_column][subject.lower()]
                    solution.g.add((URIRef(subject_uri), predicate, URIRef(object_dict[val])))
                except:
                    pass


In [23]:
mappingToCreateObjectProperty(solution.df,'restaurant_name','categories',categories,solution.classStringToURI)
mappingToCreateObjectProperty(solution.df,'pizza_name','menu item',pizzas,solution.classStringToURI)
mappingToCreateObjectProperty(solution.df,'pizza_name','item description',toppings,solution.classStringToURI,solution.zdetor.hasTopping)

In [24]:
# print(solution.g.serialize(format="turtle").decode("utf-8"))

In [38]:
def saveGraph(graph, file_output):

    graph.serialize(destination=file_output, format='ttl')

def performReasoning(graph, ontology_file):

    print("Triples including ontology: '" + str(len(graph)) + "'.")
    
    #We should load the ontology first
    graph.load(ontology_file,  format='ttl') #e.g., format=ttl


    #We apply reasoning and expand the graph with new triples 
    owlrl.DeductiveClosure(owlrl.OWLRL_Semantics, axiomatic_triples=False, datatype_axioms=False).expand(graph)

    print("Triples after OWL 2 RL reasoning: '" + str(len(graph)) + "'.")

# 3 REASONING

In [30]:
saveGraph(solution.g, 'cw-data.ttl')

## 3.1 Perform Reasoning and save the results

In [39]:
performReasoning(solution.g,'zdetor.ttl')
saveGraph(solution.g, 'cw-data-reasoned.ttl')

Triples including ontology: '53986'.
Triples after OWL 2 RL reasoning: '81942'.


In [40]:
saveGraph(solution.g, 'cw-data-reasoned.ttl')

In [52]:
new_g = Graph()
new_g.parse("cw-data-reasoned.ttl", format="ttl")
new_g.parse("cw-data.ttl", format="ttl")
print("Loaded '" + str(len(new_g)) + "' triples.")

Loaded '81942' triples.


In [71]:
qres = new_g.query(
"""
SELECT ?pizzaName ?topping
WHERE{
    ?pizza a zdetor:Pizza;
            zdetor:name ?pizzaName;
            zdetor:isMenuItemOf ?restaurant;
            zdetor:hasTopping ?topping.
#FILTER (?pizza zdetor:hasTopping zdetor:TomatoTopping').
}

""")

#Single row with one boolean vale
for row in qres:
    print(f"({str(row.pizzaName)}, {str(row.topping)}")

(Taco Pizza (medium), https://www.city.ac.uk/ds/inm713/zacharias_detorakis/TomatoTopping
(Taco Pizza (medium), https://www.city.ac.uk/ds/inm713/zacharias_detorakis/MozzarellaTopping
(Deep Pan Pizza, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/PizzaTopping
(New Chicken Wing Pizza, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/MozzarellaTopping
(Whole Pizza Pie, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/PepperoniTopping
(Bolognia Pizza, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/BasilTopping
(Carne Classica Pizza, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/PepperoniTopping
(Carne Classica Pizza, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/TomatoTopping
(Carne Classica Pizza, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/MozzarellaTopping
(Carne Classica Pizza, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/PizzaTopping
(Pesto Artichoke Pizza, https://www.city.ac.uk/ds/inm713/zacharias_detorakis/MozzarellaTopping
(Pesto 