### **Installing and Importing Modules as Needed**


In [0]:
#Installs necessary modules
!pip install rdflib
!pip install bs4
!pip install SPARQLWrapper



In [0]:
#Import RDFLib module and associated classes
import rdflib as rb
from rdflib import Literal, BNode, URIRef, Graph, Namespace
from rdflib.namespace import RDF, OWL, RDFS, FOAF, XSD, DC

#Import SPARQL
from SPARQLWrapper import SPARQLWrapper, JSON, CSV

#Import spaCy module
import spacy as sp

#Import pre-installed modules for various analysis
from urllib import request
import re
import os
import sys
import pandas as pd
import numpy as np

#Import BeautifulSoup for webscraping
from bs4 import BeautifulSoup

### **Parsing the Original Protoge Knowledge Graph**

Parsing the original turtle file exported from Protoge.  This file will be instantiated with all data properties that are extracted from each database.

In [0]:
g3 = Graph()
g3.parse('MIE1501-T5-P2-neighbourhood_crime_ontology.ttl',format = 'turtle')

<Graph identifier=N5ae2aa86765c43bba061ac86082a180a (<class 'rdflib.graph.Graph'>)>

### **Extracting Data URL'S**
URL's extracted directly from the provided websites.  They have been promptly named police_url and neighbourhood_url. 

In [0]:
police_url = 'https://opendata.arcgis.com/datasets/98f7dde610b54b9081dfca80be453ac9_0.csv?outSR=%7B%22latestWkid%22%3A3857%2C%22wkid%22%3A102100%7D'
neighbourhood_url = 'https://ckan0.cf.opendata.inter.prod-toronto.ca/download_resource/ef0239b1-832b-4d0b-a1f3-4153e53b189e?format=csv'

### **Downloading Data**

The function below directly extracts the data from each database and downloads it.  When the URL is entered as an argument and the function is called, each database (for neighbourhood profiling and crime analysis) will automatically be converted into a panda's dataframe.  

In [0]:
def download_data(url,txt_name): 
    file_open = request.urlopen(url)
    file_info = file_open.read()
    file_info_str = str(file_info)
    file_lines = file_info_str.split('\\n')
    new_file = open(txt_name,'w')

    count = 0
    for info in file_lines:
        count += 1
        new_file.write(info + '\n')
    new_file.close()

    return pd.read_csv(txt_name, nrows = count-2)

### **Defining Ontology Namespaces**

In [0]:
#Namespace definitions

sch = rb.Namespace('http://schema.org/')
xsd = rb.Namespace('http://www.w3.org/2001/XMLSchema#')
rdfs = rb.Namespace('http://www.w3.org/2000/01/rdf-schema#')
owl = rb.Namespace('http://www.w3.org/2002/07/owl#')
nbc = rb.Namespace('http://www.semanticweb.org/yiningzhou/ontologies/2019/11/NCA#')


# **1. Neighbourhood Dataset**


In the neighbourhood dataset, we eliminated all rows that were not relevant to our ontology.  The rows that we decided to keep pertain to Education, Income, Occupation and Age population.  This helps to filter the data that we need and to reduce the file size of the turtle that will be exported at the end of this program.   

In [0]:
neighbourhood_data = download_data(neighbourhood_url,'neighbourhood_data.txt')

neighbourhood_data = neighbourhood_data.replace(',','', regex = True)

neighbourhood_data.iloc[4,5:] = (neighbourhood_data.iloc[4,5:].replace('%','', regex = True)).astype(float)/100
neighbourhood_data.iloc[117,5:] = (neighbourhood_data.iloc[117,5:].replace('%','', regex = True)).astype(float)/100
neighbourhood_data.iloc[120,5:] = (neighbourhood_data.iloc[120,5:].replace('%','', regex = True)).astype(float)/100
neighbourhood_data.iloc[123,5:] = (neighbourhood_data.iloc[126,5:].replace('%','', regex = True)).astype(float)/100
neighbourhood_data.iloc[126,5:] = (neighbourhood_data.iloc[126,5:].replace('%','', regex = True)).astype(float)/100
neighbourhood_data.iloc[134,5:] = (neighbourhood_data.iloc[134,5:].replace('%','', regex = True)).astype(float)/100

neighbourhood_data.iloc[2:,5:] = neighbourhood_data.iloc[2:,5:].astype(float)

for i in range(neighbourhood_data.shape[0]-1,0,-1):
  if (i not in range(988,999)) and (i not in range(9,15)) and (i not in range(1922,1932)) and (i not in [1715,1716,1720,1725,1728]): 
    neighbourhood_data = neighbourhood_data.drop(neighbourhood_data.index[i])

neighbourhood_data = neighbourhood_data.dropna()
neighbourhood_data.reset_index(drop=True, inplace=True)
neighbourhood_data.head()

Unnamed: 0,b'_id,Category,Topic,Data Source,Characteristic,City of Toronto,Agincourt North,Agincourt South-Malvern West,Alderwood,Annex,Banbury-Don Mills,Bathurst Manor,Bay Street Corridor,Bayview Village,Bayview Woods-Steeles,Bedford Park-Nortown,Beechborough-Greenbrook,Bendale,Birchcliffe-Cliffside,Black Creek,Blake-Jones,Briar Hill-Belgravia,Bridle Path-Sunnybrook-York Mills,Broadview North,Brookhaven-Amesbury,Cabbagetown-South St. James Town,Caledonia-Fairbank,Casa Loma,Centennial Scarborough,Church-Yonge Corridor,Clairlea-Birchmount,Clanton Park,Cliffcrest,Corso Italia-Davenport,Danforth,Danforth East York,Don Valley Village,Dorset Park,Dovercourt-Wallace Emerson-Junction,Downsview-Roding-CFB,...,Regent Park,Rexdale-Kipling,Rockcliffe-Smythe,Roncesvalles,Rosedale-Moore Park,Rouge,Runnymede-Bloor West Village,Rustic,Scarborough Village,South Parkdale,South Riverdale,St.Andrew-Windfields,Steeles,Stonegate-Queensway,Tam O\'Shanter-Sullivan,Taylor-Massey,The Beaches,Thistletown-Beaumond Heights,Thorncliffe Park,Trinity-Bellwoods,University,Victoria Village,Waterfront Communities-The Island,West Hill,West Humber-Clairville,Westminster-Branson,Weston,Weston-Pelham Park,Wexford/Maryvale,Willowdale East,Willowdale West,Willowridge-Martingrove-Richview,Woburn,Woodbine Corridor,Woodbine-Lumsden,Wychwood,Yonge-Eglinton,Yonge-St.Clair,York University Heights,Yorkdale-Glen Park
0,10,Population,Age characteristics,Census Profile 98-316-X2016001,Children (0-14 years),398135.0,3840,3075,1760,2360,3605,2325,1695,2415,1515,4555,1120,4550,3345,4600,1405,1855,1435,1470,3360,800,1490,1200,2150,1260,4565,2765,2575,1910,1675,3020,4065,4250,4480,5725,...,1635,1640,3505,2300,2625,7960,2050,1940,3365,2115,3975,2645,3150,4060,3885,2950,3825,1730,5600,1745,565,2630,3650,4635,5060,3440,3100,1675,4215,5920,1785,3555,9625,2325,1165,1860,1800,1210,4045,1960
1,11,Population,Age characteristics,Census Profile 98-316-X2016001,Youth (15-24 years),340270.0,3705,3360,1235,3750,2730,1940,6860,2505,1635,3210,855,4605,2440,3290,885,1655,1425,1040,2460,1035,1220,1080,1850,5060,3495,1910,2145,1545,920,1450,3280,3155,3925,4340,...,1775,1355,2740,1305,2185,6700,960,1405,2360,2275,2445,2540,2685,2495,3225,1690,2075,1410,2455,1630,1485,2030,7840,3950,5445,3355,2225,1415,3650,6940,2230,2625,7660,1035,675,1320,1225,920,4750,1870
2,12,Population,Age characteristics,Census Profile 98-316-X2016001,Working Age (25-54 years),1229560.0,11305,9965,5220,15040,10810,6655,13065,10310,4490,8410,2750,12050,9075,8525,3605,6740,3090,5680,7625,5855,4570,4555,5030,18780,11940,7470,6070,6930,4370,7930,11860,10485,19790,15045,...,5575,4300,9255,7990,7925,18510,4605,3615,6685,12440,15025,6800,9500,10660,10675,7580,9590,4160,9020,9210,3590,7170,45105,10765,13845,10865,7785,5275,11380,25850,7480,8140,21945,6165,3790,6420,5860,5960,12290,5860
3,13,Population,Age characteristics,Census Profile 98-316-X2016001,Pre-retirement (55-64 years),336670.0,4230,3265,1825,3480,3555,2030,1760,2540,1825,3075,885,3535,3520,2425,940,1925,1525,1435,2185,1715,1350,1465,1955,3235,3360,1935,2445,1740,1195,2135,3145,3330,3935,4380,...,1100,1520,3090,1690,3030,6690,1215,1115,2095,2475,3120,2625,3515,3660,3510,1750,3140,1195,1665,1625,650,2270,4680,3785,3990,3500,2430,1415,3940,5460,2070,2905,6245,1625,1150,1595,1325,1540,2965,1810
4,14,Population,Age characteristics,Census Profile 98-316-X2016001,Seniors (65+ years),426945.0,6045,4105,2015,5910,6975,2940,2420,3615,3685,3980,965,5250,3910,2870,895,2100,1785,1865,2130,2250,1325,2675,2385,3010,3630,2375,2695,2015,1515,2655,4700,3780,4515,5535,...,730,1730,3655,1680,5140,6625,1245,1865,2225,2540,3330,3235,5755,4165,6160,1705,2955,1880,2365,2350,1320,3405,4635,4240,4980,5130,2445,1325,4715,6270,3370,4905,8010,1380,1095,3150,1600,2905,3530,3295


In [0]:
community = list(neighbourhood_data.columns[5:])
g1 = Graph()
# neighborhood
#g1.add((nbc.Community, RDF.type, owl.Class))
#g1.add((nbc.NeighbourhoodInfo, nbc.hasCommunity, nbc.Community))
new_community = []

n = 0
for item in community:
  item = re.sub('[^a-zA-Z0-9 \n\.]', '', item)
  item = item.replace(' ', '')
  new_community.append(item)

  for i in range(0,neighbourhood_data.shape[0]-1):
  # community
    community_name = rb.Namespace(nbc + str(item) + str(i))
    pos = new_community.index(item) + 5
    hood_id = neighbourhood_data.iloc[0,pos]
    g1.add((URIRef(community_name), RDF.type, nbc.Community))
    g1.add((URIRef(community_name), nbc.hasName, Literal(item)))
    g1.add((URIRef(community_name), nbc.hasHoodID, Literal(hood_id)))
    g1.add((URIRef(community_name), nbc.hasCategory, Literal(str(neighbourhood_data.iloc[i,1]))))
    g1.add((URIRef(community_name), nbc.hasTopic, Literal(str(neighbourhood_data.iloc[i,2]))))
    g1.add((URIRef(community_name), nbc.hasCharacteristic, Literal(str(neighbourhood_data.iloc[i,4]))))
    g1.add((URIRef(community_name), nbc.hasValue, Literal(str(neighbourhood_data.iloc[i, pos]))))


g1.serialize(destination='neighbor.ttl', format='turtle')

# **2. Police Dataset**

In the data below, we call the download_data function to directly extract the file from the specified URL.  Once this is done, the data goes under a cleaning process where 2014-2016 information is dropped from the file.  We are also dropping any rows that contain blanks.  The reason for this is to reduce the computational time of extracting the data and adding each instance to the knowledge graph.  

In [0]:
police_data = download_data(police_url,'police_data.txt')

#Eliminate reported years 2014-2016
police_data = police_data[police_data.reportedyear != 2014]
police_data = police_data[police_data.reportedyear != 2015]
police_data = police_data[police_data.reportedyear != 2016]

#Eliminate occurred years 2014-2016
police_data = police_data[police_data.occurrenceyear != 2014]
police_data = police_data[police_data.occurrenceyear != 2015]
police_data = police_data[police_data.occurrenceyear != 2016]

police_data = police_data.dropna()

police_data['MCI'] = police_data['MCI'].replace(["Theft Over","Assault","Robbery","Break and Enter","Auto Theft"], ["theftOver","assault","robbery","breakAndEnter","autoTheft"])
police_data.head()

Unnamed: 0,"b""\xef\xbb\xbfX",Y,Index_,event_unique_id,occurrencedate,reporteddate,premisetype,ucr_code,ucr_ext,offence,reportedyear,reportedmonth,reportedday,reporteddayofyear,reporteddayofweek,reportedhour,occurrenceyear,occurrencemonth,occurrenceday,occurrencedayofyear,occurrencedayofweek,occurrencehour,MCI,Division,Hood_ID,Neighbourhood,Lat,Long,ObjectId
7004,-79.492294,43.614082,13483,GO-2017482647,2017-03-18T06:17:00.000Z,2017-03-18T06:17:00.000Z,Apartment,1430,100,Assault,2017,March,18,77,Saturday,6,2017.0,March,18.0,77.0,Saturday,6,assault,D22,17,Mimico (includes Humber Bay Shores) (17),43.614082,-79.492294,12005
7008,-79.492294,43.614082,13484,GO-2017482647,2017-03-18T06:17:00.000Z,2017-03-18T06:17:00.000Z,Apartment,1420,100,Assault With Weapon,2017,March,18,77,Saturday,6,2017.0,March,18.0,77.0,Saturday,6,assault,D22,17,Mimico (includes Humber Bay Shores) (17),43.614082,-79.492294,12009
7012,-79.348854,43.773651,13485,GO-201784317,2017-01-14T18:40:00.000Z,2017-01-14T23:18:00.000Z,Outside,1430,100,Assault,2017,January,14,14,Saturday,23,2017.0,January,14.0,14.0,Saturday,18,assault,D33,53,Henry Farm (53),43.773651,-79.348854,12013
7013,-79.500862,43.763363,13486,GO-201784997,2017-01-14T21:40:00.000Z,2017-01-14T21:40:00.000Z,Commercial,1430,100,Assault,2017,January,14,14,Saturday,21,2017.0,January,14.0,14.0,Saturday,21,assault,D31,27,York University Heights (27),43.763363,-79.500862,12014
7015,-79.212387,43.744606,13487,GO-201745412,2017-01-08T16:00:00.000Z,2017-01-08T17:04:00.000Z,Apartment,1430,100,Assault,2017,January,8,8,Sunday,17,2017.0,January,8.0,8.0,Sunday,16,assault,D43,139,Scarborough Village (139),43.744606,-79.212387,12016


### **Adding in Data from Crime Database** 

Here, we created a unique case for each crime committed in the database.  This way, we can uniqely identify each crime with their respective crime types, premise types, occurance and reported times.  This will allow us to design the queries in an efficient manner.

- The crime data has been added to a separate graph function from the neighbourhood data.  Both graphs will be merged into one knowledge graph and exported into a single turtle file. 
- Please note that due to the file size of the CSV database, we eliminated all data from 2014-2016.  The program kept crashing and running into memory errors if otherwise kept.  Time frames and intervals will therefore be measured year over year.  
- All crime types have been renamed to fit the ontological criteria. 

In [0]:
g2 = Graph()
#Crime Types
g2.add((nbc.CrimeType,nbc.hasInstanceOf,Literal("theftOver")))
g2.add((nbc.CrimeType,nbc.hasInstanceOf,Literal("assault")))
g2.add((nbc.CrimeType,nbc.hasInstanceOf,Literal("robbery")))
g2.add((nbc.CrimeType,nbc.hasInstanceOf,Literal("breakAndEnter")))
g2.add((nbc.CrimeType,nbc.hasInstanceOf,Literal("autoTheft")))

#Premise Types
g2.add((nbc.PremiseType,nbc.hasPremise,Literal("Apartment")))
g2.add((nbc.PremiseType,nbc.hasPremise,Literal("Outside")))
g2.add((nbc.PremiseType,nbc.hasPremise,Literal("House")))
g2.add((nbc.PremiseType,nbc.hasPremise,Literal("Commercial")))
g2.add((nbc.PremiseType,nbc.hasPremise,Literal("Other")))

n = 0
for i in range(1,police_data.shape[0]):
  try:
    
    n += 1
    g2.add((nbc["c" + str(n)],RDF.type,nbc.Crime)) #Each case is of type Crime
    g2.add((nbc["c" + str(n)],nbc.offenceOf,Literal(police_data.loc[i, 'MCI']))) #Offence commited of each case
    g2.add((nbc["c" + str(n)],nbc.committedInHood,Literal(police_data.loc[i, 'Hood_ID']))) #Offence commited of each case
    g2.add((nbc["c" + str(n)],nbc.committedInNeighbourhood,Literal(police_data.loc[i, 'Neighbourhood']))) #Offence commited of each case
    g2.add((nbc["c" + str(n)],nbc.committedOnPremiseType,Literal(police_data.loc[i, 'premisetype']))) #Offence commited of each case
    g2.add((nbc["c" + str(n)],nbc.hasLatitude,Literal(police_data.loc[i, 'Lat']))) #Latitude
    g2.add((nbc["c" + str(n)],nbc.hasLongtitude,Literal(police_data.loc[i, 'Long']))) #Longtitude

    #OccurredTime
    g2.add((nbc["c" + str(n)],nbc.hasOccurYear,Literal(police_data.loc[i, 'occurrenceyear']))) #Year
    g2.add((nbc["c" + str(n)],nbc.hasOccurMonth,Literal(police_data.loc[i, 'occurrencemonth']))) #Month
    g2.add((nbc["c" + str(n)],nbc.hasOccurDayOfWeek,Literal(police_data.loc[i, 'occurrencedayofweek']))) #Day of Week
    g2.add((nbc["c" + str(n)],nbc.hasOccurHour,Literal(police_data.loc[i, 'occurrencehour']))) #Hour 
  
  except:
    pass

g2.serialize("crime.ttl",format = 'turtle')

### **Exporting Combined KnowledgeGraphs into Turtle File**
Here, we combined the 3 knowledge graphs into 1 so that we may export 1 consolidated Turtle file.  Running this will add the Turtle to the local directory, where it may be downloaded.

In [0]:
g = (g3+g1+g2)
g.serialize(destination = "neighbourhood_crime_ontology.ttl", format = 'turtle')