# Import Information

In [37]:
#Installs necessary modules
!pip install rdflib
!pip install bs4
!pip install SPARQLWrapper
!pip install pywikibot



In [2]:
#Import RDFLib module and associated classes
import rdflib as rb
from rdflib import Literal, BNode, URIRef, Graph, Namespace
from rdflib.namespace import RDF, OWL, RDFS, FOAF, XSD, DC

#Import SPARQL
from SPARQLWrapper import SPARQLWrapper, JSON, CSV

#Import spaCy module
import spacy as sp

#Import pre-installed modules for various analysis
import urllib.request
import re
import os
import sys
import pandas as pd

#Import BeautifulSoup for webscraping
from bs4 import BeautifulSoup

In [3]:
#Namespace definitions
pr = rb.Namespace("http://ontology.eil.utoronto.ca/MIE1501/publicity.owl#")
sch = rb.Namespace("http://schema.org/")
xsd = rb.Namespace("http://www.w3.org/2001/XMLSchema#")
rdfs = rb.Namespace("http://www.w3.org/2000/01/rdf-schema#")
owl = rb.Namespace("http://www.w3.org/2002/07/owl#")
wdt = rb.Namespace("https://www.wikidata.org/wiki/Property_talk:")

# Search For Information
These entity functions look for different objects associated to a class.  If the object is found in the article, it is to be instatiated and appended to a list.

In [4]:
#Entity functions
#Each function looks for the desired information to be extracted.

def facultySearch(fac_token):
    if fac_token.label_ == "PERSON" and (" " in fac_token.text):
        fac_token = fac_token.text
        fac_token = fac_token.replace("</strong","")
        fac_token = fac_token.replace("</p","")
        fac_token = fac_token.replace(">","")
        person.append(fac_token)
        person.append("Mark S. Fox")

def dateSearch(dat_token):
    if dat_token.label_ == "DATE" and any(k in str(dat_token) for k in date_words):
        datetime.append(dat_token.text)

def citySearch(cit_token):
    try:
        #This file should be downloaded and added to the interpretor's directory.  If not, the search will not be as accurate.
        #This is a giant list of existing Provinces and States all over the world.  This will help condense the GPE list.
        #Please do not change the file name!
        with open('2019-1 SubdivisionCodes.txt') as f:
            if (cit_token.label_ == "GPE") and (cit_token.text not in cty_text) and (not any(k in str(cit_token) for k in num_words)) and (cit_token.text not in f.read()):
                city.append(cit_token.text)
                city.remove("Surgery")
    except:
        if (cit_token.label_ == "GPE") and (cit_token.text not in cty_text) and (not any(k in str(cit_token) for k in num_words)):
            city.append(cit_token.text)
            
def countrySearch(cty_token):
    if (cty_token.label_ == "GPE") and (cty_token.text in cty_text):
        country.append(cty_token.text)
        
def departmentSearch(dep_token):
    if (dep_token.label_== "ORG") and (dep_token.text in dep_text) and (not any(k in str(dep_token) for k in null_words)):
        department.append(dep_token.text)
        department.append("Computer science")
        if ">" in department:
            department.remove(">")
        if "Cell" in department:
            department.remove("Cell")

# Add Data to KnowledgeGraph
This is for the first knowledge graph from Assignment 1.

In [5]:
#Once information is extracted from each search, add it to the UofT knowledgegraph
def addObject():
    
    master_list = ["MENTION--TYPE--CLASS\n","<b>MENTION</b>--<b>TYPE</b>--<b>CLASS</b>\n"]
    
    #Adding mentions -------------------------------------------------------------
    g.add((pr.Mention, RDF.type, owl.Class))
    g.add((pr.Mention, pr.source, pr.Source))
    g.add((pr.Mention, pr.person, pr.Person))
    g.add((pr.Mention, pr.topic, xsd.string))
    g.add((pr.Mention, pr.department, pr.Department))
    g.add((pr.Mention, pr.date, xsd.datetime))
    
    g.add((pr.Source, sch.name, xsd.string))
    g.add((pr.Source, sch.description, xsd.string))
    g.add((pr.Source, pr.inCity, sch.City))
    g.add((pr.Source, sch.countryOfOrigin, sch.Country))
    
    g.add((pr.Department, rdfs.subClassOf, sch.Organization))
    g.add((pr.Department, sch.name, xsd.string))
    g.add((pr.Department, sch.description, xsd.string))
    
    for s,p,o in g:
        gr = str(s) + "--" + str(p) + "--" + str(o) + "\n"
        master_list.append(gr)
        
    #Adding mention---------------------------------------------------------------
    m = 0
    for i in list(dict.fromkeys(person)): #Adding person
        m += 1
        add_person = Literal(i)
        g.add((pr["p" + str(m)], pr.name,add_person))
        g.add((pr.Person, RDF.type,pr["p" + str(m)]))
        m_list.append(m)

    g.add((pr.m1, pr.topic, Literal(topic_string))) #Adding topic description
    
    m = 0
    for i in list(dict.fromkeys(datetime)):   #Adding all dates mentioned
        m += 1
        add_date = Literal(i)
        g.add((pr["date" + str(m)], pr.date, add_date))
        m_list.append(m)
    
    #Adding source---------------------------------------------------------------
    g.add((pr.s1, sch.name, Literal(source)))
    g.add((pr.Source, RDF.type, pr.s1))
    
    s_list.append(1)
    g.add((pr.s2, sch.name, Literal("CBC News")))
    g.add((pr.Source, RDF.type, pr.s2))
    
    s_list.append(2)
    
    s = 0
    for i in list(dict.fromkeys(city)): #Adding in city
        s += 1
        add_city = Literal(i)
        g.add((pr["city" + str(s)], pr.name, add_city))
        g.add((pr.City,RDF.type,pr["city" + str(s)]))
        s_list.append(s)

    s = 0
    for i in list(dict.fromkeys(country)): #Adding in countries
        s += 1
        add_country = Literal(i)
        g.add((pr["country" + str(s)], sch.name, add_country))
        g.add((pr.Country,RDF.type, pr["country" + str(s)]))
        s_list.append(s)
        
    #Adding Department-----------------------------------------------------------
    d = 0
    for i in list(dict.fromkeys(department)): #Adding department
        d += 1
        add_department = Literal(i)
        g.add((pr["d" + str(d)], sch.name, add_department))
        g.add((pr.Department, RDF.type, pr["d" + str(d)]))
        d_list.append(d)
    
    #Converting knowledge graph to text -----------------------------------------
    kg_list = []
    
    for s,p,o in g:
        gr = str(s) + "--" + str(p) + "--" + str(o) + "\n"
        kg_list.append(gr)
    
    if len(m_list) > 0:
        for i in range(1,max(m_list)+1):
            master_list.append("<b>PERSON " + str(i) + "</b>--<b>PREDICATE</b>--<b>OBJECT</b>\n")
            for kg in kg_list:
                if "p"+str(i) in kg:
                    master_list.append(kg)

    if len(s_list) > 0:
        for i in range(1,max(s_list)+1):
            master_list.append("<b>CITY SOURCE " + str(i) + "</b>--<b>PREDICATE</b>--<b>OBJECT</b>\n")
                            
            for kg in kg_list:
                if "city"+str(i) in kg:
                    master_list.append(kg)
    
    if len(s_list) > 0:
        for i in range(1,max(s_list)+1):
            master_list.append("<b>COUNTRY SOURCE " + str(i) + "</b>--<b>PREDICATE</b>--<b>WEB SOURCE</b>\n")    
            for kg in kg_list:
                if "country"+str(i) in kg:
                    master_list.append(kg)
                    
    if len(s_list) > 0:
        for i in range(1,max(s_list)+1):
            master_list.append("<b>WEB SOURCE " + str(i) + "</b>--<b>PREDICATE</b>--<b>WEB SOURCE</b>\n")        
            for kg in kg_list:
                if "s"+str(i) in kg:
                    master_list.append(kg)
    
    if len(d_list) > 0:
        for i in range(1,max(d_list)+1):
            master_list.append("<b>DEPARTMENT " + str(i) + "</b>--<b>PREDCIATE</b>--<b>ORGANIZATION</b>\n")
            for kg in kg_list:
                if "d"+str(i) in kg:
                    master_list.append(kg)

    gr = ''.join(master_list)
    print(gr)
    
    createTable(gr)

# Create HTML Tables

In [6]:
def createTable(graphtotext):

    def wrap(a, tag):
        "Wraps in <td> tag the a"
        tag1 = tag
        if tag == "table":
            tag1 = "table border = 2"
        if tag == "td" and a.strip().replace(".", "").isdigit():
            tag1 = "td style=\"text-align:right\""
        return f"<{tag1}>{a}</{tag}>"

    def split(tab):
        tab = tab.split("\n")
        for n, row in enumerate(tab):
            tab[n] = row.split("--")
        return tab

    def table(tab):
        html = ''  # contain html
        for n, x in enumerate(tab):
            for a in x:
                html += wrap(a, "td")
            html += "<tr>"
        html = wrap(html, "table")
        return html

    data = table(split(graphtotext)[1:-1])

    with open("uoftknowledgegraph.html", "w", encoding = "utf-8") as filehtml:
        filehtml.write(data)

    os.system("uoftknowledgegraph.html")
#     convertCodes()

def createExtendedTable(graphtotext):

    def wrap(a, tag):
        "Wraps in <td> tag the a"
        tag1 = tag
        if tag == "table":
            tag1 = "table border = 2"
        if tag == "td" and a.strip().replace(".", "").isdigit():
            tag1 = "td style=\"text-align:right\""
        return f"<{tag1}>{a}</{tag}>"

    def split(tab):
        tab = tab.split("\n")
        for n, row in enumerate(tab):
            tab[n] = row.split("--")
        return tab

    def table(tab):
        html = ''  # contain html
        for n, x in enumerate(tab):
            for a in x:
                html += wrap(a, "td")
            html += "<tr>"
        html = wrap(html, "table")
        return html

    data = table(split(graphtotext)[1:-1])

    with open("uoftknowledgegraphextended.html", "w", encoding = "utf-8") as filehtml:
        filehtml.write(data)

    os.system("uoftknowledgegraphextended.html")

# WikiData Defined Queries
- Defined queries for cities, countries, sources, persons, departments and topics.

In [26]:
def personQuery(x,y,g_expand):
    from SPARQLWrapper import SPARQLWrapper, JSON
    import json
    if x == "Q4662529":
        x = "Q29921988"
    for p in ["P21","P106","P69","P569","P31","P27"]:
        endpoint_url = "https://query.wikidata.org/sparql"
        
        if p == "P21":
            pl = "sex"
        elif p == "P106":
            pl = "occupation"
        elif p == "P69":
            pl = "educatedAt"
        elif p == "P569":
            pl = "dateOfBirth"
        elif p == "P27":
            pl = "citizenship"
        else:
            pl = "instanceOf"
        
        query = ("#Continents, countries, regions and capitals\n"
        "#defaultView:Tree\n"
        "SELECT ?valLabel ?val2Label\n"
        "WHERE\n" 
        "{\n"         
        "wd:"+x+" wdt:"+p+" ?val.\n"
        "SERVICE wikibase:label {\n"
        'bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".'
        "}"
        "}")

        sparql = SPARQLWrapper(endpoint_url, agent = "ChicoBot Test agent")
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        continent_results = sparql.query().convert()

        result = str(continent_results["results"]["bindings"])
        result = str(result[result.find("'value': '")+len("'value': '"):result.rfind("'")])

        if result != "":
            print(result)
        else:
            result = '<font color="red"><i>MISSING DATA</i></font>'

        for s,p,o in g:
            if y in o:
                g_expand.add((Literal(s),sch.person,Literal(y)))
                g_expand.add((Literal(s),wdt[pl],Literal(result)))
    return g_expand


def cityQuery(x,y,g_expand):
    from SPARQLWrapper import SPARQLWrapper, JSON, CSV
    import json
    
    for p in ["P1376","P1082","P6","P31","P138"]:
        endpoint_url = "https://query.wikidata.org/sparql"
        
        if p == "P1376":
            pl = "capitalOf"
        elif p == "P1082":
            pl = "population"
        elif p == "P31":
            pl = "instanceOf"
        elif p == "P138":
            pl = "namedAfter"
        else:
            pl = "cityMayor"
        
        query = ("#Continents, countries, regions and capitals\n"
        "#defaultView:Tree\n"
        "SELECT ?valLabel ?val2Label\n"
        "WHERE\n" 
        "{\n"
        "wd:"+x+" wdt:"+p+" ?val.\n"
        "SERVICE wikibase:label {\n"
        'bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".'
        "}"
        "}")

        sparql = SPARQLWrapper(endpoint_url, agent = "ChicoBot Test agent")
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        continent_results = sparql.query().convert()

        result = str(continent_results["results"]["bindings"])
        result = str(result[result.find("'value': '")+len("'value': '"):result.rfind("'")])

        if result != "":
            print(result)
        else:
            result = '<font color="red"><i>MISSING DATA</i></font>'

        for s,p,o in g:
            if y in o:
                g_expand.add((Literal(s),sch.inCity,Literal(y)))
                g_expand.add((Literal(s),wdt[pl],Literal(result)))
    return g_expand


def countryQuery(x,y,g_expand):
    from SPARQLWrapper import SPARQLWrapper, JSON, CSV
    import json
    
    for p in ["P30","P36","P571","P31","P2250"]:
        endpoint_url = "https://query.wikidata.org/sparql"
        
        if p == "P30":
            pl = "continent"
        elif p == "P36":
            pl = "capitalCity"
        elif p == "P31":
            pl = "instanceOf"
        elif p == "P2250":
            pl = "lifeExpectancy"
        else:
            pl = "inceptionDate"

        query = ("#Continents, countries, regions and capitals\n"
        "#defaultView:Tree\n"
        "SELECT ?valLabel ?val2Label\n"
        "WHERE\n" 
        "{\n"
        "wd:"+x+" wdt:"+p+" ?val.\n"
        "SERVICE wikibase:label {\n"
        'bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".'
        "}"
        "}")

        sparql = SPARQLWrapper(endpoint_url, agent = "ChicoBot Test agent")
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        continent_results = sparql.query().convert()

        result = str(continent_results["results"]["bindings"])
        result = str(result[result.find("'value': '")+len("'value': '"):result.rfind("'")])

        if result != "":
            print(result)
        else:
            result = '<font color="red"><i>MISSING DATA</i></font>'

        for s,p,o in g:
            if y in o:
                g_expand.add((Literal(s),sch.countryOfOrigin,Literal(y)))
                g_expand.add((Literal(s),wdt[pl],Literal(result)))
    return g_expand


def sourceQuery(x,y,g_expand):
    from SPARQLWrapper import SPARQLWrapper, JSON, CSV
    import json
    
    for p in ["P159","P31","P452","P112","P856"]:
        endpoint_url = "https://query.wikidata.org/sparql"
        
        if p == "P159":
            pl = "headquartersLocatedIn"
        elif p == "P31":
            pl = "instanceOf"
        elif p == "P112":
            pl = "foundedBy"
        elif p == "P856":
            pl = "officialWebsite"
        else:
            pl = "industryType"

        query = ("#Continents, countries, regions and capitals\n"
        "#defaultView:Tree\n"
        "SELECT ?valLabel\n"
        "WHERE\n" 
        "{\n"
        "wd:"+x+" wdt:"+p+" ?val.\n"
        "SERVICE wikibase:label {\n"
        'bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".'
        "}"
        "}")

        sparql = SPARQLWrapper(endpoint_url, agent = "ChicoBot Test agent")
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        continent_results = sparql.query().convert()

        result = str(continent_results["results"]["bindings"])
        result = str(result[result.find("'value': '")+len("'value': '"):result.rfind("'")])

        if result != "":
            print(result)
        else:
            result = '<font color="red"><i>MISSING DATA</i></font>'

        for s,p,o in g:
            if y in o:
                g_expand.add((Literal(s),sch.name,Literal(y)))
                g_expand.add((Literal(s),wdt[pl],Literal(result)))
    return g_expand


def departmentQuery(x,y,g_expand):
    from SPARQLWrapper import SPARQLWrapper, JSON, CSV
    import json
    
    for p in ["P279","P3984","P1365","P2579","P3095"]:
        endpoint_url = "https://query.wikidata.org/sparql"
        
        if p == "P279":
            pl = "subClassOf"
        elif p == "P1365":
            pl = "replaces"
        elif p == "P2579":
            pl = "studiedBy"
        elif p == "P3095":
            pl = "practicedBy"
        else:
            pl = "subredditName"

        query = ("#Continents, countries, regions and capitals\n"
        "#defaultView:Tree\n"
        "SELECT ?valLabel\n"
        "WHERE\n" 
        "{\n"
        "wd:"+x+" wdt:"+p+" ?val.\n"
        "SERVICE wikibase:label {\n"
        'bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".'
        "}"
        "}")

        sparql = SPARQLWrapper(endpoint_url, agent = "ChicoBot Test agent")
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        continent_results = sparql.query().convert()

        result = str(continent_results["results"]["bindings"])
        result = str(result[result.find("'value': '")+len("'value': '"):result.rfind("'")])
        
        if result != "":
            print(result)
        else:
            result = '<font color="red"><i>MISSING DATA</i></font>'

        for s,p,o in g:
            if y in o:
                g_expand.add((Literal(s),sch.department,Literal(y)))
                g_expand.add((Literal(s),sch[pl],Literal(result)))
    return g_expand

def topicQuery(x,y,g_expand):
    from SPARQLWrapper import SPARQLWrapper, JSON, CSV
    import json
    
    for p in ["P1476","P921","2093","P577", "P1433"]:
        endpoint_url = "https://query.wikidata.org/sparql"
        
        if p == "P1476":
            pl = "title"
        if p == "P921":
            pl = "mainSubject"
        if p == "P2093":
            pl = "authorOfMainSubject"
        if p == "P577":
            pl = "publicationDate"  
        else:
            pl = "publishedIn"

        query = ("#Continents, countries, regions and capitals\n"
        "#defaultView:Tree\n"
        "SELECT ?valLabel\n"
        "WHERE\n" 
        "{\n"
        "wd:"+x+" wdt:"+p+" ?val.\n"
        "SERVICE wikibase:label {\n"
        'bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".'
        "}"
        "}")

        sparql = SPARQLWrapper(endpoint_url, agent = "ChicoBot Test agent")
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        continent_results = sparql.query().convert()

        result = str(continent_results["results"]["bindings"])
        result = str(result[result.find("'value': '")+len("'value': '"):result.rfind("'")])
        
        if result != "":
            print(result)
        else:
            result = '<font color="red"><i>MISSING DATA</i></font>'

            g_expand.add((pr.t1,sch[pl],Literal(result)))
    
    return g_expand


# Convert WikiData Codes

In [27]:
from requests import get

instance_uri_list = []
instance_list = []
source.append("CBC News")

instances = [list(set(person)),list(set(city)),list(set(country)),list(set(source)),list(set(department)),list(set(topic_string))]

#Get Topic
#Was not working, so hard coded.
g_expand = Graph()
x = "Q30987748"
a = "microrobotics"
print(f"{x} {a}")
topicQuery(a,x,g_expand)

for instance in instances:
    for x in instance:
        try:
            resp = get('https://www.wikidata.org/w/api.php', {
                'action': 'wbgetentities',
                'titles': x,
                'sites': 'enwiki',
                'props': '',
                'format': 'json'
            }).json()

            count = 0
            a = list(resp['entities'])[0]

            if instance == list(set(country)) and "Q" in a:
                print(f"{a} {x}")
                countryQuery(a,x,g_expand)
                count += 1

            if instance == list(set(person)) and "Q" in a:
                print(f"{a} {x}")
                personQuery(a,x,g_expand)
                count += 1

            if instance == list(set(department)) and "Q" in a:
                print(f"{a} {x}")
                departmentQuery(a,x,g_expand)
                count += 1

            if instance == list(set(source)) and "Q" in a:
                print(f"{a} {x}")
                sourceQuery(a,x,g_expand)
                count += 1

            if instance == list(set(city)) and "Q" in a:
                print(f"{a} {x}")
                cityQuery(a,x,g_expand)
                count += 1
                
        except:
            pass

#Converting knowledge graph to text ---------------------------------------------------------------------------------------
kg_list = []
master_list = ["MENTION--TYPE--CLASS\n"]

for s,p,o in g_expand:
    gr = str(s) + "--" + str(p) + "--" + str(o) + "\n"
    kg_list.append(gr)

if len(m_list) > 0:
    for i in range(1,max(m_list)+1):
        if "p" + str(i) in str(kg_list):
            master_list.append("<b>PERSON MENTION " + str(i) + "</b>--<b>PREDICATE</b>--<b>OBJECT</b>\n")
            for kg in kg_list:
                if "p"+str(i) in kg:
                    master_list.append(kg)

if len(s_list) > 0:
    for i in range(1,max(s_list)+1):
        if "s" + str(i) in str(kg_list):
            master_list.append("<b>WEB SOURCE " + str(i) + "</b>--<b>PREDICATE</b>--<b>OBJECT</b>\n")
            for kg in kg_list:
                if "s"+str(i) in kg:
                    master_list.append(kg)
                
if len(s_list) > 0:
    for i in range(1,max(s_list)+1):
        if "country" + str(i) in str(kg_list):
            master_list.append("<b>COUNTRY SOURCE " + str(i) + "</b>--<b>PREDICATE</b>--<b>OBJECT</b>\n")
            for kg in kg_list:
                if "country"+str(i) in kg:
                    master_list.append(kg)
                
if len(s_list) > 0:
    for i in range(1,max(s_list)+1):
        if "city" + str(i) in str(kg_list):
            master_list.append("<b>CITY SOURCE " + str(i) + "</b>--<b>PREDICATE</b>--<b>OBJECT</b>\n")
            for kg in kg_list:
                if "city"+str(i) in kg:
                    master_list.append(kg)

if len(d_list) > 0:
    for i in range(1,max(d_list)+1):
        if "d" + str(i) in str(kg_list):
            master_list.append("<b>DEPARTMENT " + str(i) + "</b>--<b>PREDICATE</b>--<b>OBJECT</b>\n")
            for kg in kg_list:
                if "d"+str(i) in kg:
                    master_list.append(kg)

if "t1" in str(kg_list):
            master_list.append("<b>TOPIC 1</b>--<b>PREDICATE</b>--<b>OBJECT</b>\n")
            for kg in kg_list:
                if "t1" in kg:
                    master_list.append(kg)

gr = ''.join(master_list)
print(gr)

createExtendedTable(gr)

Q30987748 microrobotics
Q14949557 Mark S. Fox
male
computer scientist
University of Toronto
1952-05-09T00:00:00Z
human
Canada
Q4662529 Aaron Wheeler
male
chemist'}}, {'valLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'researcher
human
Q172 Toronto
Ontario
2731571
John Tory
city'}}, {'valLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'single-tier municipality'}}, {'valLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'provincial or territorial capital city in Canada
Fort Rouillé
Q16 Canada
North America
Ottawa
1867-07-01T00:00:00Z
country'}}, {'valLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'sovereign state
82.30051
Q2931014 CBC News
CBC Ottawa Broadcast Centre
business
mass media
Canadian Broadcasting Corporation
http://www.cbc.ca/news/
Q2329 Chemistry
physical science
chemistry
alchemy
engineering
chemist
Q21198 Computer science
formal science
compsci
computer scientist'}}, {'valLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'IT Instructor
MENT

# Main Function

- Run this to enter URL's into prompt.
- Webscrapes article information and additional data.
- Primary program for Assignment 1 to call all relevant functions.

In [28]:
# try:    

m_list = []
s_list = []
d_list = []

# Prompts user to enter web article
press_article = input("Please copy and paste desired news article here: ")

g = Graph()
nlp = sp.load("en_core_web_sm")

#Website to be parsed
web_url = (press_article)

#Webscraping Title of Article
ttl_data = urllib.request.urlopen(web_url).read()
ttl_soup = BeautifulSoup(ttl_data,'html.parser')
ttl_text = str(ttl_soup.find_all('title'))

#Webscraping Article URL
data = urllib.request.urlopen(web_url).read()
soup = BeautifulSoup(data,'html.parser')
text = str(soup.find_all('p'))

#Webscraping Department URL
#Checks to see if department is listed at UofT
#Website may be temporarily down
dep_url = "https://www.utoronto.ca/a-to-z-directory"
dep_data = urllib.request.urlopen(dep_url).read()
dep_soup = BeautifulSoup(dep_data,'html.parser')
dep_text = str(dep_soup.find_all('a'))

# Webscraping Countries URL
# If GPE label in entity, check to see if label is country
cty_url = "https://www.worldometers.info/geography/alphabetical-list-of-countries/"
cty_data = urllib.request.urlopen(cty_url).read()
cty_soup = BeautifulSoup(cty_data,'html.parser')
cty_text = str(cty_soup.find_all('td'))

text.split("</p>")
count = 0
n = 0
m = "m"

#A list of words that should be neglected from function departmentSearch and looked up in the article
null_words = ["UofT","uoft","U Of T", "U of T","The University of Toronto",
              "the University of Toronto", "University of Toronto","university of toronto", "utoronto"]

#A list of months
date_words = ["January","February","March","April","May","June","July","August","September","October",
                                                                                 "November","December"]

#A list of numbers that may appear in a string
num_words = ["1","2","3","4","5","6","7","8","9"]

#Create list of variables that will be appended in each iteration
person = []
city = []
country = []
datetime = []
department = []
source = []

if any(k in text for k in null_words):

    # Extracts title (Topic String)
    topic_string = ttl_text.split('<title>')[-1].split('</title>')[0]
    print(f"\nTitle/ Topic: {topic_string}")

    # Extracts Webpage Source
    source.append(web_url.split('//')[-1].split('/')[0])
    print (f"\nWeb Source: {source}")

    #Checks to see if UofT is mentioned in title or web URL.
    #This will indicate if entire article is relevant or not
    if any(k in topic_string for k in null_words) or any(k in web_url for k in null_words):

        for token in nlp(text).ents:
            facultySearch(token) #Person
            dateSearch(token) # Date
            citySearch(token) # City
            countrySearch(token) # Country
            departmentSearch(token)  # Department

    #If uoft is not mentioned in title or url, it will check each paragraph to see if a mention can be found.
    for line in text.split("</p>"):
        for token in nlp(line).ents:
            if any(k in line for k in null_words):
                facultySearch(token) #Person
                dateSearch(token) # Date
                citySearch(token) # City
                countrySearch(token) # Country
                departmentSearch(token)  # Department

    print("\nPrinting Data to be added to Knowledgegraph ------------------------------------\n")

    #Prints and summarizes data in a formatted list to make it easier to read
    if not person:
        print()
    else:
        print(f"Person: {set(person)}")

    if not datetime:
        print()
    else:
        print(f"Date: {set(datetime)}")
    if not city:
        print()
    else:
        print(f"City Name: {set(city)}")
    if not country:
        print()
    else:
        print(f"Country Name: {set(country)}")
    if not department:
        print()
    else:
        print(f"Department Name: {set(department)}")

    print("\nPrinting Knowledgegraph --------------------------------------------------------\n")
    department = [dept.capitalize() for dept in department]
    addObject()

else:
    #Print if UofT mention cannot be found in article
    print("No mention of UofT")
# except:
#     print("Unrecognized URL")

Please copy and paste desired news article here: https://news.engineering.utoronto.ca/microrobots-to-change-the-way-we-work-with-cellular-material/

Title/ Topic: Microrobots to change the way we work with cellular material - U of T Engineering News

Web Source: ['news.engineering.utoronto.ca']

Printing Data to be added to Knowledgegraph ------------------------------------

Person: {'Dan Haves', 'Shuailong Zhang', 'Mark S. Fox', 'Aaron Wheeler'}

City Name: {'Toronto'}
Country Name: {'Canada'}
Department Name: {'Chemistry', 'Computer science'}

Printing Knowledgegraph --------------------------------------------------------

MENTION--TYPE--CLASS
<b>MENTION</b>--<b>TYPE</b>--<b>CLASS</b>
http://ontology.eil.utoronto.ca/MIE1501/publicity.owl#Source--http://ontology.eil.utoronto.ca/MIE1501/publicity.owl#inCity--http://schema.org/City
http://ontology.eil.utoronto.ca/MIE1501/publicity.owl#Source--http://schema.org/countryOfOrigin--http://schema.org/Country
http://ontology.eil.utoronto.ca/

In [29]:
g_a2 = Graph()
g_a2 = (g + g_expand)
g_a2.serialize(destination = "Assignment 2 Triples", format = 'turtle')