In [1]:
# Define Neo4j connections
import os

import pandas as pd
from neo4j import GraphDatabase
# Connection settings can be overridden through environment variables so that
# real credentials never have to be written into the notebook. The fallback
# values are the ones this notebook originally used, so it still runs unchanged.
# NOTE(review): the fallback password is still a literal secret — rotate it and
# drop the default before sharing or committing this notebook.
host = os.environ.get('NEO4J_URI', 'bolt://3.83.239.168:7687')
user = os.environ.get('NEO4J_USER', 'neo4j')
password = os.environ.get('NEO4J_PASSWORD', 'swim-ram-percents')
driver = GraphDatabase.driver(host, auth=(user, password))

def run_query(query, params=None):
    """Run a Cypher query and return the result as a pandas DataFrame.

    Args:
        query: Cypher statement to execute.
        params: Optional mapping of query parameters. When omitted, an empty
            mapping is sent.

    Returns:
        A DataFrame with one row per returned record and one column per
        result key.
    """
    # A None sentinel replaces the former mutable `{}` default, which would be
    # one dict object shared by every call.
    if params is None:
        params = {}
    with driver.session() as session:
        result = session.run(query, params)
        # Build the frame while the session is still open.
        return pd.DataFrame([r.values() for r in result], columns=result.keys())

In [2]:
# Add a uniqueness constraint on Resource.uri (skipped when it already exists).
# NOTE(review): presumably a prerequisite for the n10s calls in later cells —
# confirm against the Neosemantics documentation.
run_query("""
CREATE CONSTRAINT n10s_unique_uri IF NOT EXISTS ON (r:Resource)
ASSERT r.uri IS UNIQUE
""")

In [3]:
# Initialise the Neosemantics graph configuration with handleVocabUris='MAP'
# and applyNeo4jNaming=true; the call returns the resulting parameter table.
run_query("""
CALL n10s.graphconfig.init({
  handleVocabUris: 'MAP',
  applyNeo4jNaming: true
})
""")

Unnamed: 0,param,value
0,handleVocabUris,MAP
1,handleMultival,OVERWRITE
2,handleRDFTypes,LABELS
3,keepLangTag,False
4,keepCustomDataTypes,False
5,applyNeo4jNaming,True
6,baseSchemaNamespace,neo4j://graph.schema#
7,baseSchemaPrefix,n4sch
8,classLabel,Class
9,subClassOfRel,SCO


In [4]:
import requests
from requests.structures import CaseInsensitiveDict

# Query to import RDF/XML data to Neo4j using Neosemantics: every URL in the
# $data list parameter is passed to n10s.rdf.import.fetch, and the per-URL
# triplesLoaded counts are summed into a single totalTriplesLoaded value.
import_rdf_query = """
UNWIND $data AS link
CALL n10s.rdf.import.fetch(
  link,
  'RDF/XML'
) YIELD triplesLoaded
RETURN sum(triplesLoaded) AS totalTriplesLoaded
"""

def make_request(uri, timeout=30):
    """Send a GET request to `uri` with a browser-like User-Agent header.

    Args:
        uri: URL to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        The response object returned by `requests.get`.
    """
    # For some reason, the API only works when I pretend to be a browser
    headers = CaseInsensitiveDict()
    headers["user-agent"] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36"
    return requests.get(uri, headers=headers, timeout=timeout)
    
def ukgazzette_to_neo4j(pages=1, categorycode="", noticetype=""):
    """Import the newest Gazette notices into Neo4j, one result page at a time.

    For every result page the notice listing is fetched as JSON, each notice id
    is turned into the URL of its RDF/XML linked-data document, and the URLs
    are handed to `import_rdf_query` through `run_query`. The DataFrame
    returned for each page is printed.

    Args:
        pages: Number of result pages to import (100 notices are requested
            per page).
        categorycode: Optional value for the `categorycode` filter; the
            filter is omitted when empty.
        noticetype: Optional value for the `noticetype` filter; the filter is
            omitted when empty.

    Returns:
        None. Importing stops early when a page cannot be fetched or parsed,
        or when a page contains no notices.
    """
    # None of these depend on the page number, so build them once.
    base_url = "https://www.thegazette.co.uk/all-notices/notice/data.json"
    ccode = "categorycode=" + categorycode + "&" if categorycode else ""
    ntype = "noticetype=" + noticetype + "&" if noticetype else ""

    for page in range(1, pages + 1):
        parameters = f"?{ccode}{ntype}results-page-size=100&sort-by=latest-date&results-page={page}"

        # Reset per page so the error handler never touches an unbound name
        # (request failed on page 1) or a stale response from an earlier page.
        response = None
        try:
            response = make_request(base_url + parameters)
            response_json = response.json()
        except Exception as e:
            if response is not None:
                print(response.text)
            print(e)
            break

        # A listing without entries means there is nothing left to import.
        entries = response_json.get('entry', [])
        if not entries:
            print(f"No notices returned for page {page}; stopping.")
            break

        # Define RDF/XML URL links; the notice id is the last path segment.
        data = [
            f"https://www.thegazette.co.uk/notice/{notice['id'].split('/')[-1]}/data.rdf?view=linked-data"
            for notice in entries
        ]

        # Import RDF into Neo4j with Neosemantics
        query_response = run_query(import_rdf_query, {'data': data})
        print(query_response)

In [5]:
# Import last 1000 state notices: 10 result pages of 100 notices each,
# filtered to category code "11".
ukgazzette_to_neo4j(10, "11")

   totalTriplesLoaded
0                3519
   totalTriplesLoaded
0                4918
   totalTriplesLoaded
0                4953
   totalTriplesLoaded
0                5008
   totalTriplesLoaded
0                5110
   totalTriplesLoaded
0                4997
   totalTriplesLoaded
0                4899
   totalTriplesLoaded
0                4932
   totalTriplesLoaded
0                4928
   totalTriplesLoaded
0                4875


In [6]:
# Who has received any awards: the five most recent award notices (by
# relatedDate) with the awardee, their job title and their organisation.

run_query("""
MATCH (award)<-[:ISAWARDED]-(t:AwardandHonourThing)-[:HASAWARDEE]->(person)-[:HASEMPLOYMENT]->(employment)-[:ISMEMBEROFORGANISATION]->(organization)
RETURN award.label AS award,
       t.relatedDate AS relatedDate,
       person.name AS person,
       employment.jobTitle AS jobTitle,
       organization.name AS organization
ORDER BY relatedDate DESC
LIMIT 5
""")


Unnamed: 0,award,relatedDate,person,jobTitle,organization
0,B.E.M.,2021-12-31,Darren Peter CLARK,"First Secretary, Operational Delivery Manager",British Embassy Yangon
1,B.E.M.,2021-12-31,Brian WARING,Desk Officer,"Foreign, Commonwealth and Development Office"
2,B.E.M.,2021-12-31,Bryn Owen WILLIAMS,Desk Officer,"Foreign, Commonwealth and Development Office"
3,B.E.M.,2021-12-31,Natalie Claire COLEMAN,Director,National Gallery of the Cayman Islands
4,B.E.M.,2021-12-31,Patricia Susan OWENS,Corporate Services Manager,Miami Consulate General


In [7]:
# Up to five most recent appointments to a CommanderOrderOfTheBritishEmpire
# node, with the appointee and the authority attached to the notice.
run_query("""
MATCH (n:CommanderOrderOfTheBritishEmpire)<-[:ISAPPOINTEDAS]-(notice)-[:HASAPPOINTEE]->(appointee),
      (notice)-[:HASAUTHORITY]->(authority)
RETURN n.label AS award,
       notice.relatedDate AS date,
       appointee.name AS appointee,
       authority.label AS authority
ORDER BY date DESC
LIMIT 5
""")

Unnamed: 0,award,date,appointee,authority
0,C.B.E.,2021-12-31,Bernard John TAUPIN,Central Chancery of the Orders of Knighthood
1,C.B.E.,2021-12-31,Robert Adrian STRINGER,Central Chancery of the Orders of Knighthood
2,C.B.E.,2021-12-22,Dr. Kai Hung LEE,Central Chancery of the Orders of Knighthood


In [8]:
# Up to five most recent legislation notices with their provenance URI; the
# labels of all related legislation are collected into one list per notice.
run_query("""
MATCH (provenance)<-[:HAS_PROVENANCE]-(n:Notice)-[:ISABOUT]->(l:Legislation:NotifiableThing)-[:RELATEDLEGISLATION]->(related)
RETURN n.hasNoticeID AS noticeID,
       n.uri AS noticeURI,
       l.relatedDate AS date,
       provenance.uri AS provenance,
       collect(related.label) AS relatedLegislations
ORDER BY date DESC
LIMIT 5
""")

Unnamed: 0,noticeID,noticeURI,date,provenance,relatedLegislations
0,3999024,https://www.thegazette.co.uk/id/notice/3999024,2022-02-21,https://www.thegazette.co.uk/id/notice/3999024...,[Universities of Oxford and Cambridge Act 1923]
1,3999023,https://www.thegazette.co.uk/id/notice/3999023,2022-02-21,https://www.thegazette.co.uk/id/notice/3999023...,[Universities of Oxford and Cambridge Act 1923]
2,3992302,https://www.thegazette.co.uk/id/notice/3992302,2022-02-14,https://www.thegazette.co.uk/id/notice/3992302...,[BURIAL ACT 1853]
3,3991643,https://www.thegazette.co.uk/id/notice/3991643,2022-02-14,https://www.thegazette.co.uk/id/notice/3991643...,[BURIAL ACT 1853]


In [9]:
# Import 10 result pages (100 notices each) for category codes 26 and 27,
# restricted to notice types 2602, 2608 and 2702. Per the original note these
# cover: Redemption or purchase of own shares out of capital, Company director
# disqualification order, Dissolution of partnership.
# NOTE(review): which code maps to which title cannot be confirmed from this notebook.
ukgazzette_to_neo4j(10, "26+27", "2602+2608+2702")

   totalTriplesLoaded
0                3998
   totalTriplesLoaded
0                4030
   totalTriplesLoaded
0                3894
   totalTriplesLoaded
0                3644
   totalTriplesLoaded
0                3300
   totalTriplesLoaded
0                4050
   totalTriplesLoaded
0                3905
   totalTriplesLoaded
0                4029
   totalTriplesLoaded
0                4228
   totalTriplesLoaded
0                4144


In [10]:
# Up to five most recent partnership dissolution notices, showing the
# partnership concerned and the legislation that enables the notice.
run_query("""
MATCH (n:PartnershipDissolutionNotice)-[:ISABOUT]->(notifiableThing)-[:HASCOMPANY]->(partnership),
      (notifiableThing)-[:ISENABLEDBYLEGISLATION]->(enabledby)
RETURN n.hasNoticeID AS noticeID,
       notifiableThing.relatedDate AS date,
       notifiableThing.uri AS noticeURI,
       enabledby.label AS enablingLegislation,
       partnership.name AS partnership
ORDER BY date DESC
LIMIT 5
""")

Unnamed: 0,noticeID,date,noticeURI,enablingLegislation,partnership
0,4001999,2022-02-25,https://www.thegazette.co.uk/id/notice/4001999...,LIMITED PARTNERSHIPS ACT 1907,PRAMERICA REAL ESTATE CAPITAL I (SCOTLAND) LIM...
1,3996994,2022-02-18,https://www.thegazette.co.uk/id/notice/3996994...,PARTNERSHIP ACT 1890,\n W H MAYES PARTNERSHIP
2,3998712,2022-02-16,https://www.thegazette.co.uk/id/notice/3998712...,LIMITED PARTNERSHIPS ACT 1907 & PARTNERSHIP AC...,\n Edammer Limited Partnership
3,3991200,2022-02-15,https://www.thegazette.co.uk/id/notice/3991200...,LIMITED PARTNERSHIPS ACT 1907,17 CAPITAL MEZZANINE CO-INVEST LP
4,3991197,2022-02-11,https://www.thegazette.co.uk/id/notice/3991197...,LIMITED PARTNERSHIPS ACT 1907,17CAPITAL (OLYMPUS) LP


In [11]:
# Up to five most recent RedemptionOrPurchase notices with the company
# involved; related legislation labels are collected into one list per row.
run_query("""
MATCH (legislation)<-[:RELATEDLEGISLATION]-(n:RedemptionOrPurchase)-[:HASCOMPANY]->(company)
RETURN n.relatedDate AS date,
       company.name AS company,
       company.uri AS companyURI,
       collect(legislation.label) AS relatedLegislations,
       n.uri AS noticeURI
ORDER BY date DESC
LIMIT 5
""")

Unnamed: 0,date,company,companyURI,relatedLegislations,noticeURI
0,2022-02-15,G. & B. (NORTH WEST) LIMITED,http://business.data.gov.uk/id/company/01797547,"[Companies Act 2006, Companies Act 2006, s. 719]",https://www.thegazette.co.uk/id/notice/3997065...
1,2022-02-01,\n RAS CAPITAL NO 1 LIMITED,http://business.data.gov.uk/id/company/10153195,"[Companies Act 2006, Companies Act 2006, s. 719]",https://www.thegazette.co.uk/id/notice/3981164...
2,2022-01-07,ROOSTEN LIMITED,http://business.data.gov.uk/id/company/08123072,"[Companies Act 2006, Companies Act 2006, s. 719]",https://www.thegazette.co.uk/id/notice/3969127...
3,2022-01-07,WOODMAN MOTOR COMPANY LIMITED,http://business.data.gov.uk/id/company/06453796,"[Companies Act 2006, s. 714, Companies Act 200...",https://www.thegazette.co.uk/id/notice/3965706...
4,2021-12-27,APPLABS TECHNOLOGIES (UK) PVT LIMITED,https://www.thegazette.co.uk/id/notice/3961668...,"[Companies Act 2006, s. 721, Companies Act 200...",https://www.thegazette.co.uk/id/notice/3961668...


In [12]:
# Up to five most recent notices about a change in partnership members, with
# the notice ID/URI and the name of the partnership (company node).
run_query("""
MATCH (notice)-[:ISABOUT]->(n:PartnershipChangeInMembers)-[:HASCOMPANY]->(company)
RETURN notice.hasNoticeID AS noticeID,
       notice.uri AS noticeURI,
       n.relatedDate AS date,
       company.name AS company
ORDER BY date DESC
LIMIT 5
""")

Unnamed: 0,noticeID,noticeURI,date,company
0,3984531,https://www.thegazette.co.uk/id/notice/3984531,2022-02-03,WALES FAMILY PARTNERSHIP
1,3981940,https://www.thegazette.co.uk/id/notice/3981940,2022-02-01,INVERGORDON D SCOTTISH LIMITED PARTNERSHIP
2,3985272,https://www.thegazette.co.uk/id/notice/3985272,2022-01-31,Gilberts Chartered Accountants
3,3970041,https://www.thegazette.co.uk/id/notice/3970041,2022-01-14,\n KINGSWOOD SURGERY TUNBRIDGEWELLS
4,3964974,https://www.thegazette.co.uk/id/notice/3964974,2022-01-05,Thornton & Wright Opticians


In [21]:
from bs4 import BeautifulSoup as bs
import spacy

# Load the spaCy "en_core_web_sm" pipeline once at cell level; extract_entities
# reuses this object for every notice it processes.
nlp = spacy.load("en_core_web_sm")

def extract_entities(noticeId):
    """Print the text of a Gazette notice and the people/organisations found in it.

    Downloads the notice's XML, joins the text of every `<p>` element marked
    `data-gazettes="Text"`, runs the result through the spaCy pipeline `nlp`,
    and prints each PERSON or ORG entity followed by its label.

    Args:
        noticeId: Identifier of the notice, as used in the notice URL.

    Returns:
        None; all results are printed.
    """
    print(f"\nExtracting entities for {noticeId}")
    uri = f"https://www.thegazette.co.uk/notice/{noticeId}/data.xml?download=true"
    content = make_request(uri).content
    bs_content = bs(content, "lxml")
    # find_all is the current Beautiful Soup name; findAll is the deprecated alias.
    paragraphs = bs_content.find_all("p", {"data-gazettes": "Text"})
    text = " ".join(el.text for el in paragraphs)
    print(text)
    doc = nlp(text)
    # Find named entities, phrases and concepts
    print('Entities \n --------------------')
    for entity in doc.ents:
        # Only people and organisations are of interest here.
        if entity.label_ in ('PERSON', 'ORG'):
            print(entity.text, entity.label_)


In [22]:
# Collect the notice IDs of up to five notices about a change in partnership
# members (no ORDER BY, so which five are returned is up to the database).
partnership_changes = run_query("""
MATCH (notice)-[:ISABOUT]->(n:PartnershipChangeInMembers)
RETURN notice.hasNoticeID AS noticeID
LIMIT 5
""")['noticeID'].to_list()

# Download each selected notice and print the PERSON/ORG entities found in it.
for i in partnership_changes:
    extract_entities(i)


Extracting entities for 3996989

Pursuant to section 10 of the Limited Partnerships Act 1907, notice is hereby given in respect of IIF UK 1 LP, a limited partnership registered in England with registered number LP012764 (the “Partnership”), that:

 1.	FCA Pension Plan Trustee Limited as trustee of the FCA Pension Plan was admitted as a new limited partner of the Partnership.

          
Entities 
 --------------------
IIF UK 1 LP ORG
LP012764 ORG
the FCA Pension Plan ORG

Extracting entities for 3989075

Notice is hereby given that Dr Stephen Kirkham, Dr Brij Patel and Dr John Hampson ceased to be a partners at Tower Family Healthcare, 16 Market St, Tottington, Bury, BL8 4AD. Dr Stephen Kirkham with effect from the 31st December 2021, Dr Brij Patel with effect from the 17th December 2021 and Dr John Hampson with effect from the 30th June 2021 . The business will continue with the remaining partners

Entities 
 --------------------
Stephen Kirkham PERSON
Dr Brij Patel PERSON
John Hamps