In [1]:
# https://rebeccabilbro.github.io/sparql-from-python/
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

question_df = pd.DataFrame()
sparql  = SPARQLWrapper("https://api.parliament.uk/sparql/")
sparql.setQuery("""
SELECT *
WHERE {
?question <https://id.parliament.uk/schema/writtenQuestionIndexingAndSearchUin> ?qnum .
?person <https://id.parliament.uk/schema/askingPersonHasQuestion> ?question .
?person <https://id.parliament.uk/schema/wikidataThingHasEquivalentWikidataResource> ?wikidataperson.
?question <https://id.parliament.uk/schema/questionText> ?text .
?question <https://id.parliament.uk/schema/questionAskedAt> ?date .
FILTER (?date >= "2023-01-01+00:00"^^xsd:dateTime && ?date < "2023-10-01+00:00"^^xsd:dateTime)
FILTER regex(?qnum, "^(?!HL)") 
}
""")

sparql.setReturnFormat(JSON)
result = sparql.query().convert()
# results -> bindings returns a list of dictionaries
result = result["results"]["bindings"]

# removed type from the results
for item in result:
    for key in item:
        item[key] = item[key]["value"]


In [2]:
question_df = pd.DataFrame(result)

question_df["text"] = question_df["text"].str.replace("<p>", "", )
question_df["text"] = question_df["text"].str.replace("</p>", "", )

question_df

Unnamed: 0,question,qnum,person,wikidataperson,text,date
0,https://id.parliament.uk/0IwOO533,903422,https://id.parliament.uk/Vs3bGLNz,http://www.wikidata.org/entity/Q689287,Which three departments had the lowest proport...,2023-01-05+00:00
1,https://id.parliament.uk/dRJ9254g,117748,https://id.parliament.uk/eD0yd5Ec,http://www.wikidata.org/entity/Q19871931,"To ask the Secretary of State for Foreign, Com...",2023-01-06+00:00
2,https://id.parliament.uk/e4IDRkKi,117410,https://id.parliament.uk/RlIqlixq,http://www.wikidata.org/entity/Q695228,"To ask the Secretary of State for Foreign, Com...",2023-01-06+00:00
3,https://id.parliament.uk/rXZ0r1JW,117477,https://id.parliament.uk/1DzMUkBy,http://www.wikidata.org/entity/Q337812,"To ask the Secretary of State for Foreign, Com...",2023-01-06+00:00
4,https://id.parliament.uk/SmC4cGhg,117777,https://id.parliament.uk/tOfis7j9,http://www.wikidata.org/entity/Q27942399,To ask the Secretary of State for Health and S...,2023-01-06+00:00
...,...,...,...,...,...,...
26860,https://id.parliament.uk/pkosM2MK,906563,https://id.parliament.uk/wSRcmnqE,http://www.wikidata.org/entity/Q20127903,What steps his Department is taking to reduce ...,2023-09-19+01:00
26861,https://id.parliament.uk/JmAZSx2y,906567,https://id.parliament.uk/Vs3bGLNz,http://www.wikidata.org/entity/Q689287,If he will take steps to expedite the approval...,2023-09-19+01:00
26862,https://id.parliament.uk/dPiUF1wi,906604,https://id.parliament.uk/Fx1EcmX5,http://www.wikidata.org/entity/Q304027,What discussions he has had with the Welsh Gov...,2023-09-19+01:00
26863,https://id.parliament.uk/dhVF7qpN,906500,https://id.parliament.uk/iqJbTPtY,http://www.wikidata.org/entity/Q1681420,"To ask the Secretary of State for Environment,...",2023-09-19+01:00


In [3]:

district_df = pd.DataFrame()
sparql  = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
SELECT ?person ?district ?personLabel ?districtLabel
WHERE
{
  ?person p:P39 ?position.
  ?position ps:P39 wd:Q77685926.
  ?position pq:P768 ?district
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
""")

sparql.setReturnFormat(JSON)
result = sparql.query().convert()
# results -> bindings returns a list of dictionaries
result = result["results"]["bindings"]

# removed type from the results
for item in result:
    for key in item:
        item[key] = item[key]["value"]
district_df = pd.DataFrame(result)
district_df

Unnamed: 0,district,person,personLabel,districtLabel
0,http://www.wikidata.org/entity/Q1032152,http://www.wikidata.org/entity/Q259707,Stuart Andrew,Pudsey
1,http://www.wikidata.org/entity/Q3138501,http://www.wikidata.org/entity/Q261773,Rosie Winterton,Doncaster Central
2,http://www.wikidata.org/entity/Q613294,http://www.wikidata.org/entity/Q263076,Robin Walker,Worcester
3,http://www.wikidata.org/entity/Q1031940,http://www.wikidata.org/entity/Q263350,Jake Berry,Rossendale and Darwen
4,http://www.wikidata.org/entity/Q1031751,http://www.wikidata.org/entity/Q263508,Clive Betts,Sheffield South East
...,...,...,...,...
697,http://www.wikidata.org/entity/Q1031884,http://www.wikidata.org/entity/Q122451364,Michael Shanks,Rutherglen and Hamilton West
698,http://www.wikidata.org/entity/Q3337694,http://www.wikidata.org/entity/Q19871819,Lisa Cameron,"East Kilbride, Strathaven and Lesmahagow"
699,http://www.wikidata.org/entity/Q988057,http://www.wikidata.org/entity/Q123113780,Sarah Edwards,Tamworth
700,http://www.wikidata.org/entity/Q1072632,http://www.wikidata.org/entity/Q122848847,Alistair Luke Strathern,Mid Bedfordshire


In [4]:
# 650 seats in the house of commons, yet 669 unique people returned
print(district_df["person"].duplicated().sum())
# return a df of people who arew in duplicate_people and not in duplicate_districts

district_df[district_df["person"].duplicated()][["person"]].head()

33


Unnamed: 0,person
42,http://www.wikidata.org/entity/Q5162256
85,http://www.wikidata.org/entity/Q75818970
86,http://www.wikidata.org/entity/Q75818970
144,http://www.wikidata.org/entity/Q478161
164,http://www.wikidata.org/entity/Q5240825


In [5]:
# Of the people who appear more than once, do they have different districts in each occurence?
# This might cause problem if someone has multiple districts associated with them
district_df[district_df["person"].duplicated(keep=False)][~district_df["district"].duplicated(keep=False)]

  district_df[district_df["person"].duplicated(keep=False)][~district_df["district"].duplicated(keep=False)]


Unnamed: 0,district,person,personLabel,districtLabel


In [6]:
# all duplicated people have the same district in both. So we can remove duplicates of people
# duplicate districts are caused by multiple people being elected in the same district

district_df_cleaned = district_df.drop_duplicates()

# there a some duplicate districts still because of members being suspended and then a by-election being called


# only keep the people who are in the question_df
district_df_cleaned = district_df_cleaned[district_df_cleaned["person"].isin(question_df["wikidataperson"])]
district_df_cleaned

Unnamed: 0,district,person,personLabel,districtLabel
2,http://www.wikidata.org/entity/Q613294,http://www.wikidata.org/entity/Q263076,Robin Walker,Worcester
3,http://www.wikidata.org/entity/Q1031940,http://www.wikidata.org/entity/Q263350,Jake Berry,Rossendale and Darwen
4,http://www.wikidata.org/entity/Q1031751,http://www.wikidata.org/entity/Q263508,Clive Betts,Sheffield South East
5,http://www.wikidata.org/entity/Q1032065,http://www.wikidata.org/entity/Q264305,Crispin Blunt,Reigate
6,http://www.wikidata.org/entity/Q988407,http://www.wikidata.org/entity/Q264560,Karen Bradley,Staffordshire Moorlands
...,...,...,...,...
664,http://www.wikidata.org/entity/Q1070045,http://www.wikidata.org/entity/Q30164696,Christine Jardine,Edinburgh West
665,http://www.wikidata.org/entity/Q989007,http://www.wikidata.org/entity/Q30169287,Tanmanjeet Singh Dhesi,Slough
666,http://www.wikidata.org/entity/Q3137955,http://www.wikidata.org/entity/Q30174665,Darren Jones,Bristol North West
669,http://www.wikidata.org/entity/Q3336117,http://www.wikidata.org/entity/Q30234240,Ellie Reeves,Lewisham West and Penge


In [7]:
# Check if there are any people who have asked questions but are not in the district_df
question_df[~question_df["wikidataperson"].isin(district_df_cleaned["person"])]

Unnamed: 0,question,qnum,person,wikidataperson,text,date


In [58]:
import random
import spacy  # version 3.0.6'

# initialize language model
nlp = spacy.load("en_core_web_md")

# add pipeline (declared through entry_points in setup.py)
nlp.add_pipe("entityLinker", last=True)

# pick random number
x = random.randint(0, len(question_df))

print(question_df["text"].iloc[x])

doc = nlp(question_df["text"].iloc[x])
doc._.linkedEntities.print_super_entities()
# for sent in doc.sents:
#     sent._.linkedEntities.pretty_print()

To ask the Secretary of State for Health and Social Care, how many medically fit patients in acute wards are awaiting discharge in (a) Kettering and (b) Northampton General Hospital; and what proportion of the total available beds this constitutes in both locations.
public office (1) : United States Secretary of State
secretary of state (1) : United States Secretary of State
foreign minister (1) : United States Secretary of State
software (1) : State
musical group (1) : Health
assist (1) : home care
type of medical procedure (1) : home care
medical procedure (1) : home care
hospitalization (1) : home care
state (1) : patient


In [97]:
import requests
token = "5fcf52fb-79b2-4718-b7ee-62957df7d7e0-843339462"

def pick_random_question():
    x = random.randint(0, len(question_df))
    return question_df["text"].iloc[x]


def tag_me(text=None):
    if not text:
        text = pick_random_question()
        print(text)


    address = f"https://tagme.d4science.org/tagme/tag?lang=en&gcube-token={token}&include_categories=false&include_abstract=false&text={text}"
    response = requests.get(address)

    annotations = response.json()["annotations"]
    new_annotations = []

    for spot in annotations:
        # remove if rho is less than 0.1
        if spot["rho"] >= 0.1:
            new_annotations.append(spot)

    return new_annotations


def WAT(text=None):
    if not text:
        text = pick_random_question()
        print(text)
       
    address = f"https://wat.d4science.org/wat/tag/tag?lang=en&gcube-token={token}&include_categories=true&text={text}"
    response = requests.get(address)
    annotations = response.json()["annotations"]
    new_annotations = []

    for spot in annotations:
        # remove if rho is less than 0.1
        if spot["rho"] >= 0.1:
            new_annotations.append(spot)
    return new_annotations


def REL(text=None):
    if not text:
        text = pick_random_question()
        print(text)
    API_URL = "https://rel.cs.ru.nl/api"

    # Example ED.
    ed_result = requests.post(API_URL, json={
        "text": text,
    })
    assert ed_result.status_code == 200
    
    return ed_result.json()

In [98]:
tag_me()

To ask the Secretary of State for Foreign, Commonwealth and Development Affairs, what discussions has he had with international counterparts on (a) strengthening and (b) supporting the mandate of the UN Special Rapporteur on Human Rights in Myanmar.


[{'spot': 'Secretary of State',
  'start': 11,
  'link_probability': 0.15942679345607758,
  'rho': 0.2051485776901245,
  'end': 29,
  'id': 32293,
  'title': 'United States Secretary of State'},
 {'spot': 'Commonwealth',
  'start': 43,
  'link_probability': 0.12357430905103683,
  'rho': 0.17639176547527313,
  'end': 55,
  'id': 21175158,
  'title': 'Commonwealth of Nations'},
 {'spot': 'international',
  'start': 114,
  'link_probability': 0.003986390773206949,
  'rho': 0.16677074134349823,
  'end': 127,
  'id': 8195726,
  'title': 'International law'},
 {'spot': 'mandate',
  'start': 185,
  'link_probability': 0.03256174549460411,
  'rho': 0.1051044911146164,
  'end': 192,
  'id': 18139,
  'title': 'League of Nations mandate'},
 {'spot': 'UN Special Rapporteur on Human Rights',
  'start': 200,
  'link_probability': 0.375,
  'rho': 0.3059466779232025,
  'end': 237,
  'id': 1374719,
  'title': 'United Nations Special Rapporteur'},
 {'spot': 'Human Rights in Myanmar',
  'start': 225,
  '

In [67]:
# attempt to use tagme API to get entities

import requests
x = random.randint(0, len(question_df))
x = 18990
print(x)

print(question_df["text"].iloc[x])
address = f"https://tagme.d4science.org/tagme/tag?lang=en&gcube-token=5fcf52fb-79b2-4718-b7ee-62957df7d7e0-843339462&include_categories=true&include_abstract=true&text={question_df['text'].iloc[x]}"
response = requests.get(address)
annotations = response.json()["annotations"]
new_annotations = []
for i, spot in enumerate(annotations):
    # remove if rho is less than 0.1
    if spot["rho"] >= 0.1:
        new_annotations.append(spot)

# print keys
        
for spot in new_annotations:
    #print(spot["spot"], spot["rho"])
    print(spot)

18990
To ask the Secretary of State for Health and Social Care, what assessment he has made of the potential impact of the settlement decision of NHS Lancashire and South Cumbria Integrated Care Board not to grant additional funding to hospices on (a) patient care and (b) the financial sustainability of those hospices.
{'spot': 'Secretary of State', 'start': 11, 'link_probability': 0.15942679345607758, 'rho': 0.12373879551887512, 'dbpedia_categories': ['Political office-holders by role'], 'end': 29, 'id': 26422038, 'abstract': 'Secretary of State or State Secretary is a commonly used title for a senior or mid-level post in governments around the world. The role varies between countries, and in some cases there are multiple Secretaries of State in the Government.', 'title': 'Secretary of State'}
{'spot': 'decision', 'start': 128, 'link_probability': 0.002456112066283822, 'rho': 0.12014073878526688, 'dbpedia_categories': [], 'end': 136, 'id': 265752, 'abstract': 'This article deals with 

In [69]:
# attempt to use WAT API to get entities

import requests

x = random.randint(0, len(question_df))
x = 18990
print(x)
print(question_df["text"].iloc[x])

token = "5fcf52fb-79b2-4718-b7ee-62957df7d7e0-843339462"
address = f"https://wat.d4science.org/wat/tag/tag?lang=en&gcube-token={token}&include_categories=true&text={question_df['text'].iloc[x]}"
response = requests.get(address)
annotations = response.json()["annotations"]
new_annotations = []
for i, spot in enumerate(annotations):
    # remove if rho is less than 0.1
    if spot["rho"] >= 0.1:
        new_annotations.append(spot)

# print keys
        
for spot in new_annotations:
    #print(spot["spot"], spot["rho"])
    print(spot)

18990
To ask the Secretary of State for Health and Social Care, what assessment he has made of the potential impact of the settlement decision of NHS Lancashire and South Cumbria Integrated Care Board not to grant additional funding to hospices on (a) patient care and (b) the financial sustainability of those hospices.
{'spot': 'State', 'id': 31975, 'title': 'United_States_Department_of_State', 'start': 24, 'end': 29, 'rho': 0.43345128721802634}
{'spot': 'Lancashire', 'id': 50435, 'title': 'Lancashire', 'start': 144, 'end': 154, 'rho': 0.15363827810254613}
{'spot': 'Integrated Care', 'id': 25043987, 'title': 'Integrated_care', 'start': 173, 'end': 188, 'rho': 0.2736840452801378}


In [65]:
# Attempt to use REL API to get entities
import requests
x = random.randint(0, len(question_df))
print(x)
print(question_df["text"].iloc[x])
API_URL = "https://rel.cs.ru.nl/api"
text_doc = question_df["text"].iloc[x]

# Example ED.
ed_result = requests.post(API_URL, json={
    "text": text_doc,
}).json()
print(ed_result)

18990
To ask the Secretary of State for Health and Social Care, what assessment he has made of the potential impact of the settlement decision of NHS Lancashire and South Cumbria Integrated Care Board not to grant additional funding to hospices on (a) patient care and (b) the financial sustainability of those hospices.
[[34, 22, 'Health and Social Care', 'Health_and_Social_Care', 0.9239765086785631, 0.8034675270318985, 'ORG']]


In [64]:
district_df_cleaned[district_df_cleaned["districtLabel"] == "Reading East"]

Unnamed: 0,district,person,personLabel,districtLabel
582,http://www.wikidata.org/entity/Q1032131,http://www.wikidata.org/entity/Q30164903,Matt Rodda,Reading East
