In [1]:
"""
1. Extract a list of entites from MP's written questions
"""

"\n1. Extract a list of entites from MP's written questions\n"

In [12]:
"""
SPARQL query to api.parliament.uk endpoint to get all written questions

Altered so it does not contain questions from the House of Lords
"""


# https://rebeccabilbro.github.io/sparql-from-python/
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

question_df = pd.DataFrame()
sparql  = SPARQLWrapper("https://api.parliament.uk/sparql/")
sparql.setQuery("""
SELECT *
WHERE {
?question <https://id.parliament.uk/schema/writtenQuestionIndexingAndSearchUin> ?qnum .
?person <https://id.parliament.uk/schema/askingPersonHasQuestion> ?question .
?person <https://id.parliament.uk/schema/wikidataThingHasEquivalentWikidataResource> ?wikidataperson.
?question <https://id.parliament.uk/schema/questionText> ?text .
?question <https://id.parliament.uk/schema/questionAskedAt> ?date .
FILTER (?date >= "2023-01-01+00:00"^^xsd:dateTime && ?date < "2023-10-01+00:00"^^xsd:dateTime)
FILTER regex(?qnum, "^(?!HL)") 
}
""")

sparql.setReturnFormat(JSON)
result = sparql.query().convert()
# results -> bindings returns a list of dictionaries
result = result["results"]["bindings"]

# removed type from the results
for item in result:
    for key in item:
        item[key] = item[key]["value"]


In [3]:
question_df = pd.DataFrame(result)

# remove tags from text
question_df["text"] = question_df["text"].str.replace("<p>", "", )
question_df["text"] = question_df["text"].str.replace("</p>", "", )

question_df.head(3)

Unnamed: 0,question,qnum,person,wikidataperson,text,date
0,https://id.parliament.uk/0IwOO533,903422,https://id.parliament.uk/Vs3bGLNz,http://www.wikidata.org/entity/Q689287,Which three departments had the lowest proport...,2023-01-05+00:00
1,https://id.parliament.uk/dRJ9254g,117748,https://id.parliament.uk/eD0yd5Ec,http://www.wikidata.org/entity/Q19871931,"To ask the Secretary of State for Foreign, Com...",2023-01-06+00:00
2,https://id.parliament.uk/e4IDRkKi,117410,https://id.parliament.uk/RlIqlixq,http://www.wikidata.org/entity/Q695228,"To ask the Secretary of State for Foreign, Com...",2023-01-06+00:00


In [10]:
"""
Functions for named entity recognition and disambiguation
"""

import requests
import random
token = "5fcf52fb-79b2-4718-b7ee-62957df7d7e0-843339462"

def pick_random_question():
    x = random.randint(0, len(question_df))
    return question_df["text"].iloc[x]


def tag_me(text=None):
    if not text:
        text = pick_random_question()
        print(text)


    address = f"https://tagme.d4science.org/tagme/tag?lang=en&gcube-token={token}&include_categories=false&include_abstract=false&text={text}"
    response = requests.get(address)

    annotations = response.json()["annotations"]
    new_annotations = []

    for spot in annotations:
        # remove if rho is less than 0.1
        if spot["rho"] >= 0.1:
            new_annotations.append(spot)

    return new_annotations


def WAT(text=None):
    if not text:
        text = pick_random_question()
        #print(text)
       
    address = f"https://wat.d4science.org/wat/tag/tag?lang=en&gcube-token={token}&include_categories=true&text={text}"
    response = requests.get(address)
    annotations = response.json()["annotations"]
    new_annotations = []

    for spot in annotations:
        # remove if rho is less than 0.1
        if spot["rho"] >= 0.1:
            new_annotations.append(spot)
    return new_annotations


def REL(text=None):
    if not text:
        text = pick_random_question()
        print(text)
    API_URL = "https://rel.cs.ru.nl/api"

    # Example ED.
    ed_result = requests.post(API_URL, json={
        "text": text,
    })
    assert ed_result.status_code == 200
    
    return ed_result.json()

In [11]:
"""
Functions for getting a location from a wikipedia page and converting it to British National Grid units (Easting, Northing)
from longitude and latitude
"""

#https://stackoverflow.com/questions/40098656/how-to-get-coordinates-from-a-wikipedia-page-through-api
def get_wikipedia_location(wikipedia_title):
    response = requests.get(f"https://en.wikipedia.org/w/api.php?action=query&prop=coordinates&format=json&titles={wikipedia_title}").json()
    try :
        # makes iterator over pages and takes first element
        page = next(iter(response['query']['pages'].values()))
        return page['coordinates'][0]["lat"], page['coordinates'][0]["lon"]
    except KeyError:
        return None

def convert_to_BNG(location):
    result = requests.get(f"http://webapps.bgs.ac.uk/data/webservices/CoordConvert_LL_BNG.cfc?method=LatLongToBNG&lat={location[0]}&lon={location[1]}").json()
    return result["EASTING"], result["NORTHING"]


In [101]:
"""
Iterates through each question, extracts entities and then finds their location in BNG.

question_locations stores the question, entity, easting, northing and wikipedia link.

This is then stored in a dataframe and saved to a csv file, so it can be performed in chunks.
"""
import time
start = 20_000
end = 26_865
banned_entities = [ "United_Kingdom", "England", "Wales", "Northern_Ireland", "Scotland"]
question_locations = []

for index, row in question_df[start:end].iterrows():
    for entity in WAT(row["text"]):
        while True:
            try:
                if entity["title"] in banned_entities:
                    break

                location = get_wikipedia_location(entity["title"])
                # sleep for 0.1 seconds to avoid rate limiting
                
                if location is not None:
                    location = convert_to_BNG(location)

                    # origin is in the isles of scilly, and spans from 0 to 700,000 easting and 0 to 1,300,000 northing
                    if location[0] >= 0 and location[0] <= 700_000 and location[1] >= 0 and location[1] <= 1_300_000:
                        print(entity["title"])
                        question_locations.append({"question": row["question"], "entity": entity["title"], "easting": location[0], "northing": location[1], "wikipedia": f"https://en.wikipedia.org/wiki/{entity['title']}"})
                break
            except KeyError:
                print("time out")
                time.sleep(3)

question_locations_df = pd.DataFrame(question_locations)
question_locations_df.to_csv(f"question_entities/question_locations_{end}.csv")
question_locations

Cambridgeshire
Cabinet_Office
11_Downing_Street
Cabinet_Office
Hove
Cabinet_Office
National_Audit_Office_(United_Kingdom)
Glasgow
Northwood_Headquarters
Department_for_Transport
Gloucestershire
British_Transport_Police
Department_for_Transport
British_Transport_Police
British_Transport_Police
HM_Treasury
Cabinet_Office
Cabinet_Office
North_East_England
Department_for_Transport
Department_for_Transport
Department_for_Transport
Basildon
Billericay
Department_for_Transport
British_Transport_Police
Department_for_Transport
Department_for_Transport
Cabinet_Office
Department_for_Transport
British_Transport_Police
Department_for_Transport
Cabinet_Office
Department_for_Transport
Falmouth_Docks
HM_Treasury
Cabinet_Office
Department_for_Transport
Department_for_Transport
Cabinet_Office
North_East_England
Department_for_Transport
Cabinet_Office
Cabinet_Office
Office_for_National_Statistics
North_East_England
Cabinet_Office
St_Helens,_Merseyside
Falmouth_Docks
Cabinet_Office
Cabinet_Office
Kingsto

[{'question': 'https://id.parliament.uk/32D4KCWL',
  'entity': 'Cambridgeshire',
  'easting': 536372.951668183,
  'northing': 272465.472673855,
  'wikipedia': 'https://en.wikipedia.org/wiki/Cambridgeshire'},
 {'question': 'https://id.parliament.uk/rR6fI8rb',
  'entity': 'Cabinet_Office',
  'easting': 530118.174894204,
  'northing': 179960.770401966,
  'wikipedia': 'https://en.wikipedia.org/wiki/Cabinet_Office'},
 {'question': 'https://id.parliament.uk/rR6fI8rb',
  'entity': '11_Downing_Street',
  'easting': 530051.23676842,
  'northing': 179935.120979539,
  'wikipedia': 'https://en.wikipedia.org/wiki/11_Downing_Street'},
 {'question': 'https://id.parliament.uk/fEYgRIu8',
  'entity': 'Cabinet_Office',
  'easting': 530118.174894204,
  'northing': 179960.770401966,
  'wikipedia': 'https://en.wikipedia.org/wiki/Cabinet_Office'},
 {'question': 'https://id.parliament.uk/wkMoaqbq',
  'entity': 'Hove',
  'easting': 528552.041993163,
  'northing': 105548.472750481,
  'wikipedia': 'https://en.wi

In [27]:
"""
Iterates through the stored csv file and combines them into one dataframe
"""
import os
question_locations_df = pd.DataFrame()
for file in os.listdir("question_entities"):
    if file.endswith(".csv"):
        print(file)
        new_questions = pd.read_csv(f"question_entities/{file}")
        # reset the index

        new_questions = new_questions.drop(columns=["Unnamed: 0"])
        question_locations_df = pd.concat([question_locations_df, new_questions], ignore_index=True)
question_locations_df

question_locations_1000.csv
question_locations_2000.csv
question_locations_20000.csv
question_locations_26865.csv


Unnamed: 0,question,entity,easting,northing,wikipedia
0,https://id.parliament.uk/dmnAotxP,Office_for_National_Statistics,328820.179860,185811.595715,https://en.wikipedia.org/wiki/Office_for_Natio...
1,https://id.parliament.uk/wne3Q3kQ,West_Midlands_(region),382689.369764,286801.459728,https://en.wikipedia.org/wiki/West_Midlands_(r...
2,https://id.parliament.uk/Q0uNVDhz,Slough,498082.779241,179773.798368,https://en.wikipedia.org/wiki/Slough
3,https://id.parliament.uk/5utw5sA0,Cabinet_Office,530118.174894,179960.770402,https://en.wikipedia.org/wiki/Cabinet_Office
4,https://id.parliament.uk/oHeIeP9G,River_Tees,455114.463816,528466.390692,https://en.wikipedia.org/wiki/River_Tees
...,...,...,...,...,...
4423,https://id.parliament.uk/51DqbupU,HM_Prison_Wandsworth,526728.635699,173925.158213,https://en.wikipedia.org/wiki/HM_Prison_Wandsw...
4424,https://id.parliament.uk/c7mhUeV3,Rathlin_Island,133639.823902,608129.542743,https://en.wikipedia.org/wiki/Rathlin_Island
4425,https://id.parliament.uk/Py4BvdeG,Cabinet_Office,530118.174894,179960.770402,https://en.wikipedia.org/wiki/Cabinet_Office
4426,https://id.parliament.uk/Py4BvdeG,Downing_Street_mortar_attack,530021.784566,179958.304242,https://en.wikipedia.org/wiki/Downing_Street_m...


In [25]:
# prints out the most commonly occuring entity and a question it appears in
for entity in question_locations_df["entity"].value_counts().index:
    print(entity + " : " + str(question_locations_df[question_locations_df["entity"] == entity]["question"].iloc[0]))



Cabinet_Office : https://id.parliament.uk/5utw5sA0
Department_for_Transport : https://id.parliament.uk/nICBNPdP
HM_Treasury : https://id.parliament.uk/wjMCvZHL
London : https://id.parliament.uk/Q9BUzAPe
Ofcom : https://id.parliament.uk/e9R3Yc8s
Parliament_of_the_United_Kingdom : https://id.parliament.uk/icpOYIxj
North_East_England : https://id.parliament.uk/nGRSsnLs
Office_for_National_Statistics : https://id.parliament.uk/dmnAotxP
House_of_Commons_of_the_United_Kingdom : https://id.parliament.uk/9x3uMj5c
York : https://id.parliament.uk/geYqgbHj
Birmingham : https://id.parliament.uk/5kyl71y9
Portland_Harbour : https://id.parliament.uk/TSh8ReyK
Broxbourne_(UK_Parliament_constituency) : https://id.parliament.uk/sKmMqHM5
Yorkshire : https://id.parliament.uk/kPk1dHaz
Humber : https://id.parliament.uk/kPk1dHaz
Great_Britain : https://id.parliament.uk/1VbyL4VM
Coventry : https://id.parliament.uk/iGNnKsA7
West_Midlands_(region) : https://id.parliament.uk/wne3Q3kQ
Ofwat : https://id.parliament

In [28]:
# these entities are removed as they are there location is not relevant when a given question is asked
remove_entities = ["Office_for_National_Statistics", "Department_for_Transport", "HM_Treasury", "Cabinet Office", "House_of_Commons_of_the_United_Kingdom", "Parliament_of_the_United_Kingdom", "National_Audit_Office_(United_Kingdom)", "Government_Equalities_Office"]
question_locations_df = question_locations_df[~question_locations_df["entity"].isin(remove_entities)]
question_locations_df

Unnamed: 0,question,entity,easting,northing,wikipedia
1,https://id.parliament.uk/wne3Q3kQ,West_Midlands_(region),382689.369764,286801.459728,https://en.wikipedia.org/wiki/West_Midlands_(r...
2,https://id.parliament.uk/Q0uNVDhz,Slough,498082.779241,179773.798368,https://en.wikipedia.org/wiki/Slough
3,https://id.parliament.uk/5utw5sA0,Cabinet_Office,530118.174894,179960.770402,https://en.wikipedia.org/wiki/Cabinet_Office
4,https://id.parliament.uk/oHeIeP9G,River_Tees,455114.463816,528466.390692,https://en.wikipedia.org/wiki/River_Tees
5,https://id.parliament.uk/W7K6780e,Hounslow,512974.190267,175455.283371,https://en.wikipedia.org/wiki/Hounslow
...,...,...,...,...,...
4423,https://id.parliament.uk/51DqbupU,HM_Prison_Wandsworth,526728.635699,173925.158213,https://en.wikipedia.org/wiki/HM_Prison_Wandsw...
4424,https://id.parliament.uk/c7mhUeV3,Rathlin_Island,133639.823902,608129.542743,https://en.wikipedia.org/wiki/Rathlin_Island
4425,https://id.parliament.uk/Py4BvdeG,Cabinet_Office,530118.174894,179960.770402,https://en.wikipedia.org/wiki/Cabinet_Office
4426,https://id.parliament.uk/Py4BvdeG,Downing_Street_mortar_attack,530021.784566,179958.304242,https://en.wikipedia.org/wiki/Downing_Street_m...


In [None]:
"""
2. Link MP's to constituencies and their geometry
"""

In [8]:
"""
Produces dataframe of geometry of all westminister constituencies
"""
import geopandas as gpd
en_wl_sct = gpd.read_file("shape/westminster_const_region.shp")
ni = gpd.read_file("shape/nireland_aa_2008.shp")

en_wl_sct = en_wl_sct.rename(columns={"NAME": "name"})
uk_geometry = pd.concat([en_wl_sct[["name", "geometry"]], ni[["name", "geometry"]]])

In [4]:
"""
SPARQL query to wikidata endpoint to get all MPs and their constituencies

"""

district_df = pd.DataFrame()
sparql  = SPARQLWrapper("https://query.wikidata.org/sparql")
sparql.setQuery("""
SELECT ?person ?district ?personLabel ?districtLabel
WHERE
{
  ?person p:P39 ?position.
  ?position ps:P39 wd:Q77685926.
  ?position pq:P768 ?district
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
""")

sparql.setReturnFormat(JSON)
result = sparql.query().convert()
# results -> bindings returns a list of dictionaries
result = result["results"]["bindings"]

# removed type from the results
for item in result:
    for key in item:
        item[key] = item[key]["value"]
district_df = pd.DataFrame(result)
district_df.head(3)

Unnamed: 0,district,person,personLabel,districtLabel
0,http://www.wikidata.org/entity/Q874686,http://www.wikidata.org/entity/Q264426,Peter Bone,Wellingborough
1,http://www.wikidata.org/entity/Q3336813,http://www.wikidata.org/entity/Q264766,Theresa May,Maidenhead
2,http://www.wikidata.org/entity/Q3335586,http://www.wikidata.org/entity/Q266078,Fiona Bruce,Congleton


In [5]:
# remove duplicate records (which can occur when a person changes parties i.e Julian Lewis)
district_df_cleaned = district_df.drop_duplicates()
district_df_cleaned = district_df_cleaned.astype({"districtLabel": "object", "personLabel": "object"})

# I have decided not remove duplicate people for the same constituency, which can happen if a bi-election occurs.
# This is because either of them could have asked questions.

In [6]:
# Check if there are any people who have asked questions but are not in the district_df
question_df[~question_df["wikidataperson"].isin(district_df_cleaned["person"])]

Unnamed: 0,question,qnum,person,wikidataperson,text,date


In [9]:
"""
Makes the names of the constituencies consistent between uk_geometry and district_df_cleaned
"""

uk_geometry["name"] = uk_geometry["name"].str.replace(", ", " ")
uk_geometry["name"] = uk_geometry["name"].str.replace("Co Const", "")
uk_geometry["name"] = uk_geometry["name"].str.replace("Boro Const", "")
uk_geometry["name"] = uk_geometry["name"].str.replace("Burgh Const", "")
uk_geometry["name"] = uk_geometry["name"].str.replace(".", "")
uk_geometry["name"] = uk_geometry["name"].str.rstrip()
uk_geometry["name"] = uk_geometry["name"].str.replace("Weston-Super-Mare", "Weston-super-Mare")
uk_geometry = uk_geometry[["name", "geometry"]]


district_df_cleaned["districtLabel"] = district_df_cleaned["districtLabel"].str.replace(", ", " ")
district_df_cleaned["districtLabel"] = district_df_cleaned["districtLabel"].str.replace("ô", "o")

In [10]:

MP_districts = pd.merge(district_df_cleaned, uk_geometry, left_on="districtLabel", right_on="name", how="inner")
print(len(MP_districts["districtLabel"].unique()))

650


In [480]:
"""
Check for any constituencies present in one but not the other
"""
for name in uk_geometry["name"].unique():
    if name not in MP_districts["districtLabel"].unique():
        print(name)

for district in district_df_cleaned["districtLabel"].unique():
    if name not in MP_districts["name"].unique():
        print(name)

In [483]:
'''
3. Relate MP's questions to constituencies mentioned in the question
'''

"\n3. Relate MP's questions to constituencies mentioned in the question\n"

In [14]:
'''
Checks for each entity which constituency it belongs and then adds it to the new district column
'''

import shapely

# help from https://stackoverflow.com/questions/7861196/check-if-a-geopoint-with-latitude-and-longitude-is-within-a-shapefile

# adds a new column to question_locations_df
question_locations_df["district"] = None


for index, row in question_locations_df.iterrows():

    point = shapely.geometry.Point(row["easting"], row["northing"])
    for index2, row2 in MP_districts.iterrows():
        if row2["geometry"].contains(point):
            print(index)
            question_locations_df.at[index, "district"] = row2["districtLabel"]
            break

0
1
2
3
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279


In [None]:
questi

In [30]:
# show entities that have null
question_locations_df[question_locations_df["district"].isnull()]["entity"].unique()

KeyError: 'const'

In [16]:
question_locations_df["district"].value_counts() 

district
Cities of London and Westminster    1844
Bermondsey and Old Southwark          66
Skipton and Ripon                     60
Birmingham Ladywood                   58
Hexham                                45
                                    ... 
Glasgow North                          1
South Swindon                          1
Wyre and Preston North                 1
Barrow and Furness                     1
Hartlepool                             1
Name: count, Length: 399, dtype: int64

question_locations_df.head()