In [None]:
"""
1. Extract a list of entites from MP's written questions
"""

In [8]:
"""
SPARQL query to api.parliament.uk endpoint to get all written questions

Altered so it does not contain questions from the House of Lords
"""


# https://rebeccabilbro.github.io/sparql-from-python/
from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

question_df = pd.DataFrame()
sparql  = SPARQLWrapper("https://api.parliament.uk/sparql/")
sparql.setQuery("""
SELECT *
WHERE {
?question <https://id.parliament.uk/schema/writtenQuestionIndexingAndSearchUin> ?qnum .
?person <https://id.parliament.uk/schema/askingPersonHasQuestion> ?question .
?person <https://id.parliament.uk/schema/wikidataThingHasEquivalentWikidataResource> ?wikidataperson.
?question <https://id.parliament.uk/schema/questionText> ?text .
?question <https://id.parliament.uk/schema/questionAskedAt> ?date .
FILTER (?date >= "2023-01-01+00:00"^^xsd:dateTime && ?date < "2023-10-01+00:00"^^xsd:dateTime)
FILTER regex(?qnum, "^(?!HL)") 
}
""")

sparql.setReturnFormat(JSON)
result = sparql.query().convert()
# results -> bindings returns a list of dictionaries
result = result["results"]["bindings"]

# removed type from the results
for item in result:
    for key in item:
        item[key] = item[key]["value"]


In [9]:
question_df = pd.DataFrame(result)

# remove tags from text
question_df["text"] = question_df["text"].str.replace("<p>", "", )
question_df["text"] = question_df["text"].str.replace("</p>", "", )

question_df.head(3)

Unnamed: 0,question,qnum,person,wikidataperson,text,date
0,https://id.parliament.uk/0IwOO533,903422,https://id.parliament.uk/Vs3bGLNz,http://www.wikidata.org/entity/Q689287,Which three departments had the lowest proport...,2023-01-05+00:00
1,https://id.parliament.uk/dRJ9254g,117748,https://id.parliament.uk/eD0yd5Ec,http://www.wikidata.org/entity/Q19871931,"To ask the Secretary of State for Foreign, Com...",2023-01-06+00:00
2,https://id.parliament.uk/e4IDRkKi,117410,https://id.parliament.uk/RlIqlixq,http://www.wikidata.org/entity/Q695228,"To ask the Secretary of State for Foreign, Com...",2023-01-06+00:00


In [10]:
"""
Functions for named entity recognition and disambiguation
"""

import requests
import random
token = "5fcf52fb-79b2-4718-b7ee-62957df7d7e0-843339462"

def pick_random_question():
    x = random.randint(0, len(question_df))
    return question_df["text"].iloc[x]


def tag_me(text=None):
    if not text:
        text = pick_random_question()
        print(text)


    address = f"https://tagme.d4science.org/tagme/tag?lang=en&gcube-token={token}&include_categories=false&include_abstract=false&text={text}"
    response = requests.get(address)

    annotations = response.json()["annotations"]
    new_annotations = []

    for spot in annotations:
        # remove if rho is less than 0.1
        if spot["rho"] >= 0.1:
            new_annotations.append(spot)

    return new_annotations


def WAT(text=None):
    if not text:
        text = pick_random_question()
        #print(text)
       
    address = f"https://wat.d4science.org/wat/tag/tag?lang=en&gcube-token={token}&include_categories=true&text={text}"
    response = requests.get(address)
    annotations = response.json()["annotations"]
    new_annotations = []

    for spot in annotations:
        # remove if rho is less than 0.1
        if spot["rho"] >= 0.1:
            new_annotations.append(spot)
    return new_annotations


def REL(text=None):
    if not text:
        text = pick_random_question()
        print(text)
    API_URL = "https://rel.cs.ru.nl/api"

    # Example ED.
    ed_result = requests.post(API_URL, json={
        "text": text,
    })
    assert ed_result.status_code == 200
    
    return ed_result.json()

In [11]:
"""
Functions for getting a location from a wikipedia page and converting it to British National Grid units (Easting, Northing)
from longitude and latitude
"""

#https://stackoverflow.com/questions/40098656/how-to-get-coordinates-from-a-wikipedia-page-through-api
def get_wikipedia_location(wikipedia_title):
    response = requests.get(f"https://en.wikipedia.org/w/api.php?action=query&prop=coordinates&format=json&titles={wikipedia_title}").json()
    try :
        # makes iterator over pages and takes first element
        page = next(iter(response['query']['pages'].values()))
        return page['coordinates'][0]["lat"], page['coordinates'][0]["lon"]
    except KeyError:
        return None

def convert_to_BNG(location):
    result = requests.get(f"http://webapps.bgs.ac.uk/data/webservices/CoordConvert_LL_BNG.cfc?method=LatLongToBNG&lat={location[0]}&lon={location[1]}").json()
    return result["EASTING"], result["NORTHING"]


In [6]:
"""
Iterates through each question, extracts entities and then finds their location in BNG.

question_locations stores the question, entity, easting, northing and wikipedia link.

This is then stored in a dataframe and saved to a csv file, so it can be performed in chunks.
"""
import time
start = 0
end = 1
banned_entities = [ "United_Kingdom", "England", "Wales", "Northern_Ireland", "Scotland"]
question_locations = []

for index, row in question_df[start:end].iterrows():
    for entity in WAT(row["text"]):
        while True:
            try:
                if entity["title"] in banned_entities:
                    break

                location = get_wikipedia_location(entity["title"])
                # sleep for 0.1 seconds to avoid rate limiting
                
                if location is not None:
                    location = convert_to_BNG(location)

                    # origin is in the isles of scilly, and spans from 0 to 700,000 easting and 0 to 1,300,000 northing
                    if location[0] >= 0 and location[0] <= 700_000 and location[1] >= 0 and location[1] <= 1_300_000:
                        print(entity["title"])
                        question_locations.append({"question": row["question"], "entity": entity["title"], "easting": location[0], "northing": location[1], "wikipedia": f"https://en.wikipedia.org/wiki/{entity['title']}"})
                break
            except KeyError:
                print("time out")
                time.sleep(3)

question_locations_df = pd.DataFrame(question_locations)
question_locations_df.to_csv(f"question_entities/question_locations_{end}.csv")
question_locations

[]

In [12]:
"""
Iterates through the stored csv file and combines them into one dataframe
"""
import os
question_locations_df = pd.DataFrame()
for file in os.listdir("question_entities"):
    if file.endswith(".csv"):
        print(file)
        new_questions = pd.read_csv(f"question_entities/{file}")
        # reset the index

        new_questions = new_questions.drop(columns=["Unnamed: 0"])
        question_locations_df = pd.concat([question_locations_df, new_questions], ignore_index=True)
question_locations_df

question_locations_1000.csv
question_locations_2000.csv
question_locations_20000.csv
question_locations_26865.csv


Unnamed: 0,question,entity,easting,northing,wikipedia
0,https://id.parliament.uk/dmnAotxP,Office_for_National_Statistics,328820.179860,185811.595715,https://en.wikipedia.org/wiki/Office_for_Natio...
1,https://id.parliament.uk/wne3Q3kQ,West_Midlands_(region),382689.369764,286801.459728,https://en.wikipedia.org/wiki/West_Midlands_(r...
2,https://id.parliament.uk/Q0uNVDhz,Slough,498082.779241,179773.798368,https://en.wikipedia.org/wiki/Slough
3,https://id.parliament.uk/5utw5sA0,Cabinet_Office,530118.174894,179960.770402,https://en.wikipedia.org/wiki/Cabinet_Office
4,https://id.parliament.uk/oHeIeP9G,River_Tees,455114.463816,528466.390692,https://en.wikipedia.org/wiki/River_Tees
...,...,...,...,...,...
4423,https://id.parliament.uk/51DqbupU,HM_Prison_Wandsworth,526728.635699,173925.158213,https://en.wikipedia.org/wiki/HM_Prison_Wandsw...
4424,https://id.parliament.uk/c7mhUeV3,Rathlin_Island,133639.823902,608129.542743,https://en.wikipedia.org/wiki/Rathlin_Island
4425,https://id.parliament.uk/Py4BvdeG,Cabinet_Office,530118.174894,179960.770402,https://en.wikipedia.org/wiki/Cabinet_Office
4426,https://id.parliament.uk/Py4BvdeG,Downing_Street_mortar_attack,530021.784566,179958.304242,https://en.wikipedia.org/wiki/Downing_Street_m...


In [13]:
# prints out the most commonly occuring entity and a question it appears in
for entity in question_locations_df["entity"].value_counts().index:
    print(entity + " : " + str(question_locations_df[question_locations_df["entity"] == entity]["question"].iloc[0]))

Cabinet_Office : https://id.parliament.uk/5utw5sA0
Department_for_Transport : https://id.parliament.uk/nICBNPdP
HM_Treasury : https://id.parliament.uk/wjMCvZHL
London : https://id.parliament.uk/Q9BUzAPe
Ofcom : https://id.parliament.uk/e9R3Yc8s
Parliament_of_the_United_Kingdom : https://id.parliament.uk/icpOYIxj
North_East_England : https://id.parliament.uk/nGRSsnLs
Office_for_National_Statistics : https://id.parliament.uk/dmnAotxP
House_of_Commons_of_the_United_Kingdom : https://id.parliament.uk/9x3uMj5c
York : https://id.parliament.uk/geYqgbHj
Birmingham : https://id.parliament.uk/5kyl71y9
Portland_Harbour : https://id.parliament.uk/TSh8ReyK
Broxbourne_(UK_Parliament_constituency) : https://id.parliament.uk/sKmMqHM5
Yorkshire : https://id.parliament.uk/kPk1dHaz
Humber : https://id.parliament.uk/kPk1dHaz
Great_Britain : https://id.parliament.uk/1VbyL4VM
Coventry : https://id.parliament.uk/iGNnKsA7
West_Midlands_(region) : https://id.parliament.uk/wne3Q3kQ
Ofwat : https://id.parliament

In [14]:
# these entities are removed as they are there location is not relevant when a given question is asked
remove_entities = ["Office_for_National_Statistics", "Department_for_Transport", "HM_Treasury", "Cabinet Office", "House_of_Commons_of_the_United_Kingdom", "Parliament_of_the_United_Kingdom", "National_Audit_Office_(United_Kingdom)", "Government_Equalities_Office"]
question_locations_df = question_locations_df[~question_locations_df["entity"].isin(remove_entities)]
question_locations_df

Unnamed: 0,question,entity,easting,northing,wikipedia
1,https://id.parliament.uk/wne3Q3kQ,West_Midlands_(region),382689.369764,286801.459728,https://en.wikipedia.org/wiki/West_Midlands_(r...
2,https://id.parliament.uk/Q0uNVDhz,Slough,498082.779241,179773.798368,https://en.wikipedia.org/wiki/Slough
3,https://id.parliament.uk/5utw5sA0,Cabinet_Office,530118.174894,179960.770402,https://en.wikipedia.org/wiki/Cabinet_Office
4,https://id.parliament.uk/oHeIeP9G,River_Tees,455114.463816,528466.390692,https://en.wikipedia.org/wiki/River_Tees
5,https://id.parliament.uk/W7K6780e,Hounslow,512974.190267,175455.283371,https://en.wikipedia.org/wiki/Hounslow
...,...,...,...,...,...
4423,https://id.parliament.uk/51DqbupU,HM_Prison_Wandsworth,526728.635699,173925.158213,https://en.wikipedia.org/wiki/HM_Prison_Wandsw...
4424,https://id.parliament.uk/c7mhUeV3,Rathlin_Island,133639.823902,608129.542743,https://en.wikipedia.org/wiki/Rathlin_Island
4425,https://id.parliament.uk/Py4BvdeG,Cabinet_Office,530118.174894,179960.770402,https://en.wikipedia.org/wiki/Cabinet_Office
4426,https://id.parliament.uk/Py4BvdeG,Downing_Street_mortar_attack,530021.784566,179958.304242,https://en.wikipedia.org/wiki/Downing_Street_m...


In [16]:
# save the dataframe to a csv file
question_locations_df.to_csv("question_locations_full.csv", index=False)