In [7]:
from selenium import webdriver
import requests
import json
import numpy as np
import os


def scrape_data(state, n_pages, data_path, driver_path):
    """
    Refresh the json data folders of petition data (petition type dependent)

    For every listing page the browser opens the page, reads the pagination
    label to build the output file name, follows the "JSON" link and stores
    the JSON document served at the resulting URL.

    state: petition type ['archived'/'closed'/'rejected'/'open']
    n_pages: number of pages on the website e.g. 563
    data_path: directory that receives the 'data_<page tag>.json' files
    driver_path: path to the geckodriver executable handed to Firefox
    """

    def save_json(payload, page_tag):
        # os.path.join works whether or not data_path ends with a separator.
        with open(os.path.join(data_path, 'data_' + page_tag + '.json'), 'w') as f:
            json.dump(payload, f)

    # To prevent download dialog
    profile = webdriver.FirefoxProfile()
    profile.set_preference('browser.download.folderList', 2) # custom location
    profile.set_preference('browser.download.manager.showWhenStarting', False)
    profile.set_preference('browser.download.dir', data_path)
    profile.set_preference('browser.helperApps.neverAsk.saveToDisk', 'text/csv')

    browser = webdriver.Firefox(profile, executable_path=driver_path)

    try:
        for i in range(n_pages + 1):
            # NOTE(review): the path segment is fixed to /archived/ for every
            # state -- confirm this is intended for non-archived states.
            browser.get("https://petition.parliament.uk/archived/petitions?page=" + str(i) + "&state=" + state)

            # Pagination label with the spaces stripped; used as the file tag.
            page_tag = browser.find_element_by_xpath('/html/body/main/div/div/a/span[2]').text.replace(" ", "")

            # Move to JSON page
            browser.find_element_by_xpath('//*[text() = "JSON"]').click()

            # Download JSON; fail loudly on HTTP errors instead of saving an
            # error page, and never hang forever on a stalled connection.
            response = requests.get(browser.current_url, timeout=60)
            response.raise_for_status()
            data = response.json()

            # Save json to file
            save_json(data, page_tag)

            if i == n_pages:
                # Last page has to be saved differently as the button changes:
                # store the same document under an explicit "<n>of<n>" tag too.
                # The data just downloaded is reused rather than fetched again.
                save_json(data, str(n_pages) + "of" + str(n_pages))
    finally:
        # Always end the WebDriver session, even when a page fails to load or
        # parse, so no Firefox/geckodriver process is left behind.
        browser.quit()

# Parameters
n_pages = 563 # Check number of pages on archives
state = 'archived' # other states available
parent_dir = '/home/will/Datasets/'
driver_path = '/home/will/Projects/GovPetitionsUK/gov_uk_petitions_analysis/geckodriver'

# Setup folders
# makedirs(exist_ok=True) creates 'petitions_website/' and the per-state folder
# in one call and lets this cell be re-run without raising FileExistsError.
# The trailing '/' is kept because data_path is used as a path prefix.
directory = 'petitions_website/' + state + '/'
data_path = os.path.join(parent_dir, directory)
os.makedirs(data_path, exist_ok=True)

# Scrape the data
scrape_data(state, n_pages, data_path, driver_path)

  profile = webdriver.FirefoxProfile()
  browser = webdriver.Firefox(profile,executable_path=driver_path)
  browser = webdriver.Firefox(profile,executable_path=driver_path)
  page_tag = browser.find_element_by_xpath('/html/body/main/div/div/a/span[2]').text.replace(" ", "")
  browser.find_element_by_xpath('//*[text() = "JSON"]').click()


In [2]:
# Useful functions

In [1]:
import numpy

ModuleNotFoundError: No module named 'numpy'

In [1]:
# Convert json files into single csv
import os
import json
import pandas as pd

def convert_json_to_df(data_file):
    """
    Load one saved JSON file and flatten its 'data' entry into a DataFrame.

    data_file: path of the JSON file to read
    """
    with open(data_file, "r") as handle:
        payload = json.load(handle)
    # Nested keys become dotted column names (e.g. 'attributes.state').
    return pd.json_normalize(payload['data'])

def get_files_in_folders(base_path, types):
    """
    Collect the paths of every entry inside the given sub-folders of base_path.

    base_path: directory that contains one sub-folder per petition type
        (with or without a trailing path separator)
    types: iterable of sub-folder names, e.g. ['archived', 'closed']

    Returns a flat list of paths, grouped in the order of `types` and sorted
    by name within each folder so the result is reproducible between runs.
    """
    all_file_paths = []
    for folder_name in types:
        folder = os.path.join(base_path, folder_name)
        # os.listdir returns entries in arbitrary order, hence the sort.
        all_file_paths.extend(os.path.join(folder, name) for name in sorted(os.listdir(folder)))
    return all_file_paths

def extract_specific_attribute(full_file_paths, attributes):
    """
    Load every JSON file, keep only the requested columns and merge the results.

    full_file_paths: paths of the JSON files to read
    attributes: column names to keep; names missing from a file are skipped

    Returns whatever combine_and_tidy_dfs produces for the per-file frames.
    """
    def keep_requested(frame):
        # Only filter attribute if present
        wanted = frame.columns[frame.columns.isin(attributes)]
        return frame[wanted]

    per_file_frames = [keep_requested(convert_json_to_df(path)) for path in full_file_paths]
    return combine_and_tidy_dfs(per_file_frames)


def combine_and_tidy_dfs(list_of_dataframes, indexer='attributes.created_at'):
    """
    Stack the per-file frames row-wise and, when possible, index them by time.

    list_of_dataframes: DataFrames to concatenate
    indexer: name of the timestamp column; when the combined frame has this
        column it is parsed with pd.to_datetime, set as the index, and the
        frame is sorted by it. Otherwise the frame is returned as concatenated.

    Returns the combined DataFrame (an empty DataFrame when no frames are given).
    """
    list_of_dataframes = list(list_of_dataframes)
    if not list_of_dataframes:
        # pd.concat raises ValueError on an empty sequence.
        return pd.DataFrame()

    df = pd.concat(list_of_dataframes)
    # Plain membership test instead of truth-testing the matching labels.
    if indexer in df.columns:
        df[indexer] = pd.to_datetime(df[indexer])
        df = df.set_index(indexer)
        df = df.sort_index()
    return df

ModuleNotFoundError: No module named 'pandas'

# Number of votes over the history of online petitions

In [None]:
# Petition states to load; each name is used as a sub-folder of base_path.
types = ['archived','closed']
# NOTE(review): the scraping cell writes under '/home/will/Datasets/', while
# this path uses '/home/william/Datasets/' -- confirm which one is correct.
base_path = '/home/william/Datasets/petitions_website/'
# Columns kept from every file; 'attributes.created_at' is the default index
# column of combine_and_tidy_dfs, so full_df ends up indexed by creation time.
attributes = ['id','attributes.signature_count', 'attributes.created_at']
full_file_paths = get_files_in_folders(base_path, types)
full_df = extract_specific_attribute(full_file_paths, attributes)


In [4]:
%matplotlib
import matplotlib.pyplot as plt
fig, ax = plt.subplots(2,1, figsize=(20,10), sharex=True)
full_df['weekly_sum_of_signatures'] = full_df['attributes.signature_count'].rolling('7D', closed='left').sum()

full_df[['weekly_sum_of_signatures']].plot(rot=45,alpha=0.5, ax=ax[0], color='r')
full_df[['attributes.signature_count']].plot(rot=45,alpha=0.5, ax=ax[1], color='b')

ax[1].set_ylabel('Number of votes')
ax[0].set_ylabel('Number of votes')
ax[1].set_xlabel('Date')

Using matplotlib backend: Qt5Agg


Text(0.5, 0, 'Date')

# List all attributes

In [74]:
# Same folders as the signature-count analysis: every file of both states.
types = ['archived','closed']
base_path = '/home/william/Datasets/petitions_website/'
full_file_paths = get_files_in_folders(base_path, types)

def list_all_attributes(full_file_paths):
    """
    Return the set of column names that carry at least one value in any file.

    full_file_paths: paths of the JSON files to inspect

    Columns that are entirely missing within a file are dropped for that file
    before its column names are collected.
    """
    found = set()
    for data_file in full_file_paths:
        populated = convert_json_to_df(data_file).dropna(how='all', axis=1)
        found.update(populated.columns)
    return found

# Union of the populated column names across all archived and closed files.
ls_attr = list_all_attributes(full_file_paths)

In [75]:
# Display the collected attribute names.
ls_attr

{'attributes.action',
 'attributes.additional_details',
 'attributes.background',
 'attributes.closed_at',
 'attributes.committee_note',
 'attributes.created_at',
 'attributes.debate.debate_pack_url',
 'attributes.debate.debated_on',
 'attributes.debate.overview',
 'attributes.debate.transcript_url',
 'attributes.debate.video_url',
 'attributes.debate_outcome_at',
 'attributes.debate_threshold_reached_at',
 'attributes.departments',
 'attributes.government_response.created_at',
 'attributes.government_response.details',
 'attributes.government_response.responded_on',
 'attributes.government_response.summary',
 'attributes.government_response.updated_at',
 'attributes.government_response_at',
 'attributes.moderation_threshold_reached_at',
 'attributes.opened_at',
 'attributes.rejected_at',
 'attributes.rejection.code',
 'attributes.rejection.details',
 'attributes.response_threshold_reached_at',
 'attributes.scheduled_debate_date',
 'attributes.signature_count',
 'attributes.state',
 'a

# Number of debated petitions

In [78]:
# Reload the archived and closed petitions, this time keeping every
# debate-related column next to the id, signature count and creation time.
types = ['archived','closed']
base_path = '/home/william/Datasets/petitions_website/'
attributes = ['id','attributes.signature_count', 'attributes.created_at', 'attributes.scheduled_debate_date',  
              'attributes.debate.debate_pack_url',
              'attributes.debate.debated_on',
              'attributes.debate.overview',
              'attributes.debate.transcript_url',
              'attributes.debate.video_url',
              'attributes.debate_outcome_at',
              'attributes.debate_threshold_reached_at']
full_file_paths = get_files_in_folders(base_path, types)
# Overwrites the full_df built for the signature plot.
full_df = extract_specific_attribute(full_file_paths, attributes)


607

In [79]:
# Number of rows (one per petition record) in the combined frame.
len(full_df)

30196

In [80]:
# Count the petitions that carry a scheduled debate date.
print("Total debated petitions since 2017 :", full_df['attributes.scheduled_debate_date'].count())

# Except, the gov website has 71 parliamentary petitions debated since June 2020.
# Therefore this attribute isn't reliable.
#
# Try others...

# Non-missing value count of every column, to compare the debate attributes.
for col in full_df.columns:
    print(col, full_df[col].count())

Total debated petitions since 2017 : 43
id 30196
attributes.signature_count 30196
attributes.debate_threshold_reached_at 39
attributes.scheduled_debate_date 43
attributes.debate_outcome_at 60
attributes.debate.debated_on 54
attributes.debate.transcript_url 60
attributes.debate.video_url 60
attributes.debate.debate_pack_url 60
attributes.debate.overview 60


In [66]:
# NOTE(review): this cell carries execution count 66, lower than the cells
# above it, so its recorded output comes from an earlier state of full_df.
len(full_df)

18846

# Validate no data is lost during preprocessing

In [81]:
# Rebuild the file list and count how many JSON pages are on disk.
types = ['archived','closed']
base_path = '/home/william/Datasets/petitions_website/'
# NOTE(review): `attributes` is not used in this cell; the next cell reassigns it.
attributes = ['id','attributes.signature_count', 'attributes.created_at']
full_file_paths = get_files_in_folders(base_path, types)
len(full_file_paths)

607

In [93]:


# Extract only the debate columns. 'attributes.created_at' is not requested,
# so combine_and_tidy_dfs leaves the rows in concatenation order, unindexed.
attributes = ['attributes.scheduled_debate_date',  
              'attributes.debate.debate_pack_url',
              'attributes.debate.debated_on',
              'attributes.debate.overview',
              'attributes.debate.transcript_url',
              'attributes.debate.video_url',
              'attributes.debate_outcome_at',
              'attributes.debate_threshold_reached_at']
# NOTE: despite its name, out_list is the DataFrame returned by
# extract_specific_attribute, so len() is the number of rows.
out_list = extract_specific_attribute(full_file_paths, attributes)
len(out_list)

30196

30450

In [96]:
# 563 is the archived page count used for scraping; presumably 50 petitions
# are listed per page, giving the expected archived row count -- TODO confirm.
563*50



28150