In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
import requests
import json
import numpy as np
import os


def scrape_data(data_path, driver_path, state='archived', headless=True):
    """
    Refresh the json data folders of petition data (petition type dependent).

    Drives a Firefox session through the petition listing pages, follows the
    "JSON" link on each page, downloads that URL with ``requests`` and writes
    the decoded payload to ``data_<label>.json`` inside ``data_path``.

    Note: The website JSON files contain more attributes than the CSV files.

    Parameters
    ----------
    data_path   : directory in which to save petition json data
    driver_path : location of web driver (currently Mozilla). Accepted for
                  interface compatibility but not used: Firefox is started
                  without an explicit driver service.
    state       : petition type ['archived'/'closed'/'rejected'/'open']
    headless    : if True, run Firefox without showing a browser window

    Raises
    ------
    selenium NoSuchElementException : if the pager or "JSON" link is missing
    requests.RequestException       : if a JSON download fails or times out
    """
    # Imported locally so this function does not depend on the file's import
    # block; the find_element_by_* helpers were removed in Selenium 4.
    from selenium.webdriver.common.by import By

    listing_url = "https://petition.parliament.uk/petitions"
    pager_xpath = '/html/body/main/div/div/a/span[2]'

    # To prevent download dialog
    options = Options()
    options.set_preference('browser.download.folderList', 2) # custom location
    options.set_preference('browser.download.manager.showWhenStarting', False)
    options.set_preference('browser.download.dir', data_path)
    options.set_preference('browser.helperApps.neverAsk.saveToDisk', 'text/csv')

    # Without pop-up browser window (faster)
    if headless:
        options.add_argument("--headless")

    browser = webdriver.Firefox(options=options)

    try:
        # Find total number of pages for the state being scraped, so the loop
        # bound matches the listing that is actually paged through.
        browser.get(listing_url + "?state=" + state)
        page_count = browser.find_element(By.XPATH, pager_xpath)
        n_pages = int(page_count.text.split(' ')[-1])

        # Loop through pages and scrape petition data.
        # NOTE(review): the loop starts at page 0 as before; presumably the
        # site serves page 0 like the first page -- confirm.
        for page in range(0, n_pages + 1):
            browser.get(listing_url + "?page=" + str(page) + "&state=" + state)

            # File label comes from the first pager link's text ("X of N").
            # NOTE(review): assumes that link exists on every page and that
            # its label identifies the page uniquely -- confirm.
            page_tag = browser.find_element(By.XPATH, pager_xpath).text.replace(" ", "_")

            # Move to JSON page
            browser.find_element(By.XPATH, '//*[text() = "JSON"]').click()

            # Download JSON; fail loudly on HTTP errors instead of trying to
            # decode an error page, and never hang forever on a stalled socket.
            response = requests.get(browser.current_url, timeout=60)
            response.raise_for_status()
            data = response.json()

            labels = [page_tag]
            if page == n_pages:
                # Last page has to be saved differently as the button changes.
                # The payload already downloaded is reused rather than fetched
                # a second time from the same URL.
                labels.append(str(n_pages) + "of" + str(n_pages))

            # Save json to file
            for label in labels:
                out_file = os.path.join(data_path, 'data_' + label + '.json')
                with open(out_file, 'w') as f:
                    json.dump(data, f)
    finally:
        # quit() ends the driver session even when a step above raised.
        browser.quit()


def setup_folders(data_dir, driver_path, base_directory = 'petitions_website/', state = 'all'):
    """
    Create (if needed) the output folder for one petition state.

    The folder is ``<data_dir>/<base_directory>/<state>/``. Missing parent
    directories are created as well, and calling this again when the folder
    already exists is a no-op.

    Parameters
    ----------
    data_dir       : root directory under which petition data is stored
    driver_path    : not used by this function; kept so existing callers
                     that pass it positionally keep working
    base_directory : sub-folder of data_dir that groups all petition states
    state          : petition state whose folder should exist

    Returns
    -------
    The folder path as a string, ending with a path separator.
    """
    # The trailing '' makes os.path.join end the result with a separator,
    # which callers that build file names by concatenation rely on.
    data_path = os.path.join(data_dir, base_directory, state, '')
    # exist_ok avoids the check-then-create race and creates parents too.
    os.makedirs(data_path, exist_ok=True)
    return data_path


# Run configuration: petition state to fetch and where things live on disk
state = 'all'
data_dir = '/home/will/Datasets/'
driver_path = '/home/will/Projects/GovPetitionsUK/gov_uk_petitions_analysis/geckodriver'

# Make sure the output folder for this state exists and keep its path
data_path = setup_folders(data_dir=data_dir, driver_path=driver_path, state=state)

# Fetch the petition JSON for this state into that folder
scrape_data(data_path=data_path, driver_path=driver_path, state=state)

Check project folder for downloaded petition data

1. Set file paths
2. Check downloads folder