In [2]:
import os, sys

# Locate the repository root (the parent of the current working directory)
# and make the helpers under lib/utils importable.
git_dir = os.path.abspath('../')
sys.path.append(os.path.join(git_dir, 'lib', 'utils'))

# Directory that receives the harvested Hansard URL lists
data_output_path = os.path.join(git_dir, 'data', 'remotes', 'hansard-urls')

# Create the output directory. exist_ok=True replaces the separate
# "does it exist?" check, so there is no window between checking and creating
# and the cell can be re-run safely.
os.makedirs(data_output_path, exist_ok=True)

# Get the URLs for downloading the Hansard for a particular year

Most of the work was done in the repository linked below. Thanks, Tim Sherratt, you're amazing! (https://timsherratt.org)
https://github.com/GLAM-Workbench/australian-commonwealth-hansard

In [3]:
import re
import os
import time
import math
import requests
import arrow
import csv
import pandas as pd
from tqdm import tqdm
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Shared HTTP session used by every request in this notebook.
# Responses with status 502, 503 or 504 are retried (at most 5 retries in
# total, with a backoff factor of 1) instead of being returned immediately.
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[502, 503, 504],
)
s = requests.Session()
# One adapter per URL scheme, both carrying the same retry policy
s.mount('https://', HTTPAdapter(max_retries=retries))
s.mount('http://', HTTPAdapter(max_retries=retries))

In [4]:
# ParlInfo search-result URL templates, keyed by house.
# Every template keeps two placeholders for a later str.format() call:
# {page} (results page number) and {query} (already URL-encoded search
# expression). The houses differ only in the Dataset filter that is appended
# to the query; each page is asked for 100 results (resCount=100).
_SEARCH_TEMPLATE = (
    'http://parlinfo.aph.gov.au/parlInfo/search/summary/summary.w3p;'
    'adv=yes;orderBy=date-eLast;page={{page}};'
    'query={{query}}%20Dataset%3A{datasets};resCount=100')

URLS = {
    house: _SEARCH_TEMPLATE.format(datasets=datasets)
    for house, datasets in (
        ('hofreps', 'hansardr,hansardr80'),
        ('senate', 'hansards,hansards80'),
    )
}

In [5]:
# Write a list of dictionaries to a CSV file
# https://stackoverflow.com/questions/3086973/how-do-i-convert-this-list-of-dictionaries-to-a-csv-file
def dict_to_csv(input_dict : list, output_file : str):
    '''
    Write a sequence of row dictionaries to a CSV file.

    Parameters:
        input_dict: the rows to write. Despite the name this is an iterable
            of dictionaries (one per row), not a single dictionary.
        output_file: path of the CSV file to create or overwrite.

    The header is the union of the keys of all rows in first-seen order, so
    rows may carry different keys; a key missing from a row is written as an
    empty cell. Empty input produces an empty file instead of an IndexError.
    '''
    rows = list(input_dict)
    # dict.fromkeys keeps insertion order, giving a stable, de-duplicated header
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))
    # newline='' lets the csv module control line endings itself
    with open(output_file, 'w', newline='') as of:
        if not rows:
            # Nothing to write: leave the (now truncated) file empty
            return
        dict_writer = csv.DictWriter(of, fieldnames)
        dict_writer.writeheader()
        dict_writer.writerows(rows)


def get_total_results(house, query, timeout=60):
    '''
    Get the total number of results in the search.

    Parameters:
        house: key into URLS ('hofreps' or 'senate').
        query: search expression, already URL-encoded, inserted into the
            ParlInfo url as-is.
        timeout: seconds passed to the HTTP request as its timeout, so a
            stalled connection raises instead of hanging the notebook.

    Returns:
        The total as an int, or 0 when the results summary is missing from
        the page or does not have the expected shape.
    '''
    # Insert query and page values into the ParlInfo url
    url = URLS[house].format(query=query, page=0)
    # Get the results page
    response = s.get(url, timeout=timeout)
    # Parse the HTML
    # NOTE(review): no parser is named here, so BeautifulSoup chooses one from
    # whatever is installed -- consider pinning one once the output is verified.
    soup = BeautifulSoup(response.text)
    try:
        # Find where the total results are given in the HTML
        summary = soup.find('div', 'resultsSummary').contents[1].string
        # Extract the number of results from the string
        total = re.search(r'of (\d+)', summary).group(1)
    except (AttributeError, IndexError, TypeError):
        # AttributeError: no summary div, or no "of N" text in it
        # IndexError:     the summary div has fewer than two children
        # TypeError:      the second child has no single string (.string is None)
        total = 0
    return int(total)

def get_xml_url(url, timeout=60):
    '''
    Extract the XML file url from an individual result.

    Parameters:
        url: address of the individual result page.
        timeout: seconds passed to the HTTP request as its timeout, so a
            stalled connection raises instead of hanging the notebook.

    Returns:
        The href of the first link whose href contains 'toc_unixml', exactly
        as it appears in the page.

    Raises:
        TypeError: if the page contains no such link.
    '''
    # Load the page for an individual result
    response = s.get(url, timeout=timeout)
    # Parse the HTML
    soup = BeautifulSoup(response.text)
    # Find the XML url by looking for a pattern in the href
    link = soup.find('a', href=re.compile('toc_unixml'))
    if link is None:
        # Subscripting the missing link used to fail with an opaque
        # "'NoneType' object is not subscriptable". The exception type is kept
        # for existing callers, but the message now names the page.
        raise TypeError('No toc_unixml link found on {}'.format(url))
    return link['href']

def number_of_results(house, year):
    '''
    Count the search results for one house in one calendar year.

    Parameters:
        house: key into URLS ('hofreps' or 'senate').
        year: the year to search, from 1 January to 31 December.

    Returns:
        Whatever get_total_results reports for that search. No result pages
        are walked and no XML links are collected here.
    '''
    # Format the start and end dates (%2F is an encoded "/")
    start_date = '01%2F01%2F{}'.format(year)
    end_date = '31%2F12%2F{}'.format(year)
    # Prepare the query value using the start and end dates
    query = 'Date%3A{}%20>>%20{}'.format(start_date, end_date)
    # Get the total results
    return get_total_results(house, query)
    
def harvest_year(house, year, timeout=60):
    '''
    Loop through a search by house and year, finding all the urls for XML files.

    Parameters:
        house: key into URLS ('hofreps' or 'senate').
        year: the year to search, from 1 January to 31 December.
        timeout: seconds passed as the timeout of each results-page request,
            so a stalled connection raises instead of hanging the notebook.

    Returns:
        A list of {'date': 'YYYY-MM-DD', 'url': ...} dictionaries, one per
        distinct date, in the order the dates are first encountered.

    A result whose XML link cannot be found is reported and skipped rather
    than aborting the whole year; its date stays available so that a later
    result for the same date can still supply the link.
    '''
    # Format the start and end dates
    start_date = '01%2F01%2F{}'.format(year)
    end_date = '31%2F12%2F{}'.format(year)
    # Prepare the query value using the start and end dates
    query = 'Date%3A{}%20>>%20{}'.format(start_date, end_date)
    # Get the total results
    total_results = get_total_results(house, query)
    xml_urls = []
    # Dates already harvested; a set keeps the membership test cheap
    seen_dates = set()
    if total_results > 0:
        # Calculate the number of pages in the results set (100 results per page)
        num_pages = int(math.ceil(total_results / 100))
        # Loop through the page range
        # NOTE(review): this requests num_pages + 1 pages (0..num_pages). A
        # repeated or empty page is harmless because results are de-duplicated
        # by date -- confirm ParlInfo's page numbering before trimming the range.
        for page in range(0, num_pages + 1):
            # Get the next page of results
            url = URLS[house].format(query=query, page=page)
            response = s.get(url, timeout=timeout)
            # Parse the HTML
            soup = BeautifulSoup(response.text)
            # Find the list of results and loop through them
            for result in soup.find_all('div', 'resultContent'):
                # Try to identify the date
                try:
                    date = re.search(r'Date: (\d{2}\/\d{2}\/\d{4})', result.find('div', 'sumMeta').get_text()).group(1)
                    date = arrow.get(date, 'DD/MM/YYYY').format('YYYY-MM-DD')
                except (AttributeError, ValueError):
                    # There are some dodgy dates -- we'll just ignore them.
                    # AttributeError: no metadata div or no date text in it;
                    # ValueError: text that looks like a date but is not one.
                    date = None
                # Only dates we haven't harvested yet need any further work
                if not date or date in seen_dates:
                    continue
                try:
                    # Get the link to the individual result page
                    # This is where the XML file links live
                    result_link = result.find('div', 'sumLink').a['href']
                    # Get the XML file link from the individual record page
                    xml_url = get_xml_url(result_link)
                except (AttributeError, KeyError, TypeError) as exc:
                    # One result without a usable link must not discard the
                    # rest of the year: report it and move on.
                    print('Skipping {} result for {}: {!r}'.format(house, date, exc))
                else:
                    # Save dates and links; the date is only marked as seen
                    # once its XML link has actually been found
                    seen_dates.add(date)
                    xml_urls.append({'date': date, 'url': 'https://parlinfo.aph.gov.au{}'.format(xml_url)})
                # Pause between record-page requests
                time.sleep(1)
            # Pause between results-page requests
            time.sleep(1)
    return xml_urls

In [7]:
# NOTE(review): YEAR is not referenced by any of the cells shown here.
YEAR = 2018

def _harvest_or_none(house : str, year : int):
    '''Harvest one house/year; on failure report the error and return None.'''
    try:
        return harvest_year(house, year)
    except Exception as exc:
        # Exception (not a bare except) so that interrupting the kernel still
        # stops a long run, and the reason for the failure is shown.
        print("Harvest failed for {} {}: {!r}".format(house, year, exc))
        return None

def get_and_write_urls(year : int):
    '''
    Harvest the XML urls of both houses for one year and write them to disk.

    For each house a file named '<year>-au-hansard-<house>.csv' is written
    into data_output_path, holding one url per line (no header); everything
    from the first ';' onwards is stripped from each url.

    A house whose harvest failed is counted as 0 in the printed summary and
    its file is left untouched, so an earlier successful harvest is not
    overwritten by an empty file.
    '''
    print("Processing Year {}".format(year))
    senate_urls = _harvest_or_none('senate', year)
    horeps_urls = _harvest_or_none('hofreps', year)

    print("Number of results: Senate => {} House of Reps => {}".format(
        len(senate_urls or []), len(horeps_urls or [])))

    outputs = (
        (senate_urls, "{}-au-hansard-senate.csv".format(year)),
        (horeps_urls, "{}-au-hansard-hofreps.csv".format(year)),
    )
    for urls, file_name in outputs:
        if urls is None:
            # The harvest failed: keep whatever file is already there
            continue
        with open(os.path.join(data_output_path, file_name), "w") as output:
            # Keep only the part of each url before its first ';'
            output.writelines(item['url'].split(";")[0] + '\n' for item in urls)

In [None]:
# Harvest newest-first: 2008 down to 1979 inclusive
for y in range(2008, 1978, -1):
    get_and_write_urls(y)

Processing Year 2008
Number of results: Senate => 0 House of Reps => 69
Processing Year 2007
Number of results: Senate => 41 House of Reps => 0
Processing Year 2006
