In [1]:
import json
from urllib.request import urlopen

In [2]:
# main indicies for hansard are as follows:
# http://hansard.millbanksystems.com/
# http://hansard.millbanksystems.com/volumes/
# http://hansard.millbanksystems.com/lords/
# http://hansard.millbanksystems.com/commons/
# http://hansard.millbanksystems.com/westminster_hall/
# http://hansard.millbanksystems.com/written_answers/
# http://hansard.millbanksystems.com/lords_reports/
# http://hansard.millbanksystems.com/grand_committee_report/
# http://hansard.millbanksystems.com/people/
# http://hansard.millbanksystems.com/constituencies/
# http://hansard.millbanksystems.com/offices/
# http://hansard.millbanksystems.com/acts/
# http://hansard.millbanksystems.com/bills/
# http://hansard.millbanksystems.com/divisions/

In [3]:
# sittings provides a list of all the sittings on a given date
sittings = json.loads(urlopen("http://hansard.millbanksystems.com/sittings/2001/nov/13.js").read().decode('utf-8'))
sittingKinds = [kind.keys() for kind in sittings]
# there are a bunch of different sittings on a given date:
sittingKinds

[dict_keys(['house_of_commons_sitting']),
 dict_keys(['westminster_hall_sitting']),
 dict_keys(['commons_written_answers_sitting']),
 dict_keys(['house_of_lords_sitting']),
 dict_keys(['lords_written_answers_sitting'])]

In [4]:
HANSARD_BASE_URL = "http://hansard.millbanksystems.com"
sitting_kind_url_fragments = {
    "house_of_commons_sitting":"/commons/",
    "westminster_hall_sitting":"/westminster_hall/",
    "commons_written_answers_sitting":"/written_answers/",
    "house_of_lords_sitting":"/lords/",
    "lords_written_answers_sitting":"/written_answers/",
}

In [5]:
# You can access sitting types through their own indicies
commons_july_20_2004 = json.loads(urlopen("http://hansard.millbanksystems.com/commons/2004/jul/20.js").read().decode('utf-8'))
# each sitting is split into a number of sections
sections_of_sitting = commons_july_20_2004[0]['house_of_commons_sitting']['top_level_sections']
# the sections contain a slug which allows access to the text of the section of sitting
sections_of_sitting[0]

{'section': {'date': '2004-07-20',
  'end_column': '135',
  'id': 3554642,
  'parent_section_id': 3554641,
  'sitting_id': 67804,
  'slug': 'preamble',
  'start_column': '135',
  'title': 'Preamble'}}

In [6]:
commons_july_20_2004_iraq_url = "http://hansard.millbanksystems.com/commons/2004/jul/20/" + sections_of_sitting[10]["section"]["slug"]
commons_july_20_2004_iraq_url

'http://hansard.millbanksystems.com/commons/2004/jul/20/iraq'

In [7]:
# the pages are not available as js, so time to parse some HTML
commons_july_20_2004_iraq = urlopen(commons_july_20_2004_iraq_url).read().decode('utf-8')

In [9]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(commons_july_20_2004_iraq)
print(soup.prettify())



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))


<!DOCTYPE doctype html>
<html lang="en-GB">
 <head>
  <meta charset="utf-8"/>
  <title>
   Iraq (Hansard, 20 July 2004)
  </title>
  <meta author="UK Parliament"/>
  <meta content="51ff727eff55314a" name="y_key"/>
  <link href="http://www.parliament.uk/site_information/parliamentary_copyright.cfm" rel="copyright"/>
  <meta content="Hansard, House of Commons, House of Lords, Parliament, UK, House of Commons sitting" name="keywords"/>
  <meta content="Iraq (Hansard, 20 July 2004)" name="description"/>
  <link href="../../../../stylesheets/screen.css" media="screen" rel="stylesheet" title="Default" type="text/css"/>
  <link href="../../../../stylesheets/print.css" media="print" rel="stylesheet" type="text/css"/>
 </head>
 <body id="hansard-millbanksytems-com">
  <div id="header">
   <div class="search-help">
    <a href="../../../../search">
     Search Help
    </a>
   </div>
   <form action="../../../../search" id="search" method="post" rel="search">
    <input accesskey="s" autosave="h

In [10]:
# TODO: use a better search here
contributions = soup.findAll("div", {"class":"hentry member_contribution"})

In [11]:
' '.join([para.text for para in contributions[0].findAll("p")])

"\n            I shall start with the Butler report and then move on to a more general discussion of Iraq. \n            I said at the outset last week that I fully accepted Lord Butler's conclusions, and there are now four things that I would like to announce as a result. First, there is an urgent need to fill the post of Chairman of the Joint Intelligence Committee and I have therefore asked Mr. William Ehrman, currently acting as a deputy chair, to take over the chairmanship of the JIC on an interim basis. He is currently director general for defence and intelligence in the Foreign Office, but he is expected to take up a further ambassadorial appointment next year. Meanwhile, the Cabinet Office will set about the task of making a permanent appointment, to take effect during 2005. That will be done fully in accordance with Lord Butler's criteria. \n            Secondly, prior to the war, meetings were held with an informal group, including the Foreign and Defence Secretaries, the Chi

In [36]:
contributions[0].findAll("cite")[0].find("a").attrs['href']

'/people/mr-tony-blair'

In [39]:
CONTRIBS = []
for contrib in contributions:
    text = ' '.join([para.text.strip() for para in contrib.findAll("p")])
    try:
        author = contrib.findAll("cite")[0].find("a").attrs['href']
    except:
        continue
    CONTRIBS.append([text, author])
    

In [40]:
import pandas as pd
sents = pd.DataFrame(CONTRIBS, columns=("text", "author"))
sents

Unnamed: 0,text,author
0,I shall start with the Butler report and then ...,/people/mr-tony-blair
1,"Before my right hon. Friend moves on, will he ...",/people/mr-harry-cohen
2,For the reasons that Lord Butler gives in his ...,/people/mr-tony-blair
3,rose—,/people/mr-alan-beith
4,rose—,/people/mr-alex-salmond
5,I shall come to the right hon. Gentleman in a ...,/people/mr-tony-blair
6,Dr. Jones minuted his concern on the matter th...,/people/mr-alan-beith
7,Without presuming exactly what the SIS and DIS...,/people/mr-tony-blair
8,rose—,/people/mr-alex-salmond
9,I am going to make some progress first. Much h...,/people/mr-tony-blair


In [42]:
# for all days that the archives have existed for.
for year in range(1803, 2006):
    for month in ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']:
        for day in range(1, 32):
            print('http://hansard.millbanksystems.com/sittings/%s/%s/%s' % (year, month, day))

http://hansard.millbanksystems.com/sittings/1803/jan/1
http://hansard.millbanksystems.com/sittings/1803/jan/2
http://hansard.millbanksystems.com/sittings/1803/jan/3
http://hansard.millbanksystems.com/sittings/1803/jan/4
http://hansard.millbanksystems.com/sittings/1803/jan/5
http://hansard.millbanksystems.com/sittings/1803/jan/6
http://hansard.millbanksystems.com/sittings/1803/jan/7
http://hansard.millbanksystems.com/sittings/1803/jan/8
http://hansard.millbanksystems.com/sittings/1803/jan/9
http://hansard.millbanksystems.com/sittings/1803/jan/10
http://hansard.millbanksystems.com/sittings/1803/jan/11
http://hansard.millbanksystems.com/sittings/1803/jan/12
http://hansard.millbanksystems.com/sittings/1803/jan/13
http://hansard.millbanksystems.com/sittings/1803/jan/14
http://hansard.millbanksystems.com/sittings/1803/jan/15
http://hansard.millbanksystems.com/sittings/1803/jan/16
http://hansard.millbanksystems.com/sittings/1803/jan/17
http://hansard.millbanksystems.com/sittings/1803/jan/18
h