# MP Expenses Project Web Scrapper
by Darren Christie
Created June 2020

This is a project notebook that supports the main MP Expenses Project. This notebook web scrapes the MP salary data off the IPSA web site.

In [148]:
# our imports for this notebook
import warnings
warnings.simplefilter('ignore', FutureWarning)

import requests
from bs4 import BeautifulSoup

from pandas import *

## The Data
The [salary data] (https://www.theipsa.org.uk/mp-costs/mps-pay-and-pensions/) has been obtained by scraping the IPSA web page. The data is then saved as a csv file in the data/processed folder ready to be picked up and used by the main notebook.

In [149]:
SALARYURL = 'https://www.theipsa.org.uk/mp-costs/mps-pay-and-pensions/'
STARTTAXYEAR = 2010
ENDTAXYEAR = 2020

In [150]:
# read in the web page from IPSA containing the salary data
r = requests.get(SALARYURL)
#print(r.text)
htmlContents = r.text
htmlSoup = BeautifulSoup(htmlContents,"html.parser")

## Process

In [151]:
# at the time of writing the first table on the page is the table we are interested in
firstTable = htmlSoup.find('table')

In [152]:
# fetch all the rows from the table, but skip the first one as that is the header which we are not interested in
rows = firstTable.findAll('tr')[1:]

In [153]:
currentYear = STARTTAXYEAR

In [154]:
mpSalary = DataFrame(columns=(['Tax Year','Salary']))

In [155]:
for row in rows:
    tempIndex = str(currentYear)+'/'+str(currentYear+1)
    salaryCol = row.findAll('td')[1:] # this skips the year column of the table, we are calculating this
    salary = salaryCol[0].text.strip() # strips any leading or trailing spaces
    mpSalary = mpSalary.append({'Tax Year': tempIndex, 'Salary': salary}, ignore_index=True)
    currentYear += 1

In [156]:
print(mpSalary)

     Tax Year    Salary
0   2010/2011  ​£65,738
1   2011/2012   £65,738
2   2012/2013   £65,738
3   2013/2014  ​£66,396
4   2014/2015  ​£67,060
5   2015/2016  ​£74,000
6   2016/2017  ​£74,962
7   2017/2018   £76,011
8   2018/2019   £77,379
9   2019/2020   £79,468
10  2020/2021   £81,932


In [157]:
# save the dataframe as a csv file
mpSalary.to_csv('data/processed/mpsalarydata.csv', index=False)