Skip to content
Permalink
Browse files

first port

  • Loading branch information
woodbine committed Apr 22, 2015
1 parent cc869b2 commit 3b90fc5db6590e17e3cb3f5db2ade75809f01947
Showing with 38 additions and 21 deletions.
  1. +38 −21 scraper.py
@@ -1,23 +1,40 @@
# This is a template for a Python scraper on morph.io (https://morph.io)
# including some code snippets below that you should find helpful
# -*- coding: utf-8 -*-

# import scraperwiki
# import lxml.html
#
# # Read in a page
# html = scraperwiki.scrape("http://foo.com")
#
# # Find something on the page using css selectors
# root = lxml.html.fromstring(html)
# root.cssselect("div[align='left']")
#
# # Write out to the sqlite database using scraperwiki library
# scraperwiki.sqlite.save(unique_keys=['name'], data={"name": "susan", "occupation": "software developer"})
#
# # An arbitrary query against the database
# scraperwiki.sql.select("* from data where 'name'='peter'")
import scraperwiki
import urllib2
from datetime import datetime
from bs4 import BeautifulSoup

# You don't have to do things with the ScraperWiki and lxml libraries. You can use whatever libraries are installed
# on morph.io for Python (https://github.com/openaustralia/morph-docker-python/blob/master/pip_requirements.txt) and all that matters
# is that your final data is written to an Sqlite database called data.sqlite in the current working directory which
# has at least a table called data.
# Set up variables
entity_id = "E2221_KCC_gov"
url = "http://www.kent.gov.uk/about-the-council/finance-and-budget/spending/invoices-over-250"

# Set up functions
def convert_mth_strings ( mth_string ):
    """Replace any three-letter month abbreviation in *mth_string* with its
    two-digit month number (e.g. 'JAN' -> '01') and return the result.

    Occurrences are substituted anywhere in the string; text that matches
    no abbreviation is left untouched.
    """
    abbrev_to_number = {
        'JAN': '01', 'FEB': '02', 'MAR': '03', 'APR': '04',
        'MAY': '05', 'JUN': '06', 'JUL': '07', 'AUG': '08',
        'SEP': '09', 'OCT': '10', 'NOV': '11', 'DEC': '12',
    }
    # Substitute each known abbreviation in turn; order is irrelevant because
    # the numeric replacements can never match another abbreviation key.
    for abbrev, number in abbrev_to_number.items():
        mth_string = mth_string.replace(abbrev, number)
    return mth_string

# pull down the content from the webpage
html = urllib2.urlopen(url)
soup = BeautifulSoup(html)

# find all entries with the required class
links = soup.findAll('a', href=True)

for link in links:
url = link['href']
if '.csv' in url:
title = url.split('/')[-1]
# create the right strings for the new filename
csvYr = title.split('-')[2]
csvMth = title.split('-')[1][:3]
csvMth = csvMth.upper()
csvMth = convert_mth_strings(csvMth);
filename = entity_id + "_" + csvYr + "_" + csvMth + ".csv"
todays_date = str(datetime.now())
scraperwiki.sqlite.save(unique_keys=['l'], data={"l": url, "f": filename, "d": todays_date })
print filename

0 comments on commit 3b90fc5

Please sign in to comment.
You can’t perform that action at this time.