Commit

Update scraper.py
tobiblix100 committed May 21, 2017
1 parent 49e5500 commit 8c5a350
Showing 1 changed file with 20 additions and 20 deletions.
40 changes: 20 additions & 20 deletions scraper.py
@@ -1,25 +1,25 @@
 ###############################################################################
 # START HERE: Tutorial for scraping pages behind form, using the
 # very powerful Mechanize library. Documentation is here:
 # http://wwwsearch.sourceforge.net/mechanize/
 ###############################################################################
-import mechanize
+import scraperwiki
+import urllib
+import lxml.html
 
-lotterygrantsurl = "http://www.lottery.culture.gov.uk/AdvancedSearch.aspx"
+def scrape_links(root):
+    thumbimgs = root.cssselect("div.thumb-image-box")
+    for thumbimg in thumbimgs:
+        thumblink = thumbimg.cssselect("a")
+        if thumblink:
+            thumblinks = thumblink[0].attrib.get("href")
+            thumburl = baseurl+thumblinks
+            print thumburl
 
-br = mechanize.Browser()
-response = br.open(lotterygrantsurl)
+starting_url = "http://medcell.med.yale.edu/image_gallery/home.php"
+baseurl = "http://medcell.med.yale.edu/image_gallery/"
 
-print "All forms:", [ form.name for form in br.forms() ]
+def scrape_and_look_for_next_link(url):
+    html = scraperwiki.scrape(url)
+    root = lxml.html.fromstring(html)
+    scrape_links(root)
 
-br.select_form(name="aspnetForm")
-print br.form
-
-br["ctl00$phMainContent$dropDownAwardDate"] = ["Between"]
-br["ctl00$phMainContent$txtGrantDateFrom"] = "01/01/2004"
-br["ctl00$phMainContent$txtGrantDateTo"] = "20/01/2004"
-print br
-
-response = br.submit()
-print response.read()
+scrape_and_look_for_next_link(starting_url)
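
For readers skimming the diff, the sketch below restates scraper.py as it stands after this commit, with explanatory comments added. It is only an annotated copy of the added lines above, not new behaviour, and it keeps the Python 2 / ScraperWiki Classic style of the original.

# Annotated restatement of scraper.py as of this commit (Python 2,
# ScraperWiki Classic). Same logic as the added lines in the diff above.
import scraperwiki   # scraperwiki.scrape() fetches a page over HTTP
import urllib        # imported in the commit but not used yet
import lxml.html     # HTML parsing plus CSS selectors via cssselect

starting_url = "http://medcell.med.yale.edu/image_gallery/home.php"
baseurl = "http://medcell.med.yale.edu/image_gallery/"

def scrape_links(root):
    # Each gallery thumbnail sits inside a div.thumb-image-box; take the
    # first <a> in it and make its relative href absolute with baseurl.
    thumbimgs = root.cssselect("div.thumb-image-box")
    for thumbimg in thumbimgs:
        thumblink = thumbimg.cssselect("a")
        if thumblink:
            thumblinks = thumblink[0].attrib.get("href")
            thumburl = baseurl + thumblinks
            print thumburl

def scrape_and_look_for_next_link(url):
    # Fetch and parse one gallery page, then print its thumbnail links.
    # Despite the name, no "next" link is followed at this stage, and
    # nothing is saved to the datastore; links are only printed.
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    scrape_links(root)

scrape_and_look_for_next_link(starting_url)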
