From 9fa9555bd5c5646c36baada0f6551b58871f1181 Mon Sep 17 00:00:00 2001
From: Lucia Walinchus
Date: Wed, 14 Jun 2017 10:05:53 -0500
Subject: [PATCH] Create scraper.py

---
 scraper.py | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/scraper.py b/scraper.py
index 458dad0..28af06d 100644
--- a/scraper.py
+++ b/scraper.py
@@ -115,11 +115,10 @@ def scrape_and_look_for_next_link(url):
     root = lxml.html.fromstring(html)
     scrape_table(root)
     global i
-    for i in range(1, 10):
-        i = (i + 1)
-        next_url = base_url+'GetCaseInformation.aspx?db=garfield&number=CF-2011-'+str(i)
-        print next_url
-        scrape_and_look_for_next_link(next_url)
+    i = (i + 1)
+    next_url = base_url+'GetCaseInformation.aspx?db=garfield&number=CF-2011-'+str(i)
+    print next_url
+    scrape_and_look_for_next_link(next_url)
 
 # ---------------------------------------------------------------------------
 # START HERE: define your starting URL - then
@@ -129,9 +128,9 @@ def scrape_and_look_for_next_link(url):
 starting_url = urlparse.urljoin(base_url, 'GetCaseInformation.aspx?db=garfield&number=CF-2011-1')
 print starting_url
 global i
-#for i in range(1,10):
+for i in range(1,10):
 #There are 743 cases but 468 appears to be the server request limit
-scrape_and_look_for_next_link(starting_url)
+    scrape_and_look_for_next_link(starting_url)
 #
 # Read in a page
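
Below is a minimal sketch, not the committed scraper.py, of the case-number iteration this patch is driving toward: build the CF-2011-1 through CF-2011-9 URLs in a single module-level loop and fetch each GetCaseInformation.aspx page once, instead of mixing a global counter, recursion inside scrape_and_look_for_next_link(), and an outer for loop. The base_url value, the urllib2 fetch, and the scrape_case() helper are assumptions standing in for code not shown in these hunks; the real scraper defines base_url and a scrape_table() helper elsewhere in the file.

# Sketch only -- assumes Python 2 (matching the print statements and urlparse
# usage in the diff) and a placeholder base_url; scrape_table() from the real
# scraper.py is omitted here.
import urllib2
import urlparse

import lxml.html

base_url = 'http://example.org/oscn/'  # placeholder, not the value defined in scraper.py

def scrape_case(url):
    """Fetch one case page and return the parsed lxml tree."""
    html = urllib2.urlopen(url).read()
    root = lxml.html.fromstring(html)
    # In the real scraper, scrape_table(root) would pull the case rows out here.
    return root

# Same URLs the patch builds: CF-2011-1 through CF-2011-9.
for i in range(1, 10):
    case_url = urlparse.urljoin(
        base_url, 'GetCaseInformation.aspx?db=garfield&number=CF-2011-' + str(i))
    print case_url
    scrape_case(case_url)

Keeping the loop variable in one place like this is one way to avoid re-requesting the same case pages, which the combination of a module-level for loop, a global i increment, and recursion inside the scrape function can otherwise produce.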