Updated scraper

timgdavies · Apr 23, 2017 · 6506a8d · 6506a8d
1 parent 0d7fd63
commit 6506a8d
Showing 1 changed file with 19 additions and 8 deletions.
diff --git a/scraper.py b/scraper.py
@@ -6,9 +6,8 @@
 from contextlib import closing
 import requests
 import csv
+import time
 #
-# # Read in a page
-html = scraperwiki.scrape("http://foo.com")
 #
 # # Find something on the page using css selectors
 # root = lxml.html.fromstring(html)
@@ -26,28 +25,40 @@
 # called "data.sqlite" in the current working directory which has at least a table
 # called "data".
 
+print("Scraping list")
 html = scraperwiki.scrape("http://www.gloucestershire.gov.uk/council-and-democracy/performance-and-spending/spend-over-500/")
 
 soup = BeautifulSoup(html, 'html.parser')
 
 for a in soup.find_all('a'):
     if '.csv' in a.get('href'):
         url = "http://www.gloucestershire.gov.uk" + a.get('href')
+        # url = "http://localhost:8090/" + a.get('href').split("/")[3] # For local debugging
         print("Fetching "+ url)
         try:
             with closing(requests.get(url, stream=True)) as r:
                 f = (line.decode('utf-8') for line in r.iter_lines())
-                reader = csv.DictReader(f, delimiter=',', quotechar='"')
+                header_reader = csv.reader(f)
+                headers = next(header_reader)
+                if(headers[0] != 'Service Area'): # First line is not the expected header row: fall back to a best-guess column list. NOTE(review): if that line was a data row it has already been consumed by next() and is never saved - confirm this is acceptable
+                    headers = ['Service Area', 'BVA COP', 'Service Devison', 'Service Division Code', 'Expense Type', 'Expense Code', 'Payment Date', 'Transaction No', 'Payment Amount', 'Capital/Revenue', 'Supplier Name'] # the 'Devison' spelling is rewritten to 'division' by the tidy-up step below
+                for i in range(0,len(headers)): # Normalise the header variants we receive into consistent lower-case, underscore-separated column names
+                    headers[i] = headers[i].lower().replace("1","").replace("tax number","company number").replace("/","").replace(".","").replace("&","").replace("  "," ").replace(" ","_").replace("number_","number").replace("capital_revenue","capitalrevenue").replace("revenue_capital","capitalrevenue").replace("devison","division") # NOTE(review): replace("1","") removes every digit 1 anywhere in a header name - confirm that is intended
+                reader = csv.DictReader(f, delimiter=',', quotechar='"',fieldnames=headers) # f has already yielded its first line above, so this reader starts at the second line and uses the tidied names as keys
                 for row in reader:
-                    row['hash'] = hash(frozenset(row.items()))
-                    try: 
+                    row['transaction_ref'] = row['transaction_no'] + " - " + row["expense_code"] # Transaction numbers repeat across rows; transaction_no + expense_code appears to be unique, so the combination is used as the save key
+                    try: # drop the column whose header is an empty string, if there is one
                         del(row[''])
                     except Exception:
                         pass
                     try:
+                        scraperwiki.sqlite.save(unique_keys=['transaction_ref'],data=row,table_name='data')
+                    except Exception as e:
+                        print(e)
                         print("Failed to save row")
-        except Exception:
+        except Exception as e:
+            print(e)
             print "Failed to convert "+ url
 
+        time.sleep(1) # wait one second after handling each CSV link before moving to the next (presumably to go easy on the server)
+