Skip to content

Commit

Permalink
Update scraper.py
Browse files — browse the repository at this point in the history
  • Loading branch information
yngveny committed Sep 25, 2017
1 parent 8afb70a commit d0dfb51
Showing 1 changed file with 9 additions and 14 deletions.
23 changes: 9 additions & 14 deletions scraper.py
Expand Up @@ -7,25 +7,20 @@
def scrape_dof(url):
html = scraperwiki.scrape(url)

#print html
root = lxml.html.fromstring(html)
#print root.find_class("div.notice-search-item")


#line below selects all <div class="notice-search-item">
rows = root.cssselect("div.notice-search-item")

for row in rows:
#print(row.text_content().encode("utf-8"))
#print(row.classes())
# Set up our data record - we'll need it later
record = {}
#a = row.cssselect("a") #grab all <a> tags within our <div>
#title = a[0].text

n = 0
for div in row.cssselect("div"):
print(n)
print(div.text_content().encode("utf-8"))
n = n+1

#n = 0
#for div in row.cssselect("div"):
# print(n)
# print(div.text_content().encode("utf-8"))
# n = n+1

element = row.cssselect("div")
title = element[1].text_content()
Expand All @@ -44,7 +39,7 @@ def scrape_dof(url):
# Finally, save the record to the datastore - 'Name' is our unique key
scraperwiki.sqlite.save(["Dofref"], record)

doflist = ['www.doffin.no/Notice?query=&PageNumber=1&PageSize=10&OrderingType=0&OrderingDirection=1&RegionId=&CountyId=&MunicipalityId=&IsAdvancedSearch=false&location=&NoticeType=3&PublicationType=&IncludeExpired=false&Cpvs=&EpsReferenceNr=&DeadlineFromDate=&DeadlineToDate=&PublishedFromDate=&PublishedToDate=']
doflist = ['www.doffin.no/Notice?query=&PageNumber=1&PageSize=20&OrderingType=0&OrderingDirection=1&RegionId=&CountyId=&MunicipalityId=&IsAdvancedSearch=false&location=&NoticeType=3&PublicationType=&IncludeExpired=false&Cpvs=&EpsReferenceNr=&DeadlineFromDate=&DeadlineToDate=&PublishedFromDate=&PublishedToDate=']
for url in doflist:
fullurl = 'http://'+url
print 'scraping ', fullurl
Expand Down

0 comments on commit d0dfb51

Please sign in to comment.