Skip to content

Commit

Permalink
Update scraper.py
Browse files — browse the repository at this point in the history
  • Loading branch information
yngveny committed Sep 25, 2017
1 parent 8afb70a commit d0dfb51
Showing 1 changed file with 9 additions and 14 deletions.
23 changes: 9 additions & 14 deletions scraper.py
Expand Up @@ -7,25 +7,20 @@
def scrape_dof(url):
html = scraperwiki.scrape(url)

#print html
root = lxml.html.fromstring(html)
#print root.find_class("div.notice-search-item")


#line below selects all <div class="notice-search-item">
rows = root.cssselect("div.notice-search-item")

for row in rows:
#print(row.text_content().encode("utf-8"))
#print(row.classes())
# Set up our data record - we'll need it later
record = {}
#a = row.cssselect("a") #grab all <a> tags within our <div>
#title = a[0].text

n = 0
for div in row.cssselect("div"):
print(n)
print(div.text_content().encode("utf-8"))
n = n+1

#n = 0
#for div in row.cssselect("div"):
# print(n)
# print(div.text_content().encode("utf-8"))
# n = n+1

element = row.cssselect("div")
title = element[1].text_content()
Expand All @@ -44,7 +39,7 @@ def scrape_dof(url):
# Finally, save the record to the datastore - 'Name' is our unique key
scraperwiki.sqlite.save(["Dofref"], record)

doflist = ['www.doffin.no/Notice?query=&PageNumber=1&PageSize=10&OrderingType=0&OrderingDirection=1&RegionId=&CountyId=&MunicipalityId=&IsAdvancedSearch=false&location=&NoticeType=3&PublicationType=&IncludeExpired=false&Cpvs=&EpsReferenceNr=&DeadlineFromDate=&DeadlineToDate=&PublishedFromDate=&PublishedToDate=']
doflist = ['www.doffin.no/Notice?query=&PageNumber=1&PageSize=20&OrderingType=0&OrderingDirection=1&RegionId=&CountyId=&MunicipalityId=&IsAdvancedSearch=false&location=&NoticeType=3&PublicationType=&IncludeExpired=false&Cpvs=&EpsReferenceNr=&DeadlineFromDate=&DeadlineToDate=&PublishedFromDate=&PublishedToDate=']
for url in doflist:
fullurl = 'http://'+url
print 'scraping ', fullurl
Expand Down

0 comments on commit d0dfb51

Please sign in to comment.