# scraper.py — Doffin notice-search scraper (ScraperWiki classic, Python 2)
# encoding: utf-8
import scraperwiki
import urlparse
import lxml.html
# create a new function, which gets passed a variable we're going to call 'url'
def scrape_dof(url):
    """Scrape one Doffin notice search-results page into the datastore.

    Fetches `url`, extracts one record per <div class="notice-search-item">,
    and saves each record to the ScraperWiki SQLite datastore with 'DofRef'
    as the unique key.
    """
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    # Each search hit is rendered as a <div class="notice-search-item">.
    for row in root.cssselect("div.notice-search-item"):
        # The first <a> inside the item links to the notice detail page.
        link1 = row.cssselect("a")[0].get('href')
        # Field values live in the item's child <div>s at fixed positions.
        # NOTE(review): positions assumed stable in Doffin's markup — verify
        # against a live page if the site layout changes.
        element = row.cssselect("div")
        record = {
            'DofRef': element[6].text_content(),
            'Title': element[1].text_content(),
            'Klient': element[3].text_content(),
            'Kungj_type': element[4].text_content(),
            'Kungj_dato': element[7].text_content(),
            'Link': link1,
        }
        # Save with 'DofRef' as the unique key. BUG FIX: the original passed
        # ["Dofref"], which does not match the record key 'DofRef' — the
        # unique-key list must be a subset of the record's keys.
        scraperwiki.sqlite.save(["DofRef"], record)
doflist = ['www.doffin.no/Notice?query=&PageNumber=1&PageSize=20&OrderingType=0&OrderingDirection=1&NoticeType=3&IncludeExpired=false']
for url in doflist:
fullurl = 'http://'+url
print 'scraping ', fullurl
scrape_dof(fullurl)
print 'and done'