/
scraper.py
50 lines (43 loc) · 1.92 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# encoding: utf-8
import scraperwiki
import urlparse
import lxml.html
def scrape_dof(url):
    """Scrape one Doffin notice-search results page.

    Fetches `url`, parses the HTML, and saves one record per
    <div class="notice-search-item"> into the scraperwiki datastore,
    using 'Title' as the unique key.

    :param url: absolute URL of a Doffin search-results page.
    """
    html = scraperwiki.scrape(url)
    root = lxml.html.fromstring(html)
    # Each search result is wrapped in <div class="notice-search-item">.
    rows = root.cssselect("div.notice-search-item")
    for row in rows:
        anchors = row.cssselect("a")  # all <a> tags within this result div
        if not anchors:
            # Malformed / empty result block - nothing we can record.
            continue
        title = anchors[0].text
        # BUG FIX: lxml elements have no `.link` attribute (the original
        # `a[0].link` raised AttributeError). Read the href attribute and
        # resolve it against the page URL so relative links become absolute.
        link = urlparse.urljoin(url, anchors[0].get('href'))
        record = {}
        record['Title'] = title
        record['Link'] = link
        # Save the record to the datastore - 'Title' is our unique key.
        scraperwiki.sqlite.save(["Title"], record)
doflist = ['www.doffin.no/Notice?query=&PageNumber=1&PageSize=30&OrderingType=0&OrderingDirection=1&RegionId=&CountyId=&MunicipalityId=&IsAdvancedSearch=false&location=&NoticeType=3&PublicationType=&IncludeExpired=false&Cpvs=&EpsReferenceNr=&DeadlineFromDate=&DeadlineToDate=&PublishedFromDate=&PublishedToDate=']
for url in doflist:
fullurl = 'http://'+url
print 'scraping ', fullurl
scrape_dof(fullurl)