Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Newer
Older
100755 87 lines (66 sloc) 3.092 kB
ea5c84c Last working cleanup approach
Vasily Ponomarev authored
1 #!/usr/bin/env python
548f3f6 Initial commit
Vasily Ponomarev authored
2
3 import time
4 import random
5 import sys
6 from urllib import FancyURLopener
d592bf4 Different approach - entries processing
Vasily Ponomarev authored
7 from jinja2 import Environment, PackageLoader
ea5c84c Last working cleanup approach
Vasily Ponomarev authored
8 from bs4 import BeautifulSoup
548f3f6 Initial commit
Vasily Ponomarev authored
9
ea5c84c Last working cleanup approach
Vasily Ponomarev authored
10 # Fancy User-Agent string
class MyOpener(FancyURLopener):
    """FancyURLopener subclass that carries a custom User-Agent string."""

    # The opener sends `version` as its User-Agent; the value below imitates
    # desktop Chrome 22 on Windows 7.
    version = (
        'Mozilla/5.0 (Windows NT 6.1; WOW64) '
        'AppleWebKit/537.4 (KHTML, like Gecko) '
        'Chrome/22.0.1229.79 Safari/537.4'
    )
548f3f6 Initial commit
Vasily Ponomarev authored
myopener = MyOpener()

# Prepare template
env = Environment(loader=PackageLoader('__main__', 'templates'))
template = env.get_template('zakupki.tmpl')

# Prepare array for data storage
entries = []

# Search-results URL; the page index is inserted between the two halves.
SEARCH_URL_PREFIX = 'http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index='
SEARCH_URL_SUFFIX = '&sortField=lastEventDate&descending=true&tabName=AP&lotView=false&pageX=&pageY='

# Inclusive range of result pages to request.
FIRST_PAGE = 1
LAST_PAGE = 99

# CSS class strings used to locate cells inside a result row.
DATE_CELL_CLASS = 'iceDatTblCol searchResultTableCol searchResultColumn tableColumn70'
PRICE_CELL_CLASS = 'iceDatTblCol searchResultTableCol searchResultColumn tableColumn105'


def _progress(text):
    """Write *text* to stdout without a trailing newline and flush it."""
    sys.stdout.write(text)
    sys.stdout.flush()


def _parse_row(row):
    """Return a dict of the fields extracted from one result-table row.

    Any missing element raises (AttributeError / IndexError / TypeError), so
    a change in the page markup is noticed instead of silently producing
    incomplete entries.
    """
    entry = {}

    # Extract description type
    type_span = row.find('span', 'blueBold')
    entry['description_type'] = type_span.string.strip()

    # Extract description number and link
    number_link = type_span.parent.a
    entry['description_number_href'] = number_link.get('href')
    entry['description_number'] = number_link.span.string.strip()

    # The text and organisation links are looked up from the type span's
    # great-grandparent; resolve it once instead of once per field.
    description = type_span.parent.parent.parent

    # Extract description text and link
    text_link = description.select('.iceOutLnk')[0]
    entry['description_text_href'] = text_link.get('href')
    entry['description_text'] = text_link.string.strip()

    # Extract description org text and link (second '.iceCmdLnk' match)
    org_link = description.select('.iceCmdLnk')[1]
    entry['description_org_href'] = org_link.get('href')
    entry['description_org'] = org_link.span.string.strip()

    # Extract published and updated
    dates = row.find_all('td', DATE_CELL_CLASS)
    entry['published'] = dates[0].span.string.strip()
    updated_link = dates[1].a
    entry['updated_href'] = updated_link.get('href')
    entry['updated'] = updated_link.span.string.strip()

    # Extract price
    entry['price'] = row.find('td', PRICE_CELL_CLASS).span.string.strip()

    # Extract additional info
    # NOTE(review): href and link text come straight from the scraped page and
    # are inserted into the HTML unescaped -- confirm the template/consumer is
    # fine with that.
    entry['additional'] = u''.join(
        '<a href="http://zakupki.gov.ru' + link.get('href') + '" target="_blank">' + link.span.string + '</a><br/>'
        for link in row.find('table', 'tableColumn70').find_all('a')
    )

    return entry


# Process all pages
for pagenum in range(FIRST_PAGE, LAST_PAGE + 1):

    # Get page handler
    _progress('Fetching page ' + str(pagenum) + '...')
    response = myopener.open(SEARCH_URL_PREFIX + str(pagenum) + SEARCH_URL_SUFFIX)
    sys.stdout.write(' Done!\n')

    # Parse page contents; release the connection even if parsing fails
    try:
        doc = BeautifulSoup(response)
    finally:
        response.close()

    # Get content
    table = doc.find('table', 'searchResultTable iceDatTbl')
    if table is None:
        # No results table on this page (e.g. the index is past the last
        # page). Stop here so the entries collected so far still reach the
        # report instead of being lost to an AttributeError.
        sys.stdout.write('No results table on page ' + str(pagenum) + ', stopping.\n')
        break

    for row in table.find_all('tr', 'searchResultTableRow'):
        # Add new entry
        entries.append(_parse_row(row))

    # Random pause to confuse checking tools
    pause = random.randint(1, 2)
    _progress('Sleeping for ' + str(pause) + ' seconds...')
    time.sleep(pause)
    sys.stdout.write(' OK\n')

# Output
# Binary mode because the rendered template is written as UTF-8 encoded bytes;
# the context manager closes the file even if rendering or writing raises.
with open('zakupki.html', 'wb') as report:
    report.write(template.render(entries=entries).encode('utf-8'))
Something went wrong with that request. Please try again.