Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Vasily Ponomarev
committed
Oct 5, 2012
1 parent
1e95e03
commit ea5c84c
Showing
5 changed files
with
106 additions
and
25 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
# Ignore vim TMP files | ||
*.swp | ||
|
||
# Ignore output file | ||
zakupki.html |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
grab-zakupki-gov | ||
================ | ||
|
||
grab-zakupki-gov (gzg) processes the http://zakupki.gov.ru website and collects information about government purchases. Currently it works for Moscow only. | |
|
||
Details | ||
------- | ||
|
||
gzg bypasses the website's built-in limit of 500 entries per export and can process as many entries as the website provides. | |
|
||
The output HTML file can be opened in any web browser and searched for current and past purchasing tenders. Very useful for building government purchasing history trends and finding patterns. | |
|
||
Contacts | ||
======== | ||
http://vasil-y.com | ||
|
||
https://github.com/vasily-ponomarev |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,43 +1,90 @@ | ||
#!/usr/bin/python | ||
#!/usr/bin/env python | ||
|
||
import time | ||
import random | ||
import sys | ||
from urllib import FancyURLopener | ||
from BeautifulSoup import BeautifulSoup | ||
from bs4 import BeautifulSoup | ||
|
||
# fancy User-Agent string | ||
class MyOpener(FancyURLopener): | ||
version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.215 Safari/535.1' | ||
def data_cleanup(data):
    """Strip scraped presentation markup from ``data``, in place.

    ``data`` is the element returned by ``doc.find('table', ...)``; it must
    support item assignment/deletion for attributes and ``find_all()``
    (presumably a BeautifulSoup ``Tag`` -- confirm against the caller).

    The element is modified in place and nothing is returned.
    """
    # Style the element that was passed in. Previously this wrote to the
    # module-level name ``table``, which only worked because the caller
    # happened to use that global name. The class was also misspelled
    # ('table-stripped'); the report header links Bootstrap's stylesheet,
    # whose striping class is 'table-striped'.
    data['class'] = 'table table-striped table-bordered table-hover'

    # Delete all images
    for img in data.find_all('img'):
        img.decompose()

    # Delete all JS
    for script in data.find_all('script'):
        script.decompose()

    # Unwrap all spans (keep their contents, drop the tag itself)
    for span in data.find_all('span'):
        span.unwrap()

    # Links cleanup and add absolute paths
    for a in data.find_all('a'):
        href = a.get('href')
        if href is None or href == 'javascript:':
            # Not a real link: keep the text, drop the anchor.
            a.unwrap()
        else:
            # NOTE(review): assumes every remaining href is site-relative;
            # an already absolute URL would get the host prefixed twice.
            a['href'] = 'http://zakupki.gov.ru' + str(href)
            for attr in ('class', 'style', 'align',
                         'onclick', 'onmouseout', 'onmouseover'):
                del a[attr]

    # Delete all inputs ('field' avoids shadowing the ``input`` builtin)
    for field in data.find_all('input'):
        field.decompose()

    # Cleanup element styles on everything find_all() returns; the class
    # assigned to ``data`` itself above is meant to survive this pass.
    for elem in data.find_all():
        del elem['class']
        del elem['id']
|
||
|
||
# Fancy User-Agent string | ||
class MyOpener(FancyURLopener): | ||
version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4' | ||
myopener = MyOpener() | ||
|
||
for pagenum in range(1, 4653): | ||
# Prepare report file | ||
report = open('zakupki.html', 'w') | ||
report.write(open('templates/header.tmpl').read()) | ||
|
||
# Process all pages | ||
for pagenum in range(1, 3): | ||
|
||
# fetch page | ||
print "Fetching page num " + str(pagenum) + "...", | ||
# Get page handler | ||
print 'Fetching page ' + str(pagenum) + '...', | ||
sys.stdout.flush() | ||
f = myopener.open("http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index=" | ||
f = myopener.open('http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index=' | ||
+ str(pagenum) | ||
+ "&sortField=lastEventDate&descending=true&tabName=FO&lotView=false"); | ||
html = f.read() | ||
f.close() | ||
+ '&sortField=lastEventDate&descending=true&tabName=AP&lotView=false&pageX=&pageY='); | ||
|
||
# Parse page contents | ||
doc = BeautifulSoup(f) | ||
|
||
# parse output | ||
doc = BeautifulSoup(''.join(html)) | ||
# Get table with data | ||
table = doc.find('table', { 'class' : 'searchResultTable iceDatTbl' }) | ||
|
||
# get table | ||
table = doc.find('table', { "class" : "searchResultTable iceDatTbl" }) | ||
# Data cleanup | ||
data_cleanup(table) | ||
|
||
# write to file | ||
report = open('zakupki.html', 'a') | ||
report.write(table.prettify()) | ||
# Append report to file | ||
report.write(table.prettify().encode('utf-8')) | ||
|
||
print "Done!" | ||
print 'Done!' | ||
|
||
# random pause | ||
pause = random.randint(1, 10) | ||
print "Sleeping for " + str(pause) + " seconds...", | ||
# Random pause to confuse checking tools | ||
pause = random.randint(1, 5) | ||
print 'Sleeping for ' + str(pause) + ' seconds...', | ||
sys.stdout.flush() | ||
time.sleep(pause) | ||
print "Done!" | ||
#time.sleep(pause) | ||
print 'OK' | ||
|
||
# Add footer and write | ||
report.write(open('templates/footer.tmpl').read()) | ||
report.close() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
</div> | ||
</div> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
<!DOCTYPE html> | ||
<html> | ||
<head> | ||
<title>zakupki.gov.ru Data</title> | ||
<link href="http://twitter.github.com/bootstrap/assets/css/bootstrap.css" media="all" rel="stylesheet" type="text/css" /> | ||
<meta charset="utf-8"> | ||
</head> | ||
<body> |