Commit
Last working cleanup approach
Vasily Ponomarev committed Oct 5, 2012
1 parent 1e95e03 commit ea5c84c
Showing 5 changed files with 106 additions and 25 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
# Ignore vim TMP files
*.swp

# Ignore output file
zakupki.html
17 changes: 17 additions & 0 deletions README.md
@@ -0,0 +1,17 @@
grab-zakupki-gov
================

grab-zakupki-gov (gzg) scrapes the http://zakupki.gov.ru website and collects information about government purchases. Currently it works for Moscow only.

Details
-------

gzg bypasses the website's built-in 500-entry export limit and can process as many entries as the website provides (a minimal sketch of the paging approach follows this README).

The output HTML file can be opened in any web browser and searched for current and past purchasing tenders. This is very useful for building government purchasing history trends and finding patterns.

Contacts
========
http://vasil-y.com

https://github.com/vasily-ponomarev
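
The bypass described above is just paging: rather than asking the site for a single export (capped at 500 entries), the script requests the search-result pages one index at a time and appends each page's result table to one local HTML file. Below is a minimal Python 3 sketch of that idea; the endpoint, query parameters, and table class are taken from grab-zakupki-gov.py as of this commit (they may no longer work against the live site), while the requests library and the two-page range are assumptions for illustration only.

    import requests
    from bs4 import BeautifulSoup

    # Search-result endpoint and parameters as used by grab-zakupki-gov.py (2012-era URL).
    URL = ('http://zakupki.gov.ru/pgz/public/action/search/region/result'
           '?rfSubjects=5277335&index={page}'
           '&sortField=lastEventDate&descending=true&tabName=AP&lotView=false')

    with open('zakupki.html', 'w', encoding='utf-8') as report:
        for page in range(1, 3):  # page range chosen for the example
            html = requests.get(URL.format(page=page)).text
            doc = BeautifulSoup(html, 'html.parser')
            table = doc.find('table', {'class': 'searchResultTable iceDatTbl'})
            if table is not None:  # skip pages where the result table is missing
                report.write(table.prettify())
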
97 changes: 72 additions & 25 deletions grab-zakupki-gov.py
@@ -1,43 +1,90 @@
-#!/usr/bin/python
+#!/usr/bin/env python

 import time
 import random
 import sys
 from urllib import FancyURLopener
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup

-# fancy User-Agent string
-class MyOpener(FancyURLopener):
-    version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.215 Safari/535.1'
+def data_cleanup(data):
+
+    table['class'] = 'table table-stripped table-bordered table-hover'
+
+    # Delete all images
+    for img in data.find_all('img'):
+        img.decompose()
+
+    # Delete all JS
+    for js in data.find_all('script'):
+        js.decompose()
+
+    # Unwrap all spans
+    for js in data.find_all('span'):
+        js.unwrap()
+
+    # Links cleanup and add absolute paths
+    for a in data.find_all('a'):
+        if a.get('href') == 'javascript:' or a.get('href') is None:
+            a.unwrap()
+        else:
+            a['href'] = 'http://zakupki.gov.ru' + str(a.get('href'))
+            del a['class']
+            del a['style']
+            del a['align']
+            del a['onclick']
+            del a['onmouseout']
+            del a['onmouseover']
+
+    # Delete all inputs
+    for input in data.find_all('input'):
+        input.decompose()
+
+    # Cleanup element styles
+    for elem in data.find_all():
+        del elem['class']
+        del elem['id']
+
+
+# Fancy User-Agent string
+class MyOpener(FancyURLopener):
+    version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4'
 myopener = MyOpener()

-for pagenum in range(1, 4653):
+# Prepare report file
+report = open('zakupki.html', 'w')
+report.write(open('templates/header.tmpl').read())
+
+# Process all pages
+for pagenum in range(1, 3):

-    # fetch page
-    print "Fetching page num " + str(pagenum) + "...",
+    # Get page handler
+    print 'Fetching page ' + str(pagenum) + '...',
     sys.stdout.flush()
-    f = myopener.open("http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index="
+    f = myopener.open('http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index='
         + str(pagenum)
-        + "&sortField=lastEventDate&descending=true&tabName=FO&lotView=false");
-    html = f.read()
-    f.close()
+        + '&sortField=lastEventDate&descending=true&tabName=AP&lotView=false&pageX=&pageY=');

-    # parse output
-    doc = BeautifulSoup(''.join(html))
+    # Parse page contents
+    doc = BeautifulSoup(f)

-    # get table
-    table = doc.find('table', { "class" : "searchResultTable iceDatTbl" })
+    # Get table with data
+    table = doc.find('table', { 'class' : 'searchResultTable iceDatTbl' })
+
+    # Data cleanup
+    data_cleanup(table)

-    # write to file
-    report = open('zakupki.html', 'a')
-    report.write(table.prettify())
+    # Append report to file
+    report.write(table.prettify().encode('utf-8'))

-    print "Done!"
+    print 'Done!'

-    # random pause
-    pause = random.randint(1, 10)
-    print "Sleeping for " + str(pause) + " seconds...",
+    # Random pause to confuse checking tools
+    pause = random.randint(1, 5)
+    print 'Sleeping for ' + str(pause) + ' seconds...',
     sys.stdout.flush()
-    time.sleep(pause)
-    print "Done!"
+    #time.sleep(pause)
+    print 'OK'
+
+# Add footer and write
+report.write(open('templates/footer.tmpl').read())
+report.close()
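
One detail worth flagging in data_cleanup above: its first statement assigns to table, a module-level name that is only defined because the main loop happens to pass that same global object in as data, and the Bootstrap class it sets is spelled table-stripped where Bootstrap's actual class is table-striped. A small corrected sketch of the function is below, assuming the intent was to restyle the tag passed in; the link-rewriting step from the commit is left out for brevity.

    def data_cleanup(data):
        # Restyle the tag that was passed in, not the module-level 'table'.
        data['class'] = 'table table-striped table-bordered table-hover'

        # Drop images, scripts and form inputs outright; unwrap spans.
        for tag in data.find_all(['img', 'script', 'input']):
            tag.decompose()
        for span in data.find_all('span'):
            span.unwrap()

        # Strip classes and ids from everything that remains, as the commit does.
        for elem in data.find_all():
            del elem['class']
            del elem['id']

Passing a list of tag names to find_all is a standard bs4 call; the rest mirrors the commit's logic.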
4 changes: 4 additions & 0 deletions templates/footer.tmpl
@@ -0,0 +1,4 @@
</div>
</div>
</body>
</html>
8 changes: 8 additions & 0 deletions templates/header.tmpl
@@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<head>
<title>zakupki.gov.ru Data</title>
<link href="http://twitter.github.com/bootstrap/assets/css/bootstrap.css" media="all" rel="stylesheet" type="text/css" />
<meta charset="utf-8">
</head>
<body>
