Skip to content
Browse files

Last working cleanup approach

  • Loading branch information...
1 parent 1e95e03 commit ea5c84c3c95e23c7f779ffb074798815e489aa1a Vasily Ponomarev committed
Showing with 106 additions and 25 deletions.
  1. +5 −0 .gitignore
  2. +17 −0 README.md
  3. +72 −25 grab-zakupki-gov.py
  4. +4 −0 templates/footer.tmpl
  5. +8 −0 templates/header.tmpl
View
5 .gitignore
@@ -0,0 +1,5 @@
+# Ignore vim TMP files
+*.swp
+
+# Ignore output file
+zakupki.html
View
17 README.md
@@ -0,0 +1,17 @@
+grab-zakupki-gov
+================
+
+grab-zakupki-gov (gzg) processes the http://zakupki.gov.ru website and collects information about government purchases. Currently it works for Moscow only.
+
+Details
+-------
+
+gzg bypasses the website's built-in limit of 500 exported entries, and can process as many entries as the website provides.
+
+The output HTML file can be opened in any web browser and searched for current and past purchasing tenders. Very useful for building government purchasing history trends and finding patterns.
+
+Contacts
+========
+http://vasil-y.com
+
+https://github.com/vasily-ponomarev
View
97 grab-zakupki-gov.py
@@ -1,43 +1,90 @@
-#!/usr/bin/python
+#!/usr/bin/env python
import time
import random
import sys
from urllib import FancyURLopener
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup
-# fancy User-Agent string
-class MyOpener(FancyURLopener):
- version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.215 Safari/535.1'
+def data_cleanup(data):
+
+ table['class'] = 'table table-stripped table-bordered table-hover'
+
+ # Delete all images
+ for img in data.find_all('img'):
+ img.decompose()
+
+ # Delete all JS
+ for js in data.find_all('script'):
+ js.decompose()
+
+ # Unwrap all spans
+ for js in data.find_all('span'):
+ js.unwrap()
+
+ # Links cleanup and add absolute paths
+ for a in data.find_all('a'):
+ if a.get('href') == 'javascript:' or a.get('href') is None:
+ a.unwrap()
+ else:
+ a['href'] = 'http://zakupki.gov.ru' + str(a.get('href'))
+ del a['class']
+ del a['style']
+ del a['align']
+ del a['onclick']
+ del a['onmouseout']
+ del a['onmouseover']
+
+ # Delete all inputs
+ for input in data.find_all('input'):
+ input.decompose()
+ # Cleanup element styles
+ for elem in data.find_all():
+ del elem['class']
+ del elem['id']
+
+
+# Fancy User-Agent string
+class MyOpener(FancyURLopener):
+ version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4'
myopener = MyOpener()
-for pagenum in range(1, 4653):
+# Prepare report file
+report = open('zakupki.html', 'w')
+report.write(open('templates/header.tmpl').read())
+
+# Process all pages
+for pagenum in range(1, 3):
- # fetch page
- print "Fetching page num " + str(pagenum) + "...",
+ # Get page handler
+ print 'Fetching page ' + str(pagenum) + '...',
sys.stdout.flush()
- f = myopener.open("http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index="
+ f = myopener.open('http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index='
+ str(pagenum)
- + "&sortField=lastEventDate&descending=true&tabName=FO&lotView=false");
- html = f.read()
- f.close()
+ + '&sortField=lastEventDate&descending=true&tabName=AP&lotView=false&pageX=&pageY=');
+
+ # Parse page contents
+ doc = BeautifulSoup(f)
- # parse output
- doc = BeautifulSoup(''.join(html))
+ # Get table with data
+ table = doc.find('table', { 'class' : 'searchResultTable iceDatTbl' })
- # get table
- table = doc.find('table', { "class" : "searchResultTable iceDatTbl" })
+ # Data cleanup
+ data_cleanup(table)
- # write to file
- report = open('zakupki.html', 'a')
- report.write(table.prettify())
+ # Append report to file
+ report.write(table.prettify().encode('utf-8'))
- print "Done!"
+ print 'Done!'
- # random pause
- pause = random.randint(1, 10)
- print "Sleeping for " + str(pause) + " seconds...",
+ # Random pause to confuse checking tools
+ pause = random.randint(1, 5)
+ print 'Sleeping for ' + str(pause) + ' seconds...',
sys.stdout.flush()
- time.sleep(pause)
- print "Done!"
+ #time.sleep(pause)
+ print 'OK'
+
+# Add footer and write
+report.write(open('templates/footer.tmpl').read())
+report.close()
View
4 templates/footer.tmpl
@@ -0,0 +1,4 @@
+ </div>
+ </div>
+ </body>
+</html>
View
8 templates/header.tmpl
@@ -0,0 +1,8 @@
+<!DOCTYPE html>
+<html>
+ <head>
+ <title>zakupki.gov.ru Data</title>
+ <link href="http://twitter.github.com/bootstrap/assets/css/bootstrap.css" media="all" rel="stylesheet" type="text/css" />
+ <meta charset="utf-8">
+ </head>
+ <body>

0 comments on commit ea5c84c

Please sign in to comment.
Something went wrong with that request. Please try again.