Commit
Last working cleanup approach
Vasily Ponomarev committed Oct 5, 2012
1 parent 1e95e03 commit ea5c84c
Showing 5 changed files with 106 additions and 25 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -0,0 +1,5 @@
# Ignore vim TMP files
*.swp

# Ignore output file
zakupki.html
17 changes: 17 additions & 0 deletions README.md
@@ -0,0 +1,17 @@
grab-zakupki-gov
================

grab-zakupki-gov (gzg) scrapes the http://zakupki.gov.ru website and collects information about government purchases. Currently it works for Moscow only.

Details
-------

gzg bypasses the website's built-in 500-entry export limit and can process as many entries as the website provides (a minimal sketch of the paging approach follows this README).

The output HTML file can be opened in any web browser and searched for current and past purchasing tenders. This is very useful for building government purchasing history trends and finding patterns.

Contacts
========
http://vasil-y.com

https://github.com/vasily-ponomarev
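
The bypass described above is just paging: rather than asking the site for a single export (capped at 500 entries), the script requests the search-result pages one index at a time and appends each page's result table to one local HTML file. Below is a minimal Python 3 sketch of that idea; the endpoint, query parameters, and table class are taken from grab-zakupki-gov.py as of this commit (they may no longer work against the live site), while the requests library and the two-page range are assumptions for illustration only.

    import requests
    from bs4 import BeautifulSoup

    # Search-result endpoint and parameters as used by grab-zakupki-gov.py (2012-era URL).
    URL = ('http://zakupki.gov.ru/pgz/public/action/search/region/result'
           '?rfSubjects=5277335&index={page}'
           '&sortField=lastEventDate&descending=true&tabName=AP&lotView=false')

    with open('zakupki.html', 'w', encoding='utf-8') as report:
        for page in range(1, 3):  # page range chosen for the example
            html = requests.get(URL.format(page=page)).text
            doc = BeautifulSoup(html, 'html.parser')
            table = doc.find('table', {'class': 'searchResultTable iceDatTbl'})
            if table is not None:  # skip pages where the result table is missing
                report.write(table.prettify())
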
97 changes: 72 additions & 25 deletions grab-zakupki-gov.py
@@ -1,43 +1,90 @@
-#!/usr/bin/python
+#!/usr/bin/env python

 import time
 import random
 import sys
 from urllib import FancyURLopener
-from BeautifulSoup import BeautifulSoup
+from bs4 import BeautifulSoup

-# fancy User-Agent string
-class MyOpener(FancyURLopener):
-    version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.215 Safari/535.1'
+def data_cleanup(data):
+
+    table['class'] = 'table table-stripped table-bordered table-hover'
+
+    # Delete all images
+    for img in data.find_all('img'):
+        img.decompose()
+
+    # Delete all JS
+    for js in data.find_all('script'):
+        js.decompose()
+
+    # Unwrap all spans
+    for js in data.find_all('span'):
+        js.unwrap()
+
+    # Links cleanup and add absolute paths
+    for a in data.find_all('a'):
+        if a.get('href') == 'javascript:' or a.get('href') is None:
+            a.unwrap()
+        else:
+            a['href'] = 'http://zakupki.gov.ru' + str(a.get('href'))
+            del a['class']
+            del a['style']
+            del a['align']
+            del a['onclick']
+            del a['onmouseout']
+            del a['onmouseover']
+
+    # Delete all inputs
+    for input in data.find_all('input'):
+        input.decompose()
+
+    # Cleanup element styles
+    for elem in data.find_all():
+        del elem['class']
+        del elem['id']
+
+
+# Fancy User-Agent string
+class MyOpener(FancyURLopener):
+    version = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.4 (KHTML, like Gecko) Chrome/22.0.1229.79 Safari/537.4'
 myopener = MyOpener()

-for pagenum in range(1, 4653):
+# Prepare report file
+report = open('zakupki.html', 'w')
+report.write(open('templates/header.tmpl').read())
+
+# Process all pages
+for pagenum in range(1, 3):

-    # fetch page
-    print "Fetching page num " + str(pagenum) + "...",
+    # Get page handler
+    print 'Fetching page ' + str(pagenum) + '...',
     sys.stdout.flush()
-    f = myopener.open("http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index="
+    f = myopener.open('http://zakupki.gov.ru/pgz/public/action/search/region/result?rfSubjects=5277335&index='
         + str(pagenum)
-        + "&sortField=lastEventDate&descending=true&tabName=FO&lotView=false");
-    html = f.read()
-    f.close()
+        + '&sortField=lastEventDate&descending=true&tabName=AP&lotView=false&pageX=&pageY=');

-    # parse output
-    doc = BeautifulSoup(''.join(html))
+    # Parse page contents
+    doc = BeautifulSoup(f)

-    # get table
-    table = doc.find('table', { "class" : "searchResultTable iceDatTbl" })
+    # Get table with data
+    table = doc.find('table', { 'class' : 'searchResultTable iceDatTbl' })
+
+    # Data cleanup
+    data_cleanup(table)

-    # write to file
-    report = open('zakupki.html', 'a')
-    report.write(table.prettify())
+    # Append report to file
+    report.write(table.prettify().encode('utf-8'))

-    print "Done!"
+    print 'Done!'

-    # random pause
-    pause = random.randint(1, 10)
-    print "Sleeping for " + str(pause) + " seconds...",
+    # Random pause to confuse checking tools
+    pause = random.randint(1, 5)
+    print 'Sleeping for ' + str(pause) + ' seconds...',
     sys.stdout.flush()
-    time.sleep(pause)
-    print "Done!"
+    #time.sleep(pause)
+    print 'OK'
+
+# Add footer and write
+report.write(open('templates/footer.tmpl').read())
+report.close()
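
One detail worth flagging in data_cleanup above: its first statement assigns to table, a module-level name that is only defined because the main loop happens to pass that same global object in as data, and the Bootstrap class it sets is spelled table-stripped where Bootstrap's actual class is table-striped. A small corrected sketch of the function is below, assuming the intent was to restyle the tag passed in; the link-rewriting step from the commit is left out for brevity.

    def data_cleanup(data):
        # Restyle the tag that was passed in, not the module-level 'table'.
        data['class'] = 'table table-striped table-bordered table-hover'

        # Drop images, scripts and form inputs outright; unwrap spans.
        for tag in data.find_all(['img', 'script', 'input']):
            tag.decompose()
        for span in data.find_all('span'):
            span.unwrap()

        # Strip classes and ids from everything that remains, as the commit does.
        for elem in data.find_all():
            del elem['class']
            del elem['id']

Passing a list of tag names to find_all is a standard bs4 call; the rest mirrors the commit's logic.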
4 changes: 4 additions & 0 deletions templates/footer.tmpl
@@ -0,0 +1,4 @@
</div>
</div>
</body>
</html>
8 changes: 8 additions & 0 deletions templates/header.tmpl
@@ -0,0 +1,8 @@
<!DOCTYPE html>
<html>
<head>
<title>zakupki.gov.ru Data</title>
<link href="http://twitter.github.com/bootstrap/assets/css/bootstrap.css" media="all" rel="stylesheet" type="text/css" />
<meta charset="utf-8">
</head>
<body>
