Permalink
Browse files

parser

  • Loading branch information...
tlevine committed Jun 16, 2013
1 parent 6f53757 commit 9edbc5a16dc74056f283eedd442feffbb92ef8cb
Showing with 59 additions and 6 deletions.
  1. +0 −5 reader/src/finalip-parse.py
  2. +3 −1 reader/src/finalip_lib.py
  3. +31 −0 reader/src/finalip_parse.py
  4. +25 −0 reader/src/test_finalip_lib.py
@@ -1,5 +0,0 @@
-#!/usr/bin/env python2
-
-trs = html.xpath('//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr')
-
-data = map(parse_row, trs[2:])
@@ -1,10 +1,12 @@
#!/usr/bin/env python2
+from collections import OrderedDict
def parse_row(tr):
    """Convert one result-table ``<tr>`` element into an ordered row.

    Every ``<td>`` child whose ``headers`` attribute is not ``"Map"``
    contributes one entry keyed by that attribute, in document order.
    The ``Map`` entry is added last and holds the ``href`` of the link
    inside the Map cell.

    The two date columns (``Public Notice Date`` and
    ``Date Issued\\Denied``) are converted from ``DD-MON-YYYY`` text
    (for example ``05-OCT-2009``) to ``datetime.date``, which is what
    the unit test for this function expects.  A date cell that does not
    match that format is left as its original text instead of raising.

    :param tr: an lxml element for a table row.
    :returns: ``OrderedDict`` mapping unicode header names to values.
    :raises IndexError: if the row has no link in its Map cell.
    """
    # Local import so this function does not depend on the module's
    # import block being updated alongside it.
    import datetime

    date_columns = (u'Public Notice Date', u'Date Issued\\Denied')

    row = OrderedDict()
    for td in tr.xpath('td[@headers!="Map"]'):
        header = unicode(td.xpath('@headers')[0])
        value = unicode(td.text_content())
        if header in date_columns:
            try:
                value = datetime.datetime.strptime(value.strip(), '%d-%b-%Y').date()
            except ValueError:
                # Blank or unexpected cell: keep the raw text.
                pass
        row[header] = value

    row[u'Map'] = unicode(tr.xpath('descendant::td[@headers="Map"]/a/@href')[0])
    return row
+
def apex_submit(meta_session, p_t03, p_t04):
session, response, html = meta_session
url = 'http://geo.usace.army.mil/egis/wwv_flow.accept'
@@ -0,0 +1,31 @@
+#!/usr/bin/env python2
+import os
+
+from lxml.html import parse
+from dumptruck import DumpTruck
+
+import finalip_lib as l
+
def read_finalip(path):
    """Parse one saved result page and return its data rows.

    The page at ``path`` is parsed with lxml, every ``<tr>`` beneath the
    full-width results table is collected, and the first two of those
    are skipped.  Each remaining ``<tr>`` is handed to ``l.parse_row``.

    :param path: location of the HTML page, as accepted by
        ``lxml.html.parse``.
    :returns: list with one parsed row per remaining table row.
    """
    document = parse(path)
    table_rows = document.xpath('//table[@style="border-collapse: collapse; width: 100%;"]/descendant::tr')
    return [l.parse_row(tr) for tr in table_rows[2:]]
+
# Schema
# The sample row seeds the 'finalip' table with its key column, which is
# then given a unique index.
dt = DumpTruck(dbname = '/tmp/finalip.db')
dt.create_table({u'DA Number': u'NAE-2009-01067'}, 'finalip', if_not_exists = True)
dt.create_index(['DA Number'], 'finalip', unique = True, if_not_exists = True)

# Populate
# Walk $READER_ROOT/../finalips and load every file found in a leaf
# directory.  The last two path components of a leaf directory are read
# as integers and stored as the rows' Year and Month.
finalips_root = os.path.join(os.environ['READER_ROOT'], '..', 'finalips')
for dirname, subdirnames, filenames in os.walk(finalips_root):
    # Only leaf directories hold pages; empty ones have nothing to load
    # (and their names need not be numeric).
    if subdirnames or not filenames:
        continue

    year, month = map(int, dirname.split('/')[-2:])
    for filename in filenames:
        data = read_finalip(os.path.join(dirname, filename))
        for row in data:
            row['Year'] = year
            row['Month'] = month
            row['Page'] = filename
        dt.upsert(data, 'finalip')
+
@@ -0,0 +1,25 @@
+from collections import OrderedDict
+import datetime
+
+from lxml.html import fromstring
+import nose.tools as n
+
+import finalip_lib as l
+
def test_parse_row():
    """parse_row turns a sample result row into the expected ordered mapping.

    The key order must match the cell order with ``Map`` last, and each
    value must equal its expected counterpart.  The two date columns are
    expected as ``datetime.date`` values, not as the cell's
    ``DD-MON-YYYY`` text.
    """
    row_element = fromstring('''<tr class="ui-widget-content jqgrow ui-row-ltr"><td headers="District">New England</td><td headers="DA Number">NAE-2009-01067</td><td headers="Applicant">Joseph Sullivan-Tri-Town Board of Water Com</td><td headers="Project Name">Tri-Town Board of Water Com / Braintree & Randolph, MA</td><td headers="Permit Type">Standard Permit</td><td headers="Public Notice Date">05-OCT-2009</td><td headers="Action Taken">Issued With Special Conditions</td><td headers="Date Issued\Denied">03-FEB-2010</td><td align="center" headers="Map"><a href = "f?p=340:7:283770440941901::NO::P7_PROJECT_ID:4431690"><img src = "wwv_flow_file_mgr.get_file?p_security_group_id=1211711255363293&p_fname=map_icon.gif" border="0" alt="View on Map"></a></td></tr>''')

    expected_pairs = [
        (u'District', u'New England'),
        (u'DA Number', u'NAE-2009-01067'),
        (u'Applicant', u'Joseph Sullivan-Tri-Town Board of Water Com'),
        (u'Project Name', u'Tri-Town Board of Water Com / Braintree & Randolph, MA'),
        (u'Permit Type', u'Standard Permit'),
        (u'Public Notice Date', datetime.date(2009, 10, 5)),
        (u'Action Taken', u'Issued With Special Conditions'),
        (u'Date Issued\\Denied', datetime.date(2010, 2, 3)),
        (u'Map', u'f?p=340:7:283770440941901::NO::P7_PROJECT_ID:4431690'),
    ]
    expected = OrderedDict(expected_pairs)
    observed = l.parse_row(row_element)

    # Check the column order first, then compare value by value so a
    # failure names the offending column's values.
    n.assert_list_equal(observed.keys(), expected.keys())
    for key in observed:
        n.assert_equal(observed[key], expected[key])

0 comments on commit 9edbc5a

Please sign in to comment.