Dev/issue 41 (#42)

* Replace logging.warn (deprecated) with logging.warning
* Cosmetic changes/PEP-8
* Fix #41
* Retain backwards compatibility
* Retain backwards compatibility (for 11 more days lol)
* Increment version number
* SSOT (single source of truth) for version
unitedstates · Dec 20, 2019 · e71392b · e71392b
1 parent ad6b222
commit e71392b
Show file tree

Hide file tree

Showing 5 changed files with 46 additions and 31 deletions.
diff --git a/congressionalrecord/__init__.py b/congressionalrecord/__init__.py
@@ -1 +1,4 @@
-__version__ = '2.0.1'
+import pkg_resources  # part of setuptools
+VERSION = pkg_resources.require("congressionalrecord")[0].version
+
+__version__ = VERSION
diff --git a/congressionalrecord/govinfo/cr_parser.py b/congressionalrecord/govinfo/cr_parser.py
@@ -10,6 +10,7 @@
 import logging
 import itertools
 
+
 class ParseCRDir(object):
 
     def gen_dir_metadata(self):
@@ -25,7 +26,8 @@ def __init__(self, abspath, **kwargs):
         self.mods_path = os.path.join(self.cr_dir,'mods.xml')
         self.html_path = os.path.join(self.cr_dir,'html')
         self.gen_dir_metadata()
-
+
+
 class ParseCRFile(object):
     # Some regex
     re_time = r'^CREC-(?P<year>[0-9]{4})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})-.*'
@@ -204,7 +206,7 @@ def gen_file_metadata(self):
         if matchobj:
             self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num')
         else:
-            logging.warn('{0} yields no title, vol, num'.format(
+            logging.warning('{0} yields no title, vol, num'.format(
                 self.access_path))
             self.doc_title, self.cr_vol, self.cr_num = \
               'None','Unknown','Unknown'
@@ -343,7 +345,7 @@ def write_page(self):
                 itemno += 1
                 the_content.append(item)
             except Exception as e:
-                logging.warn('{0}'.format(e))
+                logging.warning('{0}'.format(e))
                 break
 
         self.crdoc['content'] = the_content

diff --git a/congressionalrecord/govinfo/downloader.py b/congressionalrecord/govinfo/downloader.py
@@ -1,19 +1,23 @@
 from __future__ import absolute_import
+import os
+import json
+import logging
+import pkg_resources  # part of setuptools
+import certifi
+import urllib3.contrib.pyopenssl
 #import requests
 from builtins import str
 from builtins import object
-import certifi
-import urllib3.contrib.pyopenssl
-urllib3.contrib.pyopenssl.inject_into_urllib3()
 from urllib3 import PoolManager, Retry, Timeout
-import os
 from datetime import datetime, date, timedelta
+from io import BytesIO
 from time import sleep
-from zipfile import ZipFile
+from zipfile import ZipFile, BadZipfile
 from .cr_parser import ParseCRDir, ParseCRFile
-import json
 from pyelasticsearch import ElasticSearch, bulk_chunks
-import logging
+
+urllib3.contrib.pyopenssl.inject_into_urllib3()
+VERSION = pkg_resources.require("congressionalrecord")[0].version
 
 
 class Downloader(object):
@@ -59,7 +63,6 @@ def bulkdownload(self, start, parse=True, **kwargs):
                 logging.warning('Unexpected condition in bulkdownloader')
             day += timedelta(days=1)
 
-
     def __init__(self, start, **kwargs):
         """
         Invoke a Downloader object to get data from
@@ -140,12 +143,10 @@ def __init__(self, start, **kwargs):
             return None
 
 
-
-
-
 class downloadRequest(object):
 
-    user_agent = {'user-agent': 'congressional-record 0.0.1 (https://github.com/unitedstates/congressional-record)'}
+    user_agent = {'user-agent':
+                  'congressional-record {} (https://github.com/unitedstates/congressional-record)'.format(VERSION)}
     its_today = datetime.strftime(datetime.today(), '%Y-%m-%d %H:%M')
     timeout = Timeout(connect=2.0, read=10.0)
     retry = Retry(total=3, backoff_factor=300)
@@ -162,19 +163,26 @@ def __init__(self, url, filename):
             r = self.http.request('GET', url)
             logging.debug('Request headers received with code {}'.format(r.status))
             if r.status == 404:
-                logging.warn('Received 404, not retrying request.')
+                logging.warning('Received 404, not retrying request.')
                 self.status = 404
             elif r.status == 200 and r.data:
-                logging.info('Considering request successful.')
-                self.binary_content = r.data
-                self.status = True
+                logging.info('Considering download request successful.')
+                logging.info('Sniff sniff: Does this smell like a ZIP file?')
+                with BytesIO(r.data) as thepackage:
+                    try:
+                        isazip = ZipFile(thepackage)
+                        self.binary_content = r.data
+                        self.status = True
+                    except BadZipfile:
+                        logging.warning('File {} is not a valid ZIP file (BadZipFile)'.format(url))
+                        self.status = False
             else:
-                logging.warn('Unexpected condition, not continuing:\
+                logging.warning('Unexpected condition, not continuing:\
                 {}'.format(r.status))
         except urllib3.exceptions.MaxRetryError as ce:
-            logging.warn('Error: %s - Aborting download' % ce)
+            logging.warning('Error: %s - Aborting download' % ce)
         if self.status == False:
-            logging.warn('Failed to download file {}'.format(url))
+            logging.warning('Failed to download file {}'.format(url))
         elif self.status == 404:
             logging.info('downloadRequester skipping file that returned 404.')
         elif self.binary_content:
@@ -199,7 +207,7 @@ def download_day(self, day, outpath):
         the_download = downloadRequest(the_url, the_filename)
         self.status = the_download.status
         if self.status == False:
-            logging.warn("fdsysDL received report that download for {} did not complete.".format(day))
+            logging.warning("fdsysDL received report that download for {} did not complete.".format(day))
         elif self.status == 404:
             logging.warning('fdsysDL received 404 report for {}.'.format(day))
         else:
@@ -213,6 +221,7 @@ def __init__(self, day, **kwargs):
             self.outpath = 'output'
         self.download_day(day, self.outpath)
 
+
 class GovInfoExtract(object):
 
     def __init__(self, day, **kwargs):

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='congressionalrecord',
-    version='2.0.1',
+    version='2.0.2',
     description='Parse the U.S. Congressional Record from GovInfo.',
     url='https://github.com/unitedstates/congressional-record',
     author='Nick Judd',

diff --git a/tests/test_downloader.py b/tests/test_downloader.py
@@ -8,18 +8,19 @@
 import re
 import logging
 
-logging.basicConfig(filename='tests.log',level=logging.DEBUG)
+logging.basicConfig(filename='tests.log', level=logging.DEBUG)
+
 
 class testDownloader(unittest.TestCase):
 
     def test_handle_404(self):
-        download = dl.Downloader('2015-07-19',do_mode='json')
-        self.assertEqual(download.status,'downloadFailure')
+        download = dl.Downloader('2015-07-19', do_mode='json')
+        self.assertEqual(download.status, 'downloadFailure')
 
     def test_handle_existing(self):
         download = dl.Downloader('2005-07-20',do_mode='json')
-        self.assertIn(download.status,['extractedFilesdeletedZip','existingFiles'])
+        self.assertIn(download.status, ['extractedFilesdeletedZip', 'existingFiles'])
 
     def test_handle_empty(self):
-        download = dl.Downloader('2017-01-02',do_mode='json')
-        self.assertEqual(download.status,'downloadFailure')
+        download = dl.Downloader('2017-01-02', do_mode='json')
+        self.assertEqual(download.status, 'downloadFailure')