Skip to content

Commit

Permalink
Dev/issue 41 (#42)
Browse files Browse the repository at this point in the history
* replace logging.warn (deprecated) with logging.warning

* cosmetic changes/PEP-8

* Fix #41

* retain backwards compatibility

* retain backwards compatibility (for 11 more days lol)

* increment version number

* SSOT (single source of truth) for version
  • Loading branch information
nclarkjudd committed Dec 20, 2019
1 parent ad6b222 commit e71392b
Show file tree
Hide file tree
Showing 5 changed files with 46 additions and 31 deletions.
5 changes: 4 additions & 1 deletion congressionalrecord/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
__version__ = '2.0.1'
import pkg_resources # part of setuptools
VERSION = pkg_resources.require("congressionalrecord")[0].version

__version__ = VERSION
8 changes: 5 additions & 3 deletions congressionalrecord/govinfo/cr_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import logging
import itertools


class ParseCRDir(object):

def gen_dir_metadata(self):
Expand All @@ -25,7 +26,8 @@ def __init__(self, abspath, **kwargs):
self.mods_path = os.path.join(self.cr_dir,'mods.xml')
self.html_path = os.path.join(self.cr_dir,'html')
self.gen_dir_metadata()



class ParseCRFile(object):
# Some regex
re_time = r'^CREC-(?P<year>[0-9]{4})-(?P<month>[0-9]{2})-(?P<day>[0-9]{2})-.*'
Expand Down Expand Up @@ -204,7 +206,7 @@ def gen_file_metadata(self):
if matchobj:
self.doc_title, self.cr_vol, self.cr_num = matchobj.group('title','vol','num')
else:
logging.warn('{0} yields no title, vol, num'.format(
logging.warning('{0} yields no title, vol, num'.format(
self.access_path))
self.doc_title, self.cr_vol, self.cr_num = \
'None','Unknown','Unknown'
Expand Down Expand Up @@ -343,7 +345,7 @@ def write_page(self):
itemno += 1
the_content.append(item)
except Exception as e:
logging.warn('{0}'.format(e))
logging.warning('{0}'.format(e))
break

self.crdoc['content'] = the_content
Expand Down
49 changes: 29 additions & 20 deletions congressionalrecord/govinfo/downloader.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
from __future__ import absolute_import
import os
import json
import logging
import pkg_resources # part of setuptools
import certifi
import urllib3.contrib.pyopenssl
#import requests
from builtins import str
from builtins import object
import certifi
import urllib3.contrib.pyopenssl
urllib3.contrib.pyopenssl.inject_into_urllib3()
from urllib3 import PoolManager, Retry, Timeout
import os
from datetime import datetime, date, timedelta
from io import BytesIO
from time import sleep
from zipfile import ZipFile
from zipfile import ZipFile, BadZipfile
from .cr_parser import ParseCRDir, ParseCRFile
import json
from pyelasticsearch import ElasticSearch, bulk_chunks
import logging

urllib3.contrib.pyopenssl.inject_into_urllib3()
VERSION = pkg_resources.require("congressionalrecord")[0].version


class Downloader(object):
Expand Down Expand Up @@ -59,7 +63,6 @@ def bulkdownload(self, start, parse=True, **kwargs):
logging.warning('Unexpected condition in bulkdownloader')
day += timedelta(days=1)


def __init__(self, start, **kwargs):
"""
Invoke a Downloader object to get data from
Expand Down Expand Up @@ -140,12 +143,10 @@ def __init__(self, start, **kwargs):
return None





class downloadRequest(object):

user_agent = {'user-agent': 'congressional-record 0.0.1 (https://github.com/unitedstates/congressional-record)'}
user_agent = {'user-agent':
'congressional-record {} (https://github.com/unitedstates/congressional-record)'.format(VERSION)}
its_today = datetime.strftime(datetime.today(), '%Y-%m-%d %H:%M')
timeout = Timeout(connect=2.0, read=10.0)
retry = Retry(total=3, backoff_factor=300)
Expand All @@ -162,19 +163,26 @@ def __init__(self, url, filename):
r = self.http.request('GET', url)
logging.debug('Request headers received with code {}'.format(r.status))
if r.status == 404:
logging.warn('Received 404, not retrying request.')
logging.warning('Received 404, not retrying request.')
self.status = 404
elif r.status == 200 and r.data:
logging.info('Considering request successful.')
self.binary_content = r.data
self.status = True
logging.info('Considering download request successful.')
logging.info('Sniff sniff: Does this smell like a ZIP file?')
with BytesIO(r.data) as thepackage:
try:
isazip = ZipFile(thepackage)
self.binary_content = r.data
self.status = True
except BadZipfile:
logging.warning('File {} is not a valid ZIP file (BadZipFile)'.format(url))
self.status = False
else:
logging.warn('Unexpected condition, not continuing:\
logging.warning('Unexpected condition, not continuing:\
{}'.format(r.status))
except urllib3.exceptions.MaxRetryError as ce:
logging.warn('Error: %s - Aborting download' % ce)
logging.warning('Error: %s - Aborting download' % ce)
if self.status == False:
logging.warn('Failed to download file {}'.format(url))
logging.warning('Failed to download file {}'.format(url))
elif self.status == 404:
logging.info('downloadRequester skipping file that returned 404.')
elif self.binary_content:
Expand All @@ -199,7 +207,7 @@ def download_day(self, day, outpath):
the_download = downloadRequest(the_url, the_filename)
self.status = the_download.status
if self.status == False:
logging.warn("fdsysDL received report that download for {} did not complete.".format(day))
logging.warning("fdsysDL received report that download for {} did not complete.".format(day))
elif self.status == 404:
logging.warning('fdsysDL received 404 report for {}.'.format(day))
else:
Expand All @@ -213,6 +221,7 @@ def __init__(self, day, **kwargs):
self.outpath = 'output'
self.download_day(day, self.outpath)


class GovInfoExtract(object):

def __init__(self, day, **kwargs):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='congressionalrecord',
version='2.0.1',
version='2.0.2',
description='Parse the U.S. Congressional Record from GovInfo.',
url='https://github.com/unitedstates/congressional-record',
author='Nick Judd',
Expand Down
13 changes: 7 additions & 6 deletions tests/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,19 @@
import re
import logging

logging.basicConfig(filename='tests.log',level=logging.DEBUG)
logging.basicConfig(filename='tests.log', level=logging.DEBUG)


class testDownloader(unittest.TestCase):

def test_handle_404(self):
download = dl.Downloader('2015-07-19',do_mode='json')
self.assertEqual(download.status,'downloadFailure')
download = dl.Downloader('2015-07-19', do_mode='json')
self.assertEqual(download.status, 'downloadFailure')

def test_handle_existing(self):
download = dl.Downloader('2005-07-20',do_mode='json')
self.assertIn(download.status,['extractedFilesdeletedZip','existingFiles'])
self.assertIn(download.status, ['extractedFilesdeletedZip', 'existingFiles'])

def test_handle_empty(self):
download = dl.Downloader('2017-01-02',do_mode='json')
self.assertEqual(download.status,'downloadFailure')
download = dl.Downloader('2017-01-02', do_mode='json')
self.assertEqual(download.status, 'downloadFailure')

0 comments on commit e71392b

Please sign in to comment.