diff --git a/.gitignore b/.gitignore index 17fa8507..24068b18 100644 --- a/.gitignore +++ b/.gitignore @@ -68,3 +68,4 @@ target/ /legislators-current.yaml /cache/ /congress-legislators +congress/metadata/A000000.yaml diff --git a/scripts/gpo_member_photos.py b/scripts/gpo_member_photos.py index 22f58463..e426f59f 100755 --- a/scripts/gpo_member_photos.py +++ b/scripts/gpo_member_photos.py @@ -15,26 +15,24 @@ try: # Python 3 from urllib.error import HTTPError - from urllib.parse import parse_qs - from urllib.parse import urlparse from urllib.parse import urlencode from urllib.request import urlretrieve except ImportError: # Python 2 from urllib import urlretrieve from urllib2 import HTTPError - from urlparse import parse_qs - from urlparse import urlparse from urllib import urlencode # pip install -r requirements.txt import mechanicalsoup -import yaml -# Windows cmd.exe cannot do Unicode so encode first -def print_it(text): - print(text.encode('utf-8')) +regex1 = re.compile( + '

[^<]+

\s*
' + '') def pause(last, delay): @@ -61,24 +59,29 @@ def get_photo_list(br, congress_number, delay): response = br.get( "https://www.congress.gov/search?" + urlencode({ - "q": json.dumps({ "source": "members", "congress": str(congress_number) }), + "q": json.dumps( + {"source": "members", + "congress": str(congress_number)}), "pageSize": 250, "page": page, })).text if len(response) == 0: - sys.exit("Page is blank. Try again later, you may have hit a limit.") - - # Scan for links to Member pages and img tags. The link to the Congress.gov - # page uses the Member's Bioguide ID as the key, and the filename for the - # photo is the same file name found at memberguide.gpo.gov for the high - # resolution file. - for bioguide_id, photo_file in re.findall("""

[^<]+

\s*
""", response) + m = regex2.search(response) if m: # fetch next page of results page += 1 @@ -111,6 +114,7 @@ def download_file(url, outfile): os.unlink(fn) raise HTTPError() + def download_photos(br, photo_list, outdir, delay): last_request_time = None @@ -120,7 +124,8 @@ def download_photos(br, photo_list, outdir, delay): ok = 0 for bioguide_id, photo_filename in photo_list: - photo_url = "http://www.memberguide.gpo.gov/PictorialImages/" + photo_filename + photo_url = ("http://www.memberguide.gpo.gov/PictorialImages/" + + photo_filename) print(bioguide_id, photo_url) filename = os.path.join(outdir, bioguide_id + ".jpg") @@ -138,6 +143,7 @@ def download_photos(br, photo_list, outdir, delay): print("Downloaded", ok, "member photos.") + def resize_photos(): # Assumes they're congress/original/*.jpg os.system(os.path.join("scripts", "resize-photos.sh")) diff --git a/scripts/missing.py b/scripts/missing.py index ca6d1dda..adafd501 100644 --- a/scripts/missing.py +++ b/scripts/missing.py @@ -5,7 +5,33 @@ """ from __future__ import print_function import os -import gpo_member_photos + +# pip install -r requirements.txt +import yaml + + +# Make sure we have the congress-legislators repository available. +def download_legislator_data(): + # clone it if it's not out + if not os.path.exists("congress-legislators"): + print("Cloning the congress-legislators repo...") + os.system("git clone -q --depth 1 " + "https://github.com/unitedstates/congress-legislators " + "congress-legislators") + + # Update the repo so we have the latest. + print("Updating the congress-legislators repo...") + # these two == git pull, but git pull ignores -q on the merge part + # so is less quiet + os.system("cd congress-legislators; git fetch -pq; " + "git merge --ff-only -q origin/master") + + +def load_yaml(filename): + f = open(filename) + data = yaml.safe_load(f) + f.close() + return data def file_exists(filename): @@ -19,9 +45,9 @@ def file_exists(filename): if __name__ == "__main__": # clone or update legislator YAML - gpo_member_photos.download_legislator_data() + download_legislator_data() - legislators = gpo_member_photos.load_yaml( + legislators = load_yaml( "congress-legislators/legislators-current.yaml") for l in legislators: bioguide = l['id']['bioguide'] diff --git a/test/test_gpo_member_photos.py b/test/test_gpo_member_photos.py index 4d19b29a..48bbf0b6 100644 --- a/test/test_gpo_member_photos.py +++ b/test/test_gpo_member_photos.py @@ -6,6 +6,7 @@ `python test/test_gpo_member_photos.py` """ from __future__ import print_function, unicode_literals +import os import sys try: import unittest2 as unittest @@ -18,203 +19,15 @@ class TestSequenceFunctions(unittest.TestCase): - yaml_data = None + def test_save_metadata(self): + """ Test file is saved """ + bioguide_id = "A000000" + gpo_member_photos.save_metadata(bioguide_id) + self.assertTrue(os.path.exists("congress/metadata/A000000.yaml")) - def setUp(self): - if self.yaml_data is None: - self.__class__.yaml_data = gpo_member_photos.load_yaml( - "test/legislators-test.yaml") - self.assertTrue(len(self.yaml_data)) - - # Test bioguide_id_from_url() - - def test_bioguide_id_from_url__last_char_not_slash(self): - """ Test last char is not / """ - input = ("http://bioguide.congress.gov/scripts/biodisplay.pl" - "?index=S001177/") - output = gpo_member_photos.bioguide_id_from_url(input) - self.assertNotEqual(output[-1], "/") - - def test_bioguide_id_from_url__last_char_not_slash2(self): - """ Test last char is not / """ - input = ("http://bioguide.congress.gov/scripts/biodisplay.pl" - "?index=S001177") - output = gpo_member_photos.bioguide_id_from_url(input) - self.assertNotEqual(output[-1], "/") - - def test_bioguide_id_from_url__is_string(self): - """ Test output is string """ - input = ("http://bioguide.congress.gov/scripts/biodisplay.pl" - "?index=S001177/") - output = gpo_member_photos.bioguide_id_from_url(input) - self.assertIsInstance(output, str) - - def test_bioguide_id_from_url__uppercase(self): - """ Test output is string """ - input = ("http://bioguide.congress.gov/scripts/biodisplay.pl" - "?index=e000288/") - output = gpo_member_photos.bioguide_id_from_url(input) - self.assertEqual(output[0], "E") - - def test_bioguide_id_from_url_with_ltr_mark(self): - """ For some reason, some new URL links end with - Unicode Character 'LEFT-TO-RIGHT MARK' (U+200E) """ - input = ("http://bioguide.congress.gov/scripts/biodisplay.pl" - "?index=g000386" + u"\u200E" + "/") - output = gpo_member_photos.bioguide_id_from_url(input) - self.assertEqual(output, "G000386") - - # Test bioguide_id_valid() - - def test_bioguide_id_valid__none_returns_false(self): - """ Test with None """ - input = None - output = gpo_member_photos.bioguide_id_valid(input) - self.assertFalse(output) - - def test_bioguide_id_valid__returns_true(self): - """ Test with a valid ID """ - input = "K000362" - output = gpo_member_photos.bioguide_id_valid(input) - self.assertTrue(output) - - def test_bioguide_id_valid__returns_false(self): - """ Test with an invalid ID """ - input = "aK000362z" - output = gpo_member_photos.bioguide_id_valid(input) - self.assertFalse(output) - - def test_bioguide_id_valid_url__returns_false(self): - """ Test with an invalid ID, an URL """ - input = "http://young.house.gov" - output = gpo_member_photos.bioguide_id_valid(input) - self.assertFalse(output) - - def test_bioguide_id_valid_url__first_not_cap(self): - """ Test with lower case initial """ - input = "r000515" - output = gpo_member_photos.bioguide_id_valid(input) - self.assertFalse(output) - - # Test remove_from_yaml() - - def test_remove_from_yaml__success(self): - """ Test smaller after remove """ - bioguide_id = "C000127" - length_before = len(self.yaml_data) - - self.yaml_data = gpo_member_photos.remove_from_yaml(self.yaml_data, - bioguide_id) - self.assertTrue(length_before > len(self.yaml_data)) - self.assertEqual(len(self.yaml_data) + 1, length_before) - - def test_remove_from_yaml__not_found(self): - """ Test same size """ - bioguide_id = "NOT_THERE" - length_before = len(self.yaml_data) - self.yaml_data = gpo_member_photos.remove_from_yaml(self.yaml_data, - bioguide_id) - self.assertEqual(len(self.yaml_data), length_before) - - # Test reverse_names() - - def test_reverse_names(self): - """ Test reversing names """ - text = "Hagan, Kay R." - output = gpo_member_photos.reverse_names(text) - self.assertEqual(output, "Kay R. Hagan") - - # Test resolve() - - def test_resolve__exact_match_last_first(self): - """ Test resolve """ - text = "Alexander, Lamar" - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "A000360") - - def test_resolve__exact_match_last_first_middle(self): - """ Test resolve """ - text = "Amodei, Mark E." - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "A000369") - - def test_resolve__exact_match_last_nickname(self): - """ Test resolve """ - text = "Isakson, Johnny" - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "I000055") - - def test_resolve__with_accented_chars(self): - """ Test resolve """ - text = u"Velázquez, Nydia M." - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "V000081") - - def test_resolve__initial_dot_from_middle(self): - """ Test resolve """ - text = "Kirk, Mark S." - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "K000360") - - def test_resolve__initial_not_in_yaml(self): - """ Test resolve """ - text = "Ayotte, Kelly A." - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "A000368") - - def test_resolve__remove_nickname_quotes(self): - """ Test resolve """ - text = 'Barr, Garland “Andy"' - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "B001282") - - def test_resolve__quoted_nickname(self): - """ Test resolve """ - text = 'Fleischmann, Charles J. “Chuck"' - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "F000459") - - def test_resolve__missing_accents(self): - """ Test resolve """ - text = "Cardenas, Tony" - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "C001097") - - def test_resolve__partial_firstname(self): - """ Test resolve e.g. Michael to Mike """ - text = "Lee, Michael S." - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "L000577") - - def test_resolve__b001289(self): - """ Test resolve special case """ - text = "Bradley, Byrne" - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "B001289") - - def test_resolve__c001089(self): - """ Test resolve special case """ - text = "Curson, David Alan" - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "C001089") - - def test_resolve__g000535(self): - """ Test resolve special case """ - text = "Gutierrez, Luis" - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, "G000535") - - def test_resolve__empty_text(self): - """ Test resolve special case """ - text = "" - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, None) - - def test_resolve__none(self): - """ Test resolve special case """ - text = None - output = gpo_member_photos.resolve(self.yaml_data, text) - self.assertEqual(output, None) + def test_resize_photos(self): + """ Test callable """ + gpo_member_photos.resize_photos() if __name__ == '__main__': unittest.main()