Skip to content

Commit

Permalink
Merge 857d916 into 6c320fa
Browse files Browse the repository at this point in the history
  • Loading branch information
hugovk committed Nov 27, 2015
2 parents 6c320fa + 857d916 commit 426c0bd
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 219 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -68,3 +68,4 @@ target/
/legislators-current.yaml
/cache/
/congress-legislators
congress/metadata/A000000.yaml
46 changes: 26 additions & 20 deletions scripts/gpo_member_photos.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,26 +15,24 @@
try:
# Python 3
from urllib.error import HTTPError
from urllib.parse import parse_qs
from urllib.parse import urlparse
from urllib.parse import urlencode
from urllib.request import urlretrieve
except ImportError:
# Python 2
from urllib import urlretrieve
from urllib2 import HTTPError
from urlparse import parse_qs
from urlparse import urlparse
from urllib import urlencode

# pip install -r requirements.txt
import mechanicalsoup
import yaml


def print_it(text):
    """Print *text* without failing on consoles with a limited charset.

    Some consoles (notably Windows cmd.exe) cannot encode every Unicode
    character.  Characters the console cannot represent are replaced
    with ``?`` instead of raising ``UnicodeEncodeError``.

    The text is round-tripped through the console encoding rather than
    printed as raw UTF-8 bytes, because printing a ``bytes`` object on
    Python 3 emits its ``b'...'`` repr instead of the text itself.
    """
    # Redirected or replaced streams may report no encoding at all.
    encoding = getattr(sys.stdout, "encoding", None) or "utf-8"
    print(text.encode(encoding, "replace").decode(encoding))
# Captures (bioguide_id, photo_file) for each member listed on a
# Congress.gov search-results page: the Bioguide ID comes from the member
# link and the photo filename from the <img> tag that follows it.
# Raw strings keep regex escapes such as \w and \d from being parsed as
# (invalid) string escape sequences.
regex1 = re.compile(
    r'<h2><a href="https://www.congress.gov/member/[^/]+/(\w+)\?'
    r'resultIndex=\d+">[^<]+</a></h2>\s*<div class="memberImage">'
    r'<img src="/img/member/([^"]+)"')

# Captures the href of the "next page" link at the bottom of a results page.
regex2 = re.compile(r'<a class="next" id="pagebottom_next" href="([^"]+)">')


def pause(last, delay):
Expand All @@ -61,24 +59,29 @@ def get_photo_list(br, congress_number, delay):
response = br.get(
"https://www.congress.gov/search?"
+ urlencode({
"q": json.dumps({ "source": "members", "congress": str(congress_number) }),
"q": json.dumps(
{"source": "members",
"congress": str(congress_number)}),
"pageSize": 250,
"page": page,
})).text

if len(response) == 0:
sys.exit("Page is blank. Try again later, you may have hit a limit.")

# Scan for links to Member pages and img tags. The link to the Congress.gov
# page uses the Member's Bioguide ID as the key, and the filename for the
# photo is the same file name found at memberguide.gpo.gov for the high
# resolution file.
for bioguide_id, photo_file in re.findall("""<h2><a href="https://www.congress.gov/member/[^/]+/(\w+)\?resultIndex=\d+">[^<]+</a></h2>\s*<div class="memberImage"><img src="/img/member/([^"]+)\"""", response):
photo_file = photo_file.replace("_200.jpg", ".jpg") # this part is added by Congress.gov
if photo_file == bioguide_id.lower() + ".jpg": continue # not a file sourced from GPO
sys.exit("Page is blank. Try again later, you may have hit a "
"limit.")

# Scan for links to Member pages and img tags. The link to the
# Congress.gov page uses the Member's Bioguide ID as the key, and the
# filename for the photo is the same file name found at
# memberguide.gpo.gov for the high-resolution file.
for bioguide_id, photo_file in regex1.findall(response):
# this part is added by Congress.gov:
photo_file = photo_file.replace("_200.jpg", ".jpg")
if photo_file == bioguide_id.lower() + ".jpg":
continue # not a file sourced from GPO
yield (bioguide_id, photo_file)

m = re.search("""<a class="next" id="pagebottom_next" href="([^"]+)">""", response)
m = regex2.search(response)
if m:
# fetch next page of results
page += 1
Expand Down Expand Up @@ -111,6 +114,7 @@ def download_file(url, outfile):
os.unlink(fn)
raise HTTPError()


def download_photos(br, photo_list, outdir, delay):
last_request_time = None

Expand All @@ -120,7 +124,8 @@ def download_photos(br, photo_list, outdir, delay):
ok = 0

for bioguide_id, photo_filename in photo_list:
photo_url = "http://www.memberguide.gpo.gov/PictorialImages/" + photo_filename
photo_url = ("http://www.memberguide.gpo.gov/PictorialImages/" +
photo_filename)
print(bioguide_id, photo_url)

filename = os.path.join(outdir, bioguide_id + ".jpg")
Expand All @@ -138,6 +143,7 @@ def download_photos(br, photo_list, outdir, delay):

print("Downloaded", ok, "member photos.")


def resize_photos():
# Assumes they're congress/original/*.jpg
os.system(os.path.join("scripts", "resize-photos.sh"))
Expand Down
32 changes: 29 additions & 3 deletions scripts/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,33 @@
"""
from __future__ import print_function
import os
import gpo_member_photos

# pip install -r requirements.txt
import yaml


def download_legislator_data():
    """Make sure a current congress-legislators checkout is available.

    Clones the unitedstates/congress-legislators repository into
    ``congress-legislators`` (relative to the working directory) when that
    path does not exist yet, then fast-forwards it to ``origin/master``.
    The git commands are run through ``os.system`` and their exit statuses
    are not checked.
    """
    repo_dir = "congress-legislators"

    # A missing directory means the repo has not been cloned here yet.
    already_cloned = os.path.exists(repo_dir)
    if not already_cloned:
        print("Cloning the congress-legislators repo...")
        clone_command = (
            "git clone -q --depth 1 "
            "https://github.com/unitedstates/congress-legislators "
            "congress-legislators")
        os.system(clone_command)

    # Always refresh so we have the latest data.
    print("Updating the congress-legislators repo...")
    # fetch + ff-only merge == git pull, but git pull ignores -q on the
    # merge part so is less quiet.
    update_command = (
        "cd congress-legislators; git fetch -pq; "
        "git merge --ff-only -q origin/master")
    os.system(update_command)


def load_yaml(filename):
    """Parse the YAML file at *filename* and return the loaded data.

    The document is read with ``yaml.safe_load``.  The file is opened in
    a ``with`` block so the handle is closed even when reading or parsing
    raises, instead of leaking until garbage collection.
    """
    with open(filename) as f:
        return yaml.safe_load(f)


def file_exists(filename):
Expand All @@ -19,9 +45,9 @@ def file_exists(filename):

if __name__ == "__main__":
# clone or update legislator YAML
gpo_member_photos.download_legislator_data()
download_legislator_data()

legislators = gpo_member_photos.load_yaml(
legislators = load_yaml(
"congress-legislators/legislators-current.yaml")
for l in legislators:
bioguide = l['id']['bioguide']
Expand Down
205 changes: 9 additions & 196 deletions test/test_gpo_member_photos.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
`python test/test_gpo_member_photos.py`
"""
from __future__ import print_function, unicode_literals
import os
import sys
try:
import unittest2 as unittest
Expand All @@ -18,203 +19,15 @@

class TestSequenceFunctions(unittest.TestCase):
    """ Unit tests for the functions in gpo_member_photos """

    # Data loaded from test/legislators-test.yaml.  Cached on the class so
    # the file is only parsed once, by the first setUp() that runs.
    yaml_data = None

    def test_save_metadata(self):
        """ Test file is saved """
        bioguide_id = "A000000"
        gpo_member_photos.save_metadata(bioguide_id)
        self.assertTrue(os.path.exists("congress/metadata/A000000.yaml"))

    def setUp(self):
        """ Load the test legislators YAML once and check it is not empty """
        if self.yaml_data is None:
            self.__class__.yaml_data = gpo_member_photos.load_yaml(
                "test/legislators-test.yaml")
        self.assertTrue(len(self.yaml_data))

    # Test bioguide_id_from_url()

    def test_bioguide_id_from_url__last_char_not_slash(self):
        """ Test last char is not / when the URL ends with / """
        url = ("http://bioguide.congress.gov/scripts/biodisplay.pl"
               "?index=S001177/")
        output = gpo_member_photos.bioguide_id_from_url(url)
        self.assertNotEqual(output[-1], "/")

    def test_bioguide_id_from_url__last_char_not_slash2(self):
        """ Test last char is not / when the URL has no trailing / """
        url = ("http://bioguide.congress.gov/scripts/biodisplay.pl"
               "?index=S001177")
        output = gpo_member_photos.bioguide_id_from_url(url)
        self.assertNotEqual(output[-1], "/")

    def test_bioguide_id_from_url__is_string(self):
        """ Test output is string """
        url = ("http://bioguide.congress.gov/scripts/biodisplay.pl"
               "?index=S001177/")
        output = gpo_member_photos.bioguide_id_from_url(url)
        self.assertIsInstance(output, str)

    def test_bioguide_id_from_url__uppercase(self):
        """ Test a lower case initial in the URL is uppercased """
        url = ("http://bioguide.congress.gov/scripts/biodisplay.pl"
               "?index=e000288/")
        output = gpo_member_photos.bioguide_id_from_url(url)
        self.assertEqual(output[0], "E")

    def test_bioguide_id_from_url_with_ltr_mark(self):
        """ For some reason, some new URL links end with
        Unicode Character 'LEFT-TO-RIGHT MARK' (U+200E) """
        url = ("http://bioguide.congress.gov/scripts/biodisplay.pl"
               "?index=g000386" + u"\u200E" + "/")
        output = gpo_member_photos.bioguide_id_from_url(url)
        self.assertEqual(output, "G000386")

    # Test bioguide_id_valid()

    def test_bioguide_id_valid__none_returns_false(self):
        """ Test with None """
        candidate = None
        output = gpo_member_photos.bioguide_id_valid(candidate)
        self.assertFalse(output)

    def test_bioguide_id_valid__returns_true(self):
        """ Test with a valid ID """
        candidate = "K000362"
        output = gpo_member_photos.bioguide_id_valid(candidate)
        self.assertTrue(output)

    def test_bioguide_id_valid__returns_false(self):
        """ Test with an invalid ID """
        candidate = "aK000362z"
        output = gpo_member_photos.bioguide_id_valid(candidate)
        self.assertFalse(output)

    def test_bioguide_id_valid_url__returns_false(self):
        """ Test with an invalid ID, a URL """
        candidate = "http://young.house.gov"
        output = gpo_member_photos.bioguide_id_valid(candidate)
        self.assertFalse(output)

    def test_bioguide_id_valid_url__first_not_cap(self):
        """ Test with lower case initial """
        candidate = "r000515"
        output = gpo_member_photos.bioguide_id_valid(candidate)
        self.assertFalse(output)

    # Test remove_from_yaml()

    def test_remove_from_yaml__success(self):
        """ Test exactly one entry smaller after remove """
        bioguide_id = "C000127"
        length_before = len(self.yaml_data)

        remaining = gpo_member_photos.remove_from_yaml(self.yaml_data,
                                                       bioguide_id)
        self.assertGreater(length_before, len(remaining))
        self.assertEqual(len(remaining) + 1, length_before)

    def test_remove_from_yaml__not_found(self):
        """ Test same size when the ID is not present """
        bioguide_id = "NOT_THERE"
        length_before = len(self.yaml_data)
        remaining = gpo_member_photos.remove_from_yaml(self.yaml_data,
                                                       bioguide_id)
        self.assertEqual(len(remaining), length_before)

    # Test reverse_names()

    def test_reverse_names(self):
        """ Test reversing "Last, First M." into "First M. Last" """
        text = "Hagan, Kay R."
        output = gpo_member_photos.reverse_names(text)
        self.assertEqual(output, "Kay R. Hagan")

    # Test resolve()

    def test_resolve__exact_match_last_first(self):
        """ Test resolve with "Last, First" """
        text = "Alexander, Lamar"
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "A000360")

    def test_resolve__exact_match_last_first_middle(self):
        """ Test resolve with "Last, First M." """
        text = "Amodei, Mark E."
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "A000369")

    def test_resolve__exact_match_last_nickname(self):
        """ Test resolve with "Last, Nickname" """
        text = "Isakson, Johnny"
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "I000055")

    def test_resolve__with_accented_chars(self):
        """ Test resolve with accented characters in the name """
        text = u"Velázquez, Nydia M."
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "V000081")

    def test_resolve__initial_dot_from_middle(self):
        """ Test resolve with a dotted middle initial """
        text = "Kirk, Mark S."
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "K000360")

    def test_resolve__initial_not_in_yaml(self):
        """ Test resolve with an initial that is not in the YAML """
        text = "Ayotte, Kelly A."
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "A000368")

    def test_resolve__remove_nickname_quotes(self):
        """ Test resolve with a quoted nickname after the first name """
        text = 'Barr, Garland “Andy"'
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "B001282")

    def test_resolve__quoted_nickname(self):
        """ Test resolve with a quoted nickname after a middle initial """
        text = 'Fleischmann, Charles J. “Chuck"'
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "F000459")

    def test_resolve__missing_accents(self):
        """ Test resolve with a name given without its accents """
        text = "Cardenas, Tony"
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "C001097")

    def test_resolve__partial_firstname(self):
        """ Test resolve e.g. Michael to Mike """
        text = "Lee, Michael S."
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "L000577")

    def test_resolve__b001289(self):
        """ Test resolve special case B001289 """
        text = "Bradley, Byrne"
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "B001289")

    def test_resolve__c001089(self):
        """ Test resolve special case C001089 """
        text = "Curson, David Alan"
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "C001089")

    def test_resolve__g000535(self):
        """ Test resolve special case G000535 """
        text = "Gutierrez, Luis"
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertEqual(output, "G000535")

    def test_resolve__empty_text(self):
        """ Test resolve returns None for empty text """
        text = ""
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertIsNone(output)

    def test_resolve__none(self):
        """ Test resolve returns None for None """
        text = None
        output = gpo_member_photos.resolve(self.yaml_data, text)
        self.assertIsNone(output)

    def test_resize_photos(self):
        """ Test callable: passes as long as the call does not raise """
        gpo_member_photos.resize_photos()

if __name__ == '__main__':
unittest.main()
Expand Down

0 comments on commit 426c0bd

Please sign in to comment.