Permalink
Browse files

[#1002] Replacing all ad hoc name standardization with Name Cleaver

  • Loading branch information...
1 parent 810b856 commit 89321c2d7897ca778b7080cdf1b8bdadddaafa51 @arowla arowla committed Nov 23, 2011
View
@@ -1,7 +1,7 @@
# Create your views here.
import json, csv, os
from django.http import HttpResponse
-from influence.names import standardize_name
+from influence.helpers import standardize_name
from django.template.defaultfilters import slugify
from django.contrib.localflavor.us.us_states import US_STATES
View
@@ -1,13 +1,24 @@
from django.http import Http404
from django.template.defaultfilters import slugify
-from influence import external_sites
-from influence.names import standardize_name
from influenceexplorer import DEFAULT_CYCLE
from settings import api, LATEST_CYCLE
import datetime
import googleanalytics
import re
from django.utils.datastructures import SortedDict
+from name_cleaver import PoliticianNameCleaver, OrganizationNameCleaver, \
+ IndividualNameCleaver
+
+
def _clean_politician_name(raw_name):
    """Parse a raw politician name with PoliticianNameCleaver."""
    return PoliticianNameCleaver(raw_name).parse()


def _clean_individual_name(raw_name):
    """Parse a raw individual name with IndividualNameCleaver."""
    return IndividualNameCleaver(raw_name).parse()


def _clean_organization_name(raw_name):
    """Parse a raw organization (or industry) name with OrganizationNameCleaver."""
    return OrganizationNameCleaver(raw_name).parse()


# Entity type -> one-argument callable that cleans a raw name of that type.
# Industries share the organization cleaver.
_standardizers = {
    'politician': _clean_politician_name,
    'individual': _clean_individual_name,
    'industry': _clean_organization_name,
    'organization': _clean_organization_name,
}


def standardize_name(name, type):
    """Clean ``name`` using the standardizer registered for ``type``.

    ``type`` must be one of the keys of ``_standardizers``; any other value
    raises ``KeyError`` from the dictionary lookup.  The return value is
    whatever the cleaver's ``parse()`` returns.
    NOTE(review): presumably a name object rather than a plain string --
    confirm against name_cleaver before relying on string methods.
    """
    cleaner = _standardizers[type]
    return cleaner(name)
def bar_validate(data):
''' take a dict formatted for submission to the barchart
View
@@ -1,90 +0,0 @@
-import re
-import string
-from name_cleaver import PoliticianNameCleaver
-
-
def standardize_individual_name(name):
    """Return ``name`` as a display-ready individual name.

    Steps, in order: split off a trailing honorific/suffix, flip
    "Last, First" into "First Last", re-attach the pieces, drop trailing
    digit runs and a leading "Mr", then normalize capitalization.

    A trailing "Mrs" honorific is moved to the front and kept; every other
    trailing honorific is discarded.  A suffix (Jr/Sr/II/...) stays last.
    """
    name, honorific, suffix = separate_affixes(name)
    name = convert_name_to_first_last(name)

    # The captured honorific may carry a trailing period, comma or space
    # ("MRS."), so normalize before comparing; otherwise "Mrs." would be
    # silently dropped like "Mr".
    keep_honorific = bool(honorific) and \
        honorific.lower().rstrip(' .,') == 'mrs'
    pieces = [honorific.strip() if keep_honorific else None, name, suffix]
    name = ' '.join(piece for piece in pieces if piece)

    # strip any trailing numbers
    name = re.sub(r'\d{2,}\s*$', '', name)
    # Strip a leading 'Mr' that separate_affixes did not catch (e.g. the
    # name was already in first-last order).  The (?i) flag must lead the
    # pattern: a global flag placed after '^' is rejected by newer Pythons.
    name = re.sub(r'(?i)^\s*mr\.?\s+', '', name)

    return convert_case(name)
-
def standardize_organization_name(name):
    """Return ``name`` trimmed, case-normalized, and with "PAC" uppercased.

    A name that is a single word ending in "PAC" (any case) is uppercased
    entirely; otherwise only the standalone word "pac" is uppercased.
    """
    cleaned = convert_case(name.strip())

    # A lone word ending in PAC is treated as an acronym: shout all of it.
    if re.match(r'(?i)^\w*PAC$', cleaned):
        return cleaned.upper()

    # Otherwise only the PAC word itself is uppercased.
    return re.sub(r'(?i)\bpac\b', 'PAC', cleaned)
-
def standardize_industry_name(name):
    """Return ``name`` case-normalized, with letters after '/' or '-' capitalized."""
    def _shout(match):
        # Uppercases the whole match; the separator is unaffected by upper().
        return match.group().upper()

    cleaned = convert_case(name).strip()
    # Slash first, then hyphen, matching the original substitution order.
    for separator_pattern in (r'/([a-z])', r'-([a-z])'):
        cleaned = re.sub(separator_pattern, _shout, cleaned)

    return cleaned
-
def _standardize_politician_name(name):
    """Parse a politician's name with PoliticianNameCleaver."""
    return PoliticianNameCleaver(name).parse()


# Entity type -> function that standardizes a raw name of that type.
_standardizers = {
    'politician': _standardize_politician_name,
    'individual': standardize_individual_name,
    'industry': standardize_industry_name,
    'organization': standardize_organization_name,
}


def standardize_name(name, type):
    """Standardize ``name`` with the function registered for ``type``.

    Raises ``KeyError`` when ``type`` is not a key of ``_standardizers``.
    """
    handler = _standardizers[type]
    return handler(name)
-
def separate_affixes(name):
    """Split a trailing honorific or generational suffix off ``name``.

    Returns a ``(name, honorific, suffix)`` tuple.  At most one of
    ``honorific`` (mr/mrs/ms) and ``suffix`` (jr/sr/II/III/...) is set; both
    are ``None`` and ``name`` is returned untouched when nothing matches.
    Only an affix at the very end of the string is recognized, optionally
    followed by periods/commas and whitespace.  The returned name keeps any
    whitespace or comma that preceded the affix.
    """
    # The honorific group matches the letters only; trailing punctuation is
    # consumed outside the group.  (A bare '.?' inside the group used to
    # match ANY character, so e.g. "MSc" was mistaken for an honorific and
    # "Mrs." was captured with its dot.)
    matches = re.search(
        r'^\s*(?P<name>.*)\b'
        r'((?P<honorific>m[rs]s?)|(?P<suffix>([js]r|I{2,})))'
        r'[.,]*\s*$',
        name,
        re.IGNORECASE)
    if matches:
        return matches.group('name', 'honorific', 'suffix')
    return name, None, None
-
def convert_case(name):
    """Normalize the capitalization of ``name``.

    Names that already look mixed-case are trusted as-is; anything else is
    run through ``string.capwords``.  Roman-numeral suffixes and Mc/Mac
    prefixes are then fixed up in that order.
    """
    if not is_mixed_case(name):
        name = string.capwords(name)
    return uppercase_the_scots(uppercase_roman_numeral_suffix(name))
-
def uppercase_roman_numeral_suffix(name):
    """Uppercase a trailing word made only of the letters i, v and x.

    Only the final word is touched.  (Substituting the suffix text across
    the whole string also rewrote identical letter runs earlier in the
    name, e.g. "Iiro Ii" -> "IIro II".)  Names without such a trailing
    word are returned unchanged.

    NOTE: any trailing word spelled with i/v/x qualifies, so a final word
    such as "Xi" is uppercased as well.
    """
    return re.sub(r'(?i)\b[ivx]+$', lambda m: m.group().upper(), name)
-
def uppercase_the_scots(name):
    """Capitalize Mc/Mac prefixes: "Mcdonald" -> "McDonald".

    Every word starting with "mc" or "mac" (any case) gets the prefix
    title-cased and the following character uppercased.  All such words in
    the name are handled, not just the first one found.

    NOTE: the match is purely textual, so ordinary words that begin with
    "mac"/"mc" (e.g. "Machine") are rewritten too.
    """
    def _capitalize(match):
        return match.group('mc').title() + match.group('first_letter').upper()

    return re.sub(r'(?i)\b(?P<mc>ma?c)(?P<first_letter>\w)', _capitalize, name)
-
def is_mixed_case(name):
    """Return True if ``name`` contains an uppercase letter directly followed
    by a lowercase one (ASCII only), i.e. it already looks capitalized.
    """
    # Return a real boolean rather than leaking the match object.
    return re.search(r'[A-Z][a-z]', name) is not None
-
def convert_name_to_first_last(name):
    """Turn "Last, First" into "First Last".

    The name is split on commas, each piece is trimmed, and the pieces are
    joined in reverse order with single spaces.  Empty pieces (from a
    trailing or doubled comma) are dropped so they cannot introduce stray
    leading or doubled spaces.  A name without a comma is returned as-is.
    """
    if ',' not in name:
        return name

    pieces = [piece.strip() for piece in name.split(',')]
    return ' '.join(piece for piece in reversed(pieces) if piece)
-
View
@@ -1,7 +1,7 @@
from django.contrib.sitemaps import Sitemap
from django.contrib.sitemaps.views import index, sitemap
from django.conf import settings
-from influence import names
+from influence import helpers
from django.template.defaultfilters import slugify
from django.http import HttpResponse
import os
@@ -40,7 +40,7 @@ class EntitySitemap(Sitemap):
def __init__(self):
entity_type = self.entity_type or 'individual'
- self.cleaner = names._standardizers[entity_type]
+ self.cleaner = helpers._standardizers[entity_type]
def clean(self, string):
return slugify(self.cleaner(string))
@@ -1,8 +1,7 @@
from django import template
from django.template.defaultfilters import stringfilter
-from influence.names import standardize_individual_name, standardize_organization_name, \
- standardize_industry_name
-from name_cleaver import PoliticianNameCleaver
+from name_cleaver import PoliticianNameCleaver, IndividualNameCleaver, \
+ OrganizationNameCleaver
register = template.Library()
@@ -16,17 +15,17 @@ def standardize_politician_name_filter(name):
@register.filter(name='standardize_individual_name')
@stringfilter
def standardize_individual_name_filter(name):
- return standardize_individual_name(name)
+ return str(IndividualNameCleaver(name).parse())
@register.filter(name='standardize_organization_name')
@stringfilter
def standardize_organization_name_filter(name):
- return standardize_organization_name(name)
+ return str(OrganizationNameCleaver(name).parse())
@register.filter(name='standardize_industry_name')
@stringfilter
def standardize_industry_name_filter(name):
- return standardize_industry_name(name)
+ return str(OrganizationNameCleaver(name).parse())
seat_labels = {'federal:senate': 'US Senate',
@@ -109,4 +108,4 @@ def sunlight_author_uri(value):
shortname = "%s%s"%(value[0][:1],value[-1])
else:
shortname = value[0]
- return "%s%s"%(SUNLIGHT_STAFF_BASE_URI, shortname)
+ return "%s%s"%(SUNLIGHT_STAFF_BASE_URI, shortname)
View
@@ -1,6 +1,4 @@
from django.test import TestCase
-
-from influence import names
from settings import api
CYCLE = 2008
@@ -110,52 +108,3 @@ def test_reg_issues(self):
self.assertLength(10, api.org.registrant_issues(self.NICKLES, CYCLE))
class IndividualNameStandardizationTests(TestCase):
    """Expected outputs of names.standardize_individual_name."""

    def _check(self, expected, raw_name):
        # Shared assertion: standardizing raw_name must yield expected.
        self.assertEqual(expected, names.standardize_individual_name(raw_name))

    def test_all_kinds_of_crazy(self):
        self._check('Stanford Z Rothschild', 'ROTHSCHILD 212, STANFORD Z MR')

    def test_jr_and_the_like_end_up_at_the_end(self):
        self._check('Frederick A "Tripp" Baird III', 'Baird, Frederick A "Tripp" III')

    def test_throw_out_mr(self):
        self._check('T Boone Pickens', 'Mr T Boone Pickens')
        self._check('T Boone Pickens', 'Mr. T Boone Pickens')
        self._check('T Boone Pickens', 'Pickens, T Boone Mr')
        self._check('John L Nau', ' MR JOHN L NAU,')

    def test_keep_the_mrs(self):
        self._check('Mrs T Boone Pickens', 'Mrs T Boone Pickens')
        self._check('Mrs. T Boone Pickens', 'Mrs. T Boone Pickens')
        self._check('Mrs Stanford Z Rothschild', 'ROTHSCHILD 212, STANFORD Z MRS')

    def test_capitalize_roman_numeral_suffixes(self):
        self._check('Ken Cuccinelli II', 'KEN CUCCINELLI II')
        self._check('Ken Cuccinelli II', 'CUCCINELLI II, KEN')
        self._check('Ken Cuccinelli IV', 'CUCCINELLI IV, KEN')
        self._check('Ken Cuccinelli IX', 'CUCCINELLI IX, KEN')

    def test_capitalize_scottish_last_names(self):
        self._check('Ronald McDonald', 'RONALD MCDONALD')
        self._check('Old MacDonald', 'OLD MACDONALD')

-
class OrganizationNameStandardizationTests(TestCase):
    """Expected outputs of names.standardize_organization_name."""

    def test_capitalize_pac(self):
        self.assertEqual('Nancy Pelosi Leadership PAC', names.standardize_organization_name('NANCY PELOSI LEADERSHIP PAC'))

    def test_make_single_word_names_ending_in_pac_all_uppercase(self):
        self.assertEqual('ECEPAC', names.standardize_organization_name('ECEPAC'))

    def test_names_starting_with_PAC(self):
        self.assertEqual('PAC For Engineers', names.standardize_organization_name('PAC FOR ENGINEERS'))
        self.assertEqual('PAC 102', names.standardize_organization_name('PAC 102'))

    def test_doesnt_bother_names_containing_string_pac(self):
        self.assertEqual('Pacific Trust', names.standardize_organization_name('PACIFIC TRUST'))

    def test_capitalize_scottish_names(self):
        # Exercise the organization standardizer here; this class previously
        # called the individual-name function by mistake, leaving the
        # organization code path untested for Mc/Mac names.
        self.assertEqual('McDonnell Douglas', names.standardize_organization_name('MCDONNELL DOUGLAS'))
        self.assertEqual('MacDonnell Douglas', names.standardize_organization_name('MACDONNELL DOUGLAS'))

-
View
@@ -12,10 +12,8 @@
from influence.helpers import prepare_entity_view, generate_label, barchart_href, \
bar_validate, pie_validate, months_into_cycle_for_date, \
filter_bad_spending_descriptions, make_bill_link, get_top_pages
-from influence.names import standardize_organization_name, \
- standardize_industry_name
from influenceexplorer import DEFAULT_CYCLE
-from name_cleaver import PoliticianNameCleaver
+from name_cleaver import PoliticianNameCleaver, OrganizationNameCleaver
from settings import LATEST_CYCLE, TOP_LISTS_CYCLE, api
from urllib2 import URLError
import datetime
@@ -207,7 +205,7 @@ def org_contribution_section(entity_id, standardized_name, cycle, amount, type,
if type == 'industry':
section['top_orgs'] = json.dumps([
{
- 'key': generate_label(standardize_organization_name(org['name'])),
+ 'key': generate_label(str(OrganizationNameCleaver(org['name']).parse())),
'value': org['total_amount'],
'value_employee': org['employee_amount'],
'value_pac': org['direct_amount'],
@@ -233,7 +231,7 @@ def org_contribution_section(entity_id, standardized_name, cycle, amount, type,
pacs_barchart_data = []
for record in recipient_pacs:
pacs_barchart_data.append({
- 'key': generate_label(standardize_organization_name(record['name'])),
+ 'key': generate_label(str(OrganizationNameCleaver(record['name']).parse())),
'value' : record['total_amount'],
'value_employee' : record['employee_amount'],
'value_pac' : record['direct_amount'],
@@ -473,7 +471,7 @@ def pol_contribution_section(entity_id, standardized_name, cycle, amount, extern
contributors_barchart_data = []
for record in top_contributors:
contributors_barchart_data.append({
- 'key': generate_label(standardize_organization_name(record['name'])),
+ 'key': generate_label(str(OrganizationNameCleaver(record['name']).parse())),
'value' : record['total_amount'],
'value_employee' : record['employee_amount'],
'value_pac' : record['direct_amount'],
@@ -485,7 +483,7 @@ def pol_contribution_section(entity_id, standardized_name, cycle, amount, extern
industries_barchart_data = []
for record in top_industries:
industries_barchart_data.append({
- 'key': generate_label(standardize_industry_name(record['name'])),
+ 'key': generate_label(str(OrganizationNameCleaver(record['name']).parse())),
'href': barchart_href(record, cycle, 'industry'),
'value' : record['amount'],
})
@@ -607,7 +605,7 @@ def indiv_contribution_section(entity_id, standardized_name, cycle, amount, exte
orgs_barchart_data = []
for record in recipient_orgs:
orgs_barchart_data.append({
- 'key': generate_label(standardize_organization_name(record['recipient_name'])),
+ 'key': generate_label(str(OrganizationNameCleaver(record['recipient_name']).parse())),
'value' : record['amount'],
'href' : barchart_href(record, cycle, entity_type="organization"),
})
@@ -9,8 +9,7 @@
from gevent import monkey
import gevent
from django.utils.datastructures import SortedDict
-from influence.helpers import standardize_organization_name, standardize_industry_name
-from name_cleaver import PoliticianNameCleaver
+from name_cleaver import PoliticianNameCleaver, OrganizationNameCleaver
from django.template.defaultfilters import slugify
import urllib2
import re
@@ -171,13 +170,13 @@ def fetch_photos(candidate):
contributions = SortedDict()
for cont in candidate['contributions'][:5]:
if float(cont['total_amount']) >= 0:
- contributions[standardize_organization_name(cont['name'])] = cont['total_amount']
+ contributions[str(OrganizationNameCleaver(cont['name']).parse())] = cont['total_amount']
# industry data
industries = SortedDict()
for cont in candidate['industries'][:5]:
if float(cont['amount']) >= 0:
- industries[standardize_industry_name(cont['name'])] = cont['amount']
+ industries[str(OrganizationNameCleaver(cont['name']).parse())] = cont['amount']
#deal with the name
name_obj = PoliticianNameCleaver(candidate['name']).parse()

0 comments on commit 89321c2

Please sign in to comment.