In [1]:
%load_ext autoreload
%autoreload 2

import re
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import matplotlib
plt.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 12, 6
matplotlib.rcParams['font.size'] = 18  # Probably OS Dependent

In [3]:
data_files = glob('../public/*.json')

dfs = []
for file_ in data_files:
    df = pd.read_json(file_)
    df['state'] = re.search('public/(.+).json', file_).group(1).title()
    dfs += [df]
    
df = pd.concat(dfs).reset_index()
df.sample(5, random_state=422)

Unnamed: 0,index,locale,official,emails,faxes,phones,county,address,party,state,contacts,physicalAddress,city,url,faxs,urls
1357,1105,Town of Three Lakes:Oneida County,Susan L Harris,[TOWNCLERKSUE@TOWNOFTHREELAKES.COM],[(715)546-3384],,Oneida County,"Po Box 565, Three Lakes, Wi 54562-0565",,Wisconsin,,"6965 W School St, Three Lakes, Wi 54562",Town of Three Lakes,,,
1681,1429,Village of Radisson:Sawyer County,Gwen Genari,[vclerk@bevcomm.net],[],,Sawyer County,"Po Box 127, Radisson, Wi 54867-0127",,Wisconsin,,"10598 W Railroad St, Radisson, Wi 54867",Village of Radisson,,,
2068,1816,City of Neenah:Winnebago County,Patty Sturn,"[PSTURN@CI.NEENAH.WI.US, scheslock@ci.neenah.w...",[(920)886-6109],,Winnebago County,"Po Box 426, Neenah, Wi 54956-0426",,Wisconsin,,"211 Walnut St, Neenah, Wi 54956-3026",City of Neenah,,,
3444,678,Marion Township:Sanilac County,Deborah G Williamson,[],[],[],Sanilac County,,,Michigan,,,Marion Township,,,
4361,60,St. Lucie County,Gertrude Walker,[elections@slcelections.com],,,St. Lucie County,,,Florida,,,,http://www.slcelections.com,,


In [11]:
def non_empty_element(el_list):
    if type(el_list) == float:
        return False
    values = [len(x.strip()) > 0 for x in el_list if x]
    if len(values) == 0:
        return False
    return any(values)

def not_null_empty(series):
    return (series.notnull() & series.apply(lambda x: x != ''))

email = df['emails'].apply(non_empty_element)
fax = df['faxes'].apply(non_empty_element)
city = not_null_empty(df['city'])
county = not_null_empty(df['county'])

df_valid = pd.DataFrame({
    'official': df['official'].notnull(),
    'locale': not_null_empty(df['locale']),
    'city': city,
    'county': county,
    'city_county': (city | county),
    'email': email,
    'faxes': fax,
    'fax_email': (email | fax),
})

df_stat = df_valid.groupby(df['state']).mean()
df_stat['records'] = df.groupby('state').size()
df_stat.sort_index()

Unnamed: 0_level_0,official,locale,city,county,city_county,email,faxes,fax_email,records
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Florida,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,67
Georgia,1.0,1.0,0.0,1.0,1.0,0.968553,0.987421,0.993711,159
Maine,1.0,1.0,1.0,0.0,1.0,0.0,0.94246,0.94246,504
Maryland,0.916667,1.0,0.0,1.0,1.0,1.0,0.0,1.0,24
Michigan,1.0,1.0,1.0,1.0,1.0,0.981107,0.687296,0.988925,1535
Minnesota,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,87
Nebraska,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,93
Virginia,1.0,1.0,0.285714,0.714286,1.0,1.0,1.0,1.0,133
Wisconsin,1.0,1.0,1.0,0.969779,1.0,0.861846,0.546681,0.95089,1853


## Michigan

In [5]:
df_mi = df[df['state'] == 'Michigan']
df_mi['city'].value_counts().head()

Grant Township       11
Sherman Township      9
Lincoln Township      7
Lake Township         6
Garfield Township     6
Name: city, dtype: int64

However, locales are unique

In [6]:
assert((df_mi['city'] + ':' + df_mi['county'].fillna('') == df_mi['locale']).all())
df_mi['locale'].value_counts().head()

Weare Township:Oceana County        1
Litchfield City:Hillsdale County    1
Colfax Township:Mecosta County      1
Coleman City:Midland County         1
Evart Township:Osceola County       1
Name: locale, dtype: int64

## Wisconsin Unique Identifier

Wisonsin towns cannot be uniquely identified by city

In [7]:
df_wi = df[df['state'] == 'Wisconsin']
df_wi['city'].value_counts().head()

Town of Lincoln       12
Town of Washington     8
Town of Scott          7
Town of Union          7
Town of Grant          6
Name: city, dtype: int64

The locales are unique, but don't properly handle multi-county cities (blank entry)

In [8]:
assert((df_wi['city'] + ':' + df_wi['county'].fillna('') == df_wi['locale']).all())
df_wi['locale'].value_counts().head()

Town of Christiana:Vernon County     1
Village of Ingram:Rusk County        1
City of Sturgeon Bay:Door County     1
Town of Evergreen:Washburn County    1
Town of Theresa:Dodge County         1
Name: locale, dtype: int64

So we need to ensure that all multi-county cities are unique across the entire state

In [9]:
multi_county_cities = df_wi.loc[df_wi['county'].isnull(), 'city']
df_wi.loc[df_wi['city'].isin(multi_county_cities), 'city'].value_counts().head()

Village of Wrightstown       1
Village of De Soto           1
Village of Blanchardville    1
Village of Mukwonago         1
Village of Spring Valley     1
Name: city, dtype: int64