In [1]:
%load_ext autoreload
%autoreload 2

import re
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import matplotlib
plt.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 12, 6
matplotlib.rcParams['font.size'] = 18  # Probably OS Dependent

In [3]:
data_files = glob('../public/*.json')

dfs = []
for file_ in data_files:
    df = pd.read_json(file_)
    df['state'] = re.search('public/(.+).json', file_).group(1).title()
    dfs += [df]
    
df = pd.concat(dfs).reset_index()

def empty_null(el_list):
    if type(el_list) != list:
        return []
    return el_list
df['emails'] = df['emails'].apply(empty_null)
df['faxes'] = df['faxes'].apply(empty_null)

df.sample(5, random_state=422)

Unnamed: 0,index,locale,official,emails,faxes,phones,county,address,party,state,contacts,physicalAddress,city,url,faxs,urls
1357,1105,Town of Three Lakes:Oneida County,Susan L Harris,[TOWNCLERKSUE@TOWNOFTHREELAKES.COM],[(715)546-3384],,Oneida County,"Po Box 565, Three Lakes, Wi 54562-0565",,Wisconsin,,"6965 W School St, Three Lakes, Wi 54562",Town of Three Lakes,,,
1681,1429,Village of Radisson:Sawyer County,Gwen Genari,[vclerk@bevcomm.net],[],,Sawyer County,"Po Box 127, Radisson, Wi 54867-0127",,Wisconsin,,"10598 W Railroad St, Radisson, Wi 54867",Village of Radisson,,,
2068,1816,City of Neenah:Winnebago County,Patty Sturn,[scheslock@ci.neenah.wi.us; PSTURN@CI.NEENAH.W...,[(920)886-6109],,Winnebago County,"Po Box 426, Neenah, Wi 54956-0426",,Wisconsin,,"211 Walnut St, Neenah, Wi 54956-3026",City of Neenah,,,
3444,678,Marion Township:Sanilac County,Deborah G Williamson,[],[],[(810) 376-8229],Sanilac County,,,Michigan,,,Marion Township,,,
4361,60,St. Lucie County,Gertrude Walker,[elections@slcelections.com],[],,St. Lucie County,,,Florida,,,,http://www.slcelections.com,,


In [4]:
def non_empty_element(el_list):
    values = [len(x.strip()) > 0 for x in el_list if x]
    if len(values) == 0:
        return False
    return any(values)

def not_null_empty(series):
    return (series.notnull() & series.apply(lambda x: x != ''))

email = df['emails'].apply(empty_null).apply(non_empty_element)
fax = df['faxes'].apply(empty_null).apply(non_empty_element)
city = not_null_empty(df['city'])
county = not_null_empty(df['county'])

df_valid = pd.DataFrame({
    'official': df['official'].notnull(),
    'locale': not_null_empty(df['locale']),
    'city': city,
    'county': county,
    'city_county': (city | county),
    'email': email,
    'faxes': fax,
    'fax_email': (email | fax),
})

df_stat = df_valid.groupby(df['state']).mean()
df_stat.loc[:, 'records'] = df.groupby('state').size()

df_len = df[['state']]
df_len.loc[:, 'emails'] = df['emails'].apply(lambda x: len(x))
df_len.loc[:, 'faxes'] = df['faxes'].apply(lambda x: len(x))
df_stat.loc[:, 'email_max'] = df_len.groupby('state')['emails'].max()
df_stat.loc[:, 'fax_max'] = df_len.groupby('state')['faxes'].max()
df_stat.sort_index()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0_level_0,official,locale,city,county,city_county,email,faxes,fax_email,records,email_max,fax_max
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Florida,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,67,1,0
Georgia,1.0,1.0,0.0,1.0,1.0,0.968553,0.987421,0.993711,159,1,1
Maine,1.0,1.0,1.0,0.0,1.0,0.0,0.94246,0.94246,504,0,1
Maryland,0.916667,1.0,0.0,1.0,1.0,1.0,0.0,1.0,24,4,0
Michigan,1.0,1.0,1.0,1.0,1.0,0.981107,0.687296,0.988925,1535,1,1
Minnesota,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,87,1,1
Nebraska,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,93,1,1
Virginia,1.0,1.0,0.285714,0.714286,1.0,1.0,1.0,1.0,133,1,1
Wisconsin,1.0,1.0,1.0,0.969779,1.0,0.861846,0.546681,0.95089,1853,2,1


## Michigan

In [5]:
df_mi = df[df['state'] == 'Michigan']
df_mi['city'].value_counts().head()

Grant Township       11
Sherman Township      9
Lincoln Township      7
Lake Township         6
Garfield Township     6
Name: city, dtype: int64

However, locales are unique

In [6]:
assert((df_mi['city'] + ':' + df_mi['county'].fillna('') == df_mi['locale']).all())
df_mi['locale'].value_counts().head()

Stanton City:Montcalm County        1
Casco Township:Allegan County       1
Hudson Township:Mackinac County     1
Volinia Township:Cass County        1
Coldwater Township:Branch County    1
Name: locale, dtype: int64

## Wisconsin Unique Identifier

Wisonsin towns cannot be uniquely identified by city

In [7]:
df_wi = df[df['state'] == 'Wisconsin']
df_wi['city'].value_counts().head()

Town of Lincoln       12
Town of Washington     8
Town of Union          7
Town of Scott          7
Town of Grant          6
Name: city, dtype: int64

The locales are unique, but don't properly handle multi-county cities (blank entry)

In [8]:
assert((df_wi['city'] + ':' + df_wi['county'].fillna('') == df_wi['locale']).all())
df_wi['locale'].value_counts().head()

Village of Yuba:Richland County            1
Town of Tipler:Florence County             1
Village of St. Cloud:Fond Du Lac County    1
City of Marinette:Marinette County         1
Town of Lima:Rock County                   1
Name: locale, dtype: int64

So we need to ensure that all multi-county cities are unique across the entire state

In [9]:
multi_county_cities = df_wi.loc[df_wi['county'].isnull(), 'city']
df_wi.loc[df_wi['city'].isin(multi_county_cities), 'city'].value_counts().head()

Village of Bayside           1
Village of Blanchardville    1
Village of Birnamwood        1
Village of Unity             1
Village of Newburg           1
Name: city, dtype: int64