In [7]:
%load_ext autoreload
%autoreload 2

import re
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import matplotlib
plt.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 12, 6
matplotlib.rcParams['font.size'] = 18  # Probably OS Dependent

In [11]:
data_files = glob('../public/*.json')

def non_empty_element(el_list):
    values = [len(x.strip()) > 0 for x in el_list if x]
    if len(values) == 0:
        return False
    return any(values)

def not_null_empty(series):
    return (series.notnull() & series.apply(lambda x: x != ''))
    

data = []
for file_ in data_files:
    df = pd.read_json(file_)
    false_series = pd.Series(False, index=df.index)
    
    email = df['emails'].apply(non_empty_element) if 'emails' in df else false_series
    fax = df['faxes'].apply(non_empty_element) if 'faxes' in df else false_series
    
    city = not_null_empty(df['city']) if 'city' in df else false_series
    county = not_null_empty(df['county']) if 'county' in df else false_series

    data += [{
        'state': re.search('public/(.+).json', file_).group(1).title(),
        'official': not_null_empty(df['official']).mean(),
        'locale': not_null_empty(df['locale']).mean(),
        'city': city.mean(),
        'county': county.mean(),
        'city_county': (city | county).mean(),
        'emails': email.mean(),
        'faxes': fax.mean(),
        'emails_faxes': (email | fax).mean(),
        'records': df.shape[0],
        'fields': df.shape[1],
    }]

pd.DataFrame(data).sort_values('state')


Unnamed: 0,state,official,locale,city,county,city_county,emails,faxes,emails_faxes,records,fields
7,Florida,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,67,5
1,Georgia,1.0,1.0,0.0,1.0,1.0,0.968553,0.987421,0.993711,159,6
3,Maine,1.0,1.0,1.0,0.0,1.0,0.0,0.94246,0.94246,504,6
5,Maryland,0.916667,1.0,0.0,1.0,1.0,1.0,0.0,1.0,24,8
6,Michigan,1.0,1.0,1.0,1.0,1.0,0.981107,0.687296,0.988925,1535,7
8,Minnesota,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,87,6
0,Nebraska,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,93,8
4,Virginia,1.0,1.0,0.285714,0.714286,1.0,1.0,1.0,1.0,133,9
2,Wisconsin,1.0,1.0,1.0,0.969779,1.0,0.861846,0.546681,0.95089,1853,9
