In [1]:
%load_ext autoreload
%autoreload 2

import re
from glob import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import matplotlib
plt.style.use('ggplot')
matplotlib.rcParams['figure.figsize'] = 12, 6
matplotlib.rcParams['font.size'] = 18  # Probably OS Dependent

In [3]:
data_files = glob('../public/*.json')

def non_empty_element(el_list):
    values = [len(x.strip()) > 0 for x in el_list if x]
    if len(values) == 0:
        return False
    return any(values)

def not_null_empty(series):
    return (series.notnull() & series.apply(lambda x: x != ''))
    

data = []
for file_ in data_files:
    df = pd.read_json(file_)
    false_series = pd.Series(False, index=df.index)
    
    email = df['emails'].apply(non_empty_element) if 'emails' in df else false_series
    fax = df['faxes'].apply(non_empty_element) if 'faxes' in df else false_series
    
    city = not_null_empty(df['city']) if 'city' in df else false_series
    county = not_null_empty(df['county']) if 'county' in df else false_series

    data += [{
        'state': re.search('public/(.+).json', file_).group(1).title(),
        'records': df.shape[0],
        'fields': df.shape[1],
        'locale': not_null_empty(df['locale']).mean(),
        'city': city.mean(),
        'county': county.mean(),
        'official': not_null_empty(df['official']).mean(),
        'city_county': (city | county).mean(),
        'email': email.mean(),
        'faxes': fax.mean(),
        'fax_email': (email | fax).mean(),
    }]

pd.DataFrame(data)


Unnamed: 0,state,records,fields,locale,city,county,official,city_county,email,faxes,fax_email
0,Nebraska,93,8,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
1,Georgia,159,6,1.0,0.0,1.0,1.0,1.0,0.968553,0.987421,0.993711
2,Wisconsin,1853,9,1.0,1.0,0.969779,1.0,1.0,0.861846,0.546681,0.95089
3,Maine,504,6,1.0,1.0,0.0,1.0,1.0,0.0,0.94246,0.94246
4,Virginia,133,9,1.0,0.285714,0.714286,1.0,1.0,1.0,1.0,1.0
5,Maryland,24,8,1.0,0.0,1.0,0.916667,1.0,1.0,0.0,1.0
6,Michigan,1535,7,1.0,1.0,1.0,1.0,1.0,0.981107,0.687296,0.988925
7,Florida,67,5,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0
8,Minnesota,87,6,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0
