# Nutrition, Physical Activity, and Obesity - Behavioral Risk Factor Surveillance System

The data used here is from Data.gov, managed and hosted by the U.S. General Services Administration, Technology Transformation Service (https://catalog.data.gov)

The dataset includes data on adult's diet, physical activity, and weight status from Behavioral Risk Factor Surveillance System. This data is used for DNPAO's Data, ...

## Setup

First we need to import the some libraries

In [1]:
from copy import deepcopy
import json
import sys

from IPython.display import display
import matplotlib as mpl
import matplotlib.pyplot as plt
import networkx as nx

import urllib.request
from xnd import xnd

In [2]:
def walk(G, node):
    if isinstance(node, dict):
        for key, item in node.items():
            if isinstance(item, dict):
                for j in item.keys():
                    G.add_edge(key, j)
                walk(G, item)
    if isinstance(node, list):
        walk(G, node[0])
    return
            
def gen_graph(data):
    G = nx.Graph()
    
    for key in data.keys():
        G.add_edge('ROOT', key)

    walk(G, data)
    
    return G

In [3]:
def get_key_value(item):
    if isinstance(item, dict):
        return item.items()
    elif isinstance(item, list):
        return enumerate(item)
    return ()

In [4]:
def highest_value_type(values):
    """
    TODO: implement a way to treat "type weight"
    
    values: list
    returns: type
    """
    _types = {type(v) for v in values if v is not None}
    
    if not _types:
        return None
    return _types.pop()

# highest_value_type([1,3,4,5, None])

In [5]:
def highest_type(values):
    """
    TODO: implement a way to treat "type weight"
    
    values: list
    returns: type
    """
    _types = {v for v in values if v is not type(None)}
    
    if not _types:
        return type(None)
    return _types.pop()

# highest_type([int, int ,int , str, type(None)])    

In [6]:
def apply_nullable_on_children(parent):
    if isinstance(parent, dict): 
        for k, child in parent.items():
            child['nullable'] = True
            if 'value' in child:
                apply_nullable_on_children(child['value'])
    return parent

"""                 
apply_nullable_on_children({'tree': {'value_type': dict,
  'nullable': False,
  'value': {'leaf1': {'value_type': list,
    'nullable': False,
    'count': 2,
    'value': int},
   'leaf2': {'value_type': list,
    'nullable': False,
    'count': 2,
    'value': int}}},
 'cars': {'value_type': list,
  'nullable': False,
  'count': 3,
  'value': {'model': {'nullable': False, 'value_type': str},
   'color': {'nullable': False, 'value_type': str},
   'year': {'nullable': False, 'value_type': int},
   'ports': {'nullable': False,
    'value_type': dict,
    'value': {'numbers': {'value_type': int, 'nullable': False}}}}}})
""";

In [7]:
def json_cleaning(data):
    data = deepcopy(data)
    if isinstance(data, dict):
        for k in list(data.keys()):
            k_new = k.replace(' ', '_')
            
            if k != k_new:
                data[k_new] = data.pop(k)
            
            k = k_new
            if isinstance(data[k], (dict, list)):
                data[k] = json_cleaning(data[k])
                
    if isinstance(data, list):
        for i, v in enumerate(data):
            if isinstance(v, (dict, list)):
                data[i] = json_cleaning(data[i])
    return data

# json_cleaning({'k b': 2, 'a w': {'bla bla': 2, 'x x': [1, 2, {'asdf asdf ': 1}]}})

In [8]:
def json_meta_type(data, parent_nullable=False):
    """The function should return a json with all meta data normalized."""
    meta = {}

    for k, item in get_key_value(data):
        meta[k] = {'value_type': item.__class__}
        meta_k = meta[k]  # alias
        
        if meta_k['value_type'] is type(None) or parent_nullable:
            meta_k['nullable'] = True
        else:
            meta_k['nullable'] = False
        
        if isinstance(item, dict):
            meta_k['value'] = json_meta_type(item, meta_k['nullable'])
        elif isinstance(item, list):
            meta_k['count'] = len(item)
            _value_highest_type = highest_value_type(item)
            
            if _value_highest_type is dict:
                meta_k['value'] = {}
                meta_k_value = meta_k['value']  # alias
                # first_iter and columns are used to find nullable fields
                first_iter = True
                columns = set()
            
            for v in item:
                if _value_highest_type != type(v):
                    raise Exception('All items from the list should be the same type.')
                    
                if isinstance(v, dict):
                    children_value = json_meta_type(v, meta_k['nullable'])
                    children_keys = v.keys()
                    
                    for _k, _v in children_value.items():
                        if parent_nullable or (not first_iter and _k not in columns):
                            nullable = True
                        else:
                            nullable = False
                            
                        if (
                            _k not in meta_k_value or 
                            not _v['value_type'] is type(None)
                        ):
                            if _k not in meta_k_value:
                                meta_k_value[_k] = {
                                    'nullable': False,
                                    'value_type': type(None)
                                }
                            _nullable = meta_k_value[_k]['nullable']
                            _v['value_type'] = highest_type([meta_k_value[_k]['value_type'], _v['value_type']])
                            meta_k_value[_k].update(_v)
                            meta_k_value[_k].update({'nullable': meta_k_value[_k]['nullable'] | _nullable})
                            
                        if _v['value_type'] is type(None) or nullable:
                            meta_k_value[_k]['nullable'] = True
                            if 'value' in meta_k_value[_k] and isinstance(meta_k_value[_k], dict):
                                apply_nullable_on_children(meta_k_value[_k]['value'])

                    for c in columns:
                        if parent_nullable or c not in children_keys:
                            meta_k_value[c]['nullable'] = True
                            if 'value' in meta_k_value[_k] and isinstance(meta_k_value[_k], dict):
                                apply_nullable_on_children(meta_k_value[_k]['value'])
                    
                    columns |= set(children_keys)
                            
                first_iter = False
                    
            if 'value' not in meta_k:
                meta_k['value'] = _value_highest_type
    return meta
# json_meta_type(test_data)

In [9]:
def map_type(python_type):
    types = {
        'str': 'string',
        'int': 'int64'
    }
    
    if python_type in types:
        return types[python_type]
    else:
        return python_type

In [10]:
def get_xnd_type(json_meta):
    _xnd_type_value = []
    
    if isinstance(json_meta, dict):
        _xnd_type_template = '{{{}}}'
    elif isinstance(json_meta, list):
        _xnd_type_template = '[{}]'
    else:
        _xnd_type_template = '{}'
    
    for k, item in get_key_value(json_meta):
        nullable_str = '?' if 'nullable' in item and item['nullable'] else ''
        
        if item['value_type'] is dict:
            if isinstance(item['value'], dict):
                _xnd_item_value = get_xnd_type(item['value'])
            else:
                _xnd_item_value = item['value'].__name__
            _xnd_item_template = '{}: {}{{}}'.format(k, nullable_str)
            
        elif item['value_type'] is list:
            if isinstance(item['value'], dict):
                _xnd_item_value = get_xnd_type(item['value'])
            else:
                _xnd_item_value = map_type(item['value'].__name__)
            _xnd_item_template = '{}: {} * {}{{}}'.format(k, item['count'], nullable_str)
        else:
            _xnd_item_value = map_type(item['value_type'].__name__)
            _xnd_item_template = '{}: {}{{}}'.format(k, nullable_str)
        
        _xnd_type_value.append(_xnd_item_template.format(_xnd_item_value))
    return _xnd_type_template.format(', '.join(_xnd_type_value))

In [11]:
def normalize_json(data, meta=None):
    data = deepcopy(data)
    
    if meta is None:
        meta = json_meta_type(data)
    
    if isinstance(meta, dict):
        for k_meta, item_meta in meta.items():
            if isinstance(item_meta, dict):
                if 'value_type' not in item_meta:
                    for k, v in item_meta.items():
                        if k not in data:
                            data[k] = None
                elif item_meta['value_type'] is dict:
                    if k_meta not in data:
                        data[k_meta] = None
                elif item_meta['value_type'] is list:
                    for k_data, item_data in get_key_value(data[k_meta]):
                        if isinstance(item_data, dict):
                            data[k_meta][k_data].update(normalize_json(item_data, item_meta))
    return data

In [12]:
test_data = {
    'tree': {
        'leaf1': [1, 2],
        'leaf2': [2, 4]}, 
    'cars': [
        {'model': 'model1', 'year': 2010, 'ports': None},
        {'model': 'model2', 'color': 'blue'},
        {'ports': {'numbers': 4}}
    ]
}

In [13]:
new_test_data = json_cleaning(test_data)

In [14]:
meta_type = json_meta_type(new_test_data)
meta_type

{'tree': {'value_type': dict,
  'nullable': False,
  'value': {'leaf1': {'value_type': list,
    'nullable': False,
    'count': 2,
    'value': int},
   'leaf2': {'value_type': list,
    'nullable': False,
    'count': 2,
    'value': int}}},
 'cars': {'value_type': list,
  'nullable': False,
  'count': 3,
  'value': {'model': {'nullable': True, 'value_type': str},
   'year': {'nullable': True, 'value_type': int},
   'ports': {'nullable': True,
    'value_type': dict,
    'value': {'numbers': {'value_type': int, 'nullable': True}}},
   'color': {'nullable': True, 'value_type': str}}}}

In [15]:
test_type = get_xnd_type(meta_type)
test_type

'{tree: {leaf1: 2 * int64, leaf2: 2 * int64}, cars: 3 * {model: ?string, year: ?int64, ports: ?{numbers: ?int64}, color: ?string}}'

In [16]:
xnd(normalize_json(test_data), type=test_type)

xnd({'tree': {'leaf1': [1, 2], 'leaf2': [2, 4]},
     'cars': [{'model': 'model1', 'year': 2010, 'ports': None, 'color': None},
      {'model': 'model2', 'year': None, 'ports': None, 'color': 'blue'},
      {'model': None, 'year': None, 'ports': {'numbers': 4}, 'color': None}]},
    type='{tree : {leaf1 : 2 * int64, leaf2 : 2 * int64}, cars : 3 * {model : ?string, year : ?int64, ports : ?{numbers : ?int64}, color : ?string}}')

## Loading Data

The data for this tutorial could be downloaded at https://chronicdata.cdc.gov/views/hn4x-zwk7/rows.json?accessType=DOWNLOAD

We already have this data at `data/data-gov-nutrition.json`

In [17]:
with open('data/data-gov-nutrition.json') as f:
    data = json_cleaning(json.load(f))

In [18]:
str_summary = str(data)[:500] + '...'
print(str_summary.replace(', ', ',\n'))

{'meta': {'view': {'id': 'hn4x-zwk7',
'name': 'Nutrition,
Physical Activity,
and Obesity - Behavioral Risk Factor Surveillance System',
'attribution': 'Centers for Disease Control and Prevention (CDC),
National Center for Chronic Disease Prevention and Health Promotion,
Division of Nutrition,
Physical Activity,
and Obesity',
'attributionLink': 'http://www.cdc.gov/nccdphp/DNPAO/index.html',
'averageRating': 0,
'category': 'Nutrition,
Physical Activity,
and Obesity',
'createdAt': 1469123618,
'desc...


In [19]:
try:
    xnd(data)
except Exception as e:
    display(e)

ValueError('dtype mismatch: have {id: int64, name: string, dataTypeName: string, fieldName: string, position: int64, renderTypeName: string, format: {}, flags: 1 * string} and {id: int64, name: string, dataTypeName: string, description: string, fieldName: string, position: int64, renderTypeName: string, tableColumnId: int64, width: int64, cachedContents: {largest: string, non_null: int64, average: string, null: int64, top: 6 * {item: string, count: int64}, smallest: string, sum: string}, format: {precisionStyle: string, noCommas: string, align: string}}')

In [20]:
meta = json_meta_type(data)
xnd_type = get_xnd_type(meta)
new_data = normalize_json(data)
xnd(new_data)

ValueError: dtype mismatch: have {id: int64, name: string, dataTypeName: string, fieldName: string, position: int64, renderTypeName: string, format: {}, flags: 1 * string} and {id: int64, name: string, dataTypeName: string, description: string, fieldName: string, position: int64, renderTypeName: string, tableColumnId: int64, width: int64, cachedContents: {largest: string, non_null: int64, average: string, null: int64, top: 6 * {item: string, count: int64}, smallest: string, sum: string}, format: {precisionStyle: string, noCommas: string, align: string}}

In [None]:
G = gen_graph(new_data)

In [None]:
plt.figure(figsize=(10, 15))
nx.draw(
    G, with_labels=True, node_color="#dfdfdf", font_size=10,  
    pos=nx.spring_layout(G, scale=.5, k=0.5)
)

In [None]:
xnd(new_data, type=xnd_type)