In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt
from fiona.crs import from_epsg

In [2]:
import urllib, json, requests 
import geojson

In [3]:
from ipyleaflet import Map, GeoData, GeoJSON, basemaps, basemap_to_tiles, Icon, Circle, Marker, LayerGroup, WidgetControl
import ipywidgets as widgets
from ipywidgets import Button 
from IPython.display import display, clear_output

In [4]:
#need this to stop numpy from returning truncated arrays 
import sys
np.set_printoptions(threshold=sys.maxsize)
# for automatic linebreaks and multi-line cells.
pd.options.display.max_colwidth = 10000

In [5]:
#drawing basic map
center = (40.7210907,-73.9877836)
basemap = basemap_to_tiles(basemaps.CartoDB.Positron)

m = Map(layers=(basemap, ), center=center, zoom=15, min_zoom = 7, max_zoom = 20)

In [6]:
def extract_location():       
    global gdf, lat, lon
    
    lat = str(markerlocation[0])
    lon = str(markerlocation[1])
    
    df2 = pd.DataFrame(markerlocation)
    df=df2.transpose()
    df.columns=['Latitude','Longitude']

    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude), crs='epsg:4326')
    
    return gdf

In [7]:
draggable=False
marker_opacity=1
icon = Icon(icon_url='icon.png', icon_size=[15, 15])

marker = Marker(location=center, draggable=draggable, icon=icon, opacity=marker_opacity)

markerlocation = marker.location 

layer_group = LayerGroup(layers=(marker, ))
m.add_layer(layer_group)
   
def update_marker(**kwargs):
    
    if kwargs.get('type') == 'click':
        layer_group.clear_layers();
        
        marker = Marker(location=kwargs.get('coordinates'), draggable=draggable, icon=icon, opacity=marker_opacity, options=['rise_on_hover'])  
        
        global markerlocation
        markerlocation = marker.location 
        
        layer_group.add_layer(marker)
    
        draw_update_buffer(**kwargs)
    
m.on_interaction(update_marker)

In [8]:
def draw_update_buffer(**kwargs):     
    m.on_interaction(update_marker)
    extract_location()
    
    global half_mi
    half_mi=gdf.copy()
    half_mi['geometry'] = half_mi.geometry.buffer(.004,  cap_style=1, join_style=1)

    map_extent = gdf.copy()
    map_extent['geometry'] = map_extent.buffer(1,  cap_style=1, join_style=1)

    diff = gpd.overlay(map_extent, half_mi, how='difference')
    
    half_mi_difference = GeoData(geo_dataframe = diff,
                       style={'color': "black", \
                              'fillColor': "#000000", \
                              'fillOpacity': .2, \
                              'opacity': 1, \
                              'weight': 2},
                       name = "Test", crs='epsg:4326')

    layer_group.add_layer(half_mi_difference) 

In [9]:
# m

In [10]:
def import_censustracts():
    draw_update_buffer()
    extract_location()
    
    bounding_box = half_mi.envelope
    df = gpd.GeoDataFrame(gpd.GeoSeries(bounding_box), columns=['geometry'])
    minx, miny, maxx, maxy = df.geometry.total_bounds
    bounds = minx, miny, maxx, maxy

    # census tracts link
    endpoint = 'https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/Tracts_Blocks/MapServer/4/query'
    s = requests.session()
    s.params = {
        'geometry': str(bounds),
        'geometryType': 'esriGeometryEnvelope',
        'inSR': 4326,
        'spatialRel': 'esriSpatialRelIntersects',
        'outFields': 'GEOID,STATE,COUNTY,TRACT,NAME,STGEOMETRY,OBJECTID',
        'returnGeometry': True,
        'f': 'geojson',        
    }
    start = 0
    done = False
    features = []
    crs = None
    while not done:
        r = s.get(endpoint, params={
            'resultOffset': start,
            'resultRecordCount': 32,
        })
        censusgeo = geojson.loads(r.text)
        newfeats = censusgeo.__geo_interface__['features']
        if newfeats:
            features.extend(newfeats)
            crs=censusgeo.__geo_interface__['crs']
            start += len(newfeats)
            print("Received", len(newfeats), "entries,", start, "total")
        else:
            done = True
    
    global tracts
    tracts = gpd.GeoDataFrame.from_features(features, crs=crs)
    return tracts

import_censustracts()

Received 12 entries, 12 total
Received 1 entries, 13 total


Unnamed: 0,geometry,GEOID,STATE,COUNTY,TRACT,NAME,OBJECTID
0,"POLYGON ((-73.98986 40.72053, -73.98962 40.72046, -73.98911 40.72031, -73.98835 40.72008, -73.98754 40.71983, -73.98675 40.71959, -73.98591 40.71934, -73.98507 40.71908, -73.98448 40.72024, -73.98382 40.72147, -73.98468 40.72173, -73.98552 40.72198, -73.98628 40.72224, -73.98714 40.72247, -73.98786 40.72270, -73.98864 40.72293, -73.98923 40.72168, -73.98986 40.72053))",36061003001,36,61,3001,Census Tract 30.01,10843
1,"POLYGON ((-73.99326 40.72235, -73.99352 40.72163, -73.99273 40.72140, -73.99224 40.72125, -73.99151 40.72103, -73.99063 40.72077, -73.98986 40.72053, -73.98923 40.72168, -73.98864 40.72293, -73.98939 40.72317, -73.99027 40.72343, -73.99102 40.72365, -73.99126 40.72372, -73.99260 40.72414, -73.99262 40.72410, -73.99309 40.72282, -73.99326 40.72235))",36061003601,36,61,3601,Census Tract 36.01,10846
2,"POLYGON ((-73.99155 40.72709, -73.99179 40.72639, -73.98987 40.72558, -73.98846 40.72499, -73.98826 40.72490, -73.98750 40.72458, -73.98705 40.72520, -73.98662 40.72580, -73.98618 40.72641, -73.98575 40.72699, -73.98531 40.72760, -73.98488 40.72819, -73.98724 40.72919, -73.98891 40.72991, -73.98948 40.73013, -73.98965 40.72988, -73.98973 40.72977, -73.98990 40.72956, -73.99036 40.72893, -73.99079 40.72835, -73.99108 40.72800, -73.99129 40.72776, -73.99136 40.72756, -73.99155 40.72709))",36061003800,36,61,3800,Census Tract 38,10847
3,"POLYGON ((-73.98788 40.71741, -73.98837 40.71645, -73.98753 40.71620, -73.98672 40.71595, -73.98587 40.71570, -73.98580 40.71568, -73.98501 40.71544, -73.98454 40.71639, -73.98423 40.71696, -73.98417 40.71707, -73.98414 40.71713, -73.98407 40.71726, -73.98404 40.71733, -73.98399 40.71741, -73.98340 40.71857, -73.98423 40.71883, -73.98507 40.71908, -73.98591 40.71934, -73.98675 40.71959, -73.98736 40.71837, -73.98743 40.71821, -73.98751 40.71807, -73.98788 40.71741))",36061001402,36,61,1402,Census Tract 14.02,26519
4,"POLYGON ((-73.98845 40.72328, -73.98864 40.72293, -73.98786 40.72270, -73.98714 40.72247, -73.98628 40.72224, -73.98552 40.72198, -73.98468 40.72173, -73.98382 40.72147, -73.98344 40.72202, -73.98296 40.72267, -73.98529 40.72365, -73.98750 40.72458, -73.98797 40.72394, -73.98845 40.72328))",36061003002,36,61,3002,Census Tract 30.02,40986
5,"POLYGON ((-73.98454 40.71639, -73.98501 40.71544, -73.98426 40.71521, -73.98334 40.71493, -73.98300 40.71484, -73.98245 40.71470, -73.98234 40.71466, -73.98165 40.71446, -73.98092 40.71426, -73.97996 40.71397, -73.97950 40.71489, -73.97922 40.71540, -73.97913 40.71555, -73.97910 40.71561, -73.97902 40.71575, -73.97899 40.71580, -73.97891 40.71595, -73.97909 40.71601, -73.98002 40.71629, -73.98058 40.71646, -73.98144 40.71672, -73.98229 40.71698, -73.98310 40.71718, -73.98248 40.71829, -73.98340 40.71857, -73.98399 40.71741, -73.98404 40.71733, -73.98407 40.71726, -73.98414 40.71713, -73.98417 40.71707, -73.98423 40.71696, -73.98454 40.71639))",36061001200,36,61,1200,Census Tract 12,45321
6,"POLYGON ((-73.99233 40.72491, -73.99260 40.72414, -73.99126 40.72372, -73.99102 40.72365, -73.99027 40.72343, -73.98939 40.72317, -73.98864 40.72293, -73.98845 40.72328, -73.98797 40.72394, -73.98750 40.72458, -73.98826 40.72490, -73.98846 40.72499, -73.98987 40.72558, -73.99179 40.72639, -73.99206 40.72564, -73.99213 40.72545, -73.99221 40.72523, -73.99233 40.72491))",36061003602,36,61,3602,Census Tract 36.02,52962
7,"POLYGON ((-73.98705 40.72520, -73.98750 40.72458, -73.98529 40.72365, -73.98296 40.72267, -73.98252 40.72328, -73.98208 40.72388, -73.98164 40.72449, -73.98121 40.72507, -73.98077 40.72568, -73.98033 40.72628, -73.97991 40.72687, -73.98223 40.72785, -73.98266 40.72726, -73.98488 40.72819, -73.98531 40.72760, -73.98575 40.72699, -73.98618 40.72641, -73.98662 40.72580, -73.98705 40.72520))",36061003200,36,61,3200,Census Tract 32,56775
8,"POLYGON ((-73.99750 40.71407, -73.99744 40.71407, -73.99648 40.71411, -73.99484 40.71421, -73.99466 40.71422, -73.99450 40.71423, -73.99435 40.71425, -73.99414 40.71427, -73.99395 40.71429, -73.99302 40.71436, -73.99285 40.71437, -73.99256 40.71439, -73.99213 40.71442, -73.99123 40.71449, -73.99075 40.71455, -73.99022 40.71440, -73.99022 40.71453, -73.99020 40.71466, -73.98975 40.71555, -73.98915 40.71671, -73.98995 40.71697, -73.99070 40.71720, -73.99118 40.71734, -73.99142 40.71742, -73.99222 40.71766, -73.99309 40.71792, -73.99383 40.71814, -73.99481 40.71846, -73.99543 40.71728, -73.99606 40.71623, -73.99653 40.71547, -73.99681 40.71504, -73.99709 40.71462, -73.99750 40.71407))",36061001600,36,61,1600,Census Tract 16,71680
9,"POLYGON ((-73.99442 40.71939, -73.99481 40.71846, -73.99383 40.71814, -73.99309 40.71792, -73.99222 40.71766, -73.99142 40.71742, -73.99118 40.71734, -73.99070 40.71720, -73.98995 40.71697, -73.98915 40.71671, -73.98837 40.71645, -73.98788 40.71741, -73.98751 40.71807, -73.98743 40.71821, -73.98736 40.71837, -73.98675 40.71959, -73.98754 40.71983, -73.98835 40.72008, -73.98911 40.72031, -73.98962 40.72046, -73.98986 40.72053, -73.99063 40.72077, -73.99151 40.72103, -73.99224 40.72125, -73.99273 40.72140, -73.99352 40.72163, -73.99367 40.72126, -73.99380 40.72094, -73.99403 40.72032, -73.99438 40.71952, -73.99442 40.71939))",36061001800,36,61,1800,Census Tract 18,71681


In [11]:
def download_acs():  
    state = tracts["STATE"].unique().tolist()
    state = ', '.join(map(str, state)).replace(" ", "")

    tract = tracts["TRACT"].unique().tolist()
    tract = ', '.join(map(str, tract)).replace(" ", "") 

    county = tracts["COUNTY"].unique().tolist()
    county = ', '.join(map(str, county)).replace(" ", "") 

    api_key = '9330dc4bf086a84f19fb412bb15f232507301de6'
    acs_url = f'https://api.census.gov/data/2018/acs/acs5/subject/'
    
    global acs_variables
    acs_variables_initial = 'S1603_C02_002E,S1603_C02_003E,S1603_C02_004E,S1603_C04_002E,S1603_C04_003E,S1603_C04_004E,S1601_C01_005E,S1601_C01_006E,S1601_C01_007E,S1601_C01_009E,S1601_C01_010E,S1601_C01_011E,S1601_C01_013E,S1601_C01_014E,S1601_C01_015E,S1601_C01_017E,S1601_C01_018E,S1601_C01_019E,S1901_C01_002E,S1901_C01_003E,S1901_C01_004E,S1901_C01_005E,S1901_C01_006E,S1901_C01_007E,S1901_C01_008E,S1901_C01_009E,S1901_C01_010E,S1901_C01_011E,S1901_C04_002E,S1901_C04_003E,S1901_C04_004E,S1901_C04_005E,S1901_C04_006E,S1901_C04_007E,S1901_C04_008E,S1901_C04_009E,S1901_C04_010E,S1901_C04_011E'
    acs_variables_additional = 'S1501_C01_002E,S1501_C01_004E,S1501_C01_003E,S1501_C01_005E,S1501_C01_017E,S1501_C01_018E,S1501_C01_020E,S1501_C01_021E,S1501_C01_023E,S1501_C01_024E,S1501_C01_025E,S1501_C01_026E,S1501_C03_002E,S1501_C03_003E,S1501_C03_004E,S1501_C03_005E,S1501_C03_017E,S1501_C03_018E,S1501_C03_020E,S1501_C03_021E,S1501_C03_023E,S1501_C03_024E,S1501_C03_026E,S1501_C03_027E,S1501_C05_002E,S1501_C05_003E,S1501_C05_004E,S1501_C05_005E,S1501_C05_017E,S1501_C05_018E,S1501_C05_020E,S1501_C05_021E,S1501_C05_023E,S1501_C05_024E,S1501_C05_026E,S1501_C05_027E,S1401_C01_030E,S1401_C01_032E,S1401_C01_034E,S1101_C01_003E,S1101_C05_001E'
    acs_variables = acs_variables_initial + "," + acs_variables_additional
    
    get_acs_initial = f'{acs_url}?&get={acs_variables_initial}&for=tract:{tract}&in=state:{state}%20county:{county}&key={api_key}'
    get_acs_additional = f'{acs_url}?&get={acs_variables_additional}&for=tract:{tract}&in=state:{state}%20county:{county}&key={api_key}'

    data_acs_initial=requests.get(get_acs_initial).json()
    data_acs_additional=requests.get(get_acs_additional).json()
    
    global acs
    acs_initial=pd.DataFrame(data_acs_initial[1:], columns=data_acs_initial[0])
    acs_additional=pd.DataFrame(data_acs_additional[1:], columns=data_acs_additional[0])

    acs=pd.merge(acs_initial, acs_additional, on='tract', how='left')

download_acs()

In [12]:
#any null rows?
# test = acs.columns[acs.isnull().any()]
# test

In [13]:
# acs = acs.drop(['S0901_C01_033E', 'S0901_C01_034E', 'S0902_C01_003E', 'S0902_C01_004E','S0902_C01_005E'], axis=1)

In [14]:
def clean_combine_census_and_geographic_data():
    import_censustracts()
    download_acs()
    
    global acs_site_sum, acs_site
    tracts["area"]=tracts.area
    acs_tracts = pd.merge(tracts, acs, left_on='TRACT', right_on='tract', how='left')
    
    acs_site = gpd.overlay(half_mi, acs_tracts, how='intersection')
    acs_site["area_clipped"]=acs_site.area 
    acs_site["ratio"] = acs_site["area_clipped"]/acs_site["area"]
    
    cols = acs_variables.split(",")
    acs_site[cols] = acs_site[cols].apply(pd.to_numeric, errors='coerce', axis=1)
    
    # if 'area_clipped' not in cols:
    #     cols.append("area_clipped")
    
    temp_df = acs_site[cols]    
    temp_df = temp_df.mul(acs_site.ratio, 0)
    acs_site.update(temp_df)

    acs_site_sum = pd.DataFrame(acs_site[cols].sum())

    acs_site_sum.reset_index(inplace=True)
    acs_site_sum.columns = ['variables', 'sum_in_area']
    
clean_combine_census_and_geographic_data()

Received 12 entries, 12 total
Received 1 entries, 13 total


In [15]:
data_dict = pd.read_csv("data-dictionary.csv")
data_dict

Unnamed: 0,sex,age_group,variable_group,variables,variable_name,ages
0,Male Female Both,5 to 17 years,Language Spoken At Home,S1603_C02_002E,Speak Only English at Home,"5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17"
1,Male Female Both,18 to 64 years,Language Spoken At Home,S1603_C02_003E,Speak Only English at Home,"18, 19, 20, 21, 22, 23, 24, 25,26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64"
2,Male Female Both,65 years and over,Language Spoken At Home,S1603_C02_004E,Speak Only English at Home,"64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100"
3,Male Female Both,5 to 17 years,Language Spoken At Home,S1601_C01_005E,Spanish,"5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17"
4,Male Female Both,18 to 64 years,Language Spoken At Home,S1601_C01_006E,Spanish,"18, 19, 20, 21, 22, 23, 24, 25,26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64"
...,...,...,...,...,...,...
71,Both,18 to 24 years,School Enrollment,S1401_C01_030E,Enrolled in college or graduate school,"18, 19, 20, 21, 22, 23, 24"
72,Male,18 to 24 years,School Enrollment,S1401_C01_032E,Enrolled in college or graduate school,"18, 19, 20, 21, 22, 23, 24"
73,Female,18 to 24 years,School Enrollment,S1401_C01_034E,Enrolled in college or graduate school,"18, 19, 20, 21, 22, 23, 24"
74,Male Female Both,all ages,Households and Families,S1101_C01_003E,Family households,"5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99"


In [16]:
ALL = 'ALL'
def user_options_sorted_values_plus_ALL(array):
    unique = array.unique().tolist()
    unique.sort()
    unique.insert(0, ALL)
    unique.remove('Both')
    unique.remove('Male Female Both')
    return unique
# 
user_options_sorted_values_plus_ALL(data_dict.sex)

['ALL', 'Female', 'Male']

In [17]:
def user_options_sorted_values(array):
    unique = array.unique().tolist()
    unique.sort()
    return unique

In [18]:
output = widgets.Output()

In [19]:
data_output = widgets.Output()

In [20]:
def user_selection():
    
#     output.clear_output()
#     data_output.clear_output() 
    
    global selected_age, selected_gender, selected_percentile, text_generation_button, selection_filter, variable_inputs
#     selected_age = widgets.Dropdown(options = user_options_sorted_values(data_dict.age_group),\
#                                     value = "5 to 17 Years")
    selected_age = widgets.BoundedIntText(min=5, max=99, value=25, step=1, description='Age:')
    selected_gender = widgets.ToggleButtons(options = user_options_sorted_values_plus_ALL(data_dict.sex),\
                                            value = "Male",\
                                            description='Sex:', \
                                            disabled=False, button_style='', )
    selected_percentile = widgets.IntSlider(min=0, max=100, step=10, value=50, description='Percentile:',)
    text_generation_button = Button(description="Generate Text")

    #     display(selected_age, selected_gender, selected_percentile, output)
        
    selected_age_str= str(selected_age.value)
  
    if (selected_gender.value == 'ALL'):
        selection_filter = data_dict[(data_dict.ages.str.contains(selected_age_str)) & \
                              (data_dict.sex.str.contains('Both'))]
        
    else:     
        selection_filter = data_dict[(data_dict.ages.str.contains(selected_age_str)) & \
                              (data_dict.sex.str.contains(selected_gender.value))]
#                              | (data_dict.sex == "Both"))] 
#     with data_output:
#             display(selection_filter)
    
    def selection_filtering(age_group, sex):
#         output.clear_output()
#         data_output.clear_output()
                
        selected_age_str= str(selected_age.value)
        
        if (selected_gender.value == 'ALL'):
            selection_filter = data_dict[(data_dict.ages.str.contains(selected_age_str)) & \
                              (data_dict.sex.str.contains('Both'))]
        
        else:     
            selection_filter = data_dict[(data_dict.ages.str.contains(selected_age_str)) & \
                              (data_dict.sex.str.contains(selected_gender.value))]
#                              | (data_dict.sex == "Both"))] 
    
        
#         with data_output:
#             display(selection_filter)

    def selected_age_eventhandler(change):
        selection_filtering(change.new, selected_age.value)
    def selected_gender_eventhandler(change):
        selection_filtering(selected_gender.value, change.new)
    def selected_percentile_eventhandler(change):
        selection_filtering(selected_percentile.value, change.new)

    selected_age.observe(selected_age_eventhandler, names='value')
    selected_gender.observe(selected_gender_eventhandler, names='value')
    selected_percentile.observe(selected_percentile_eventhandler, names='value')

    list_of_variable_inputs = selection_filter["variables"].values[0:]
    variable_inputs = ', '.join(list_of_variable_inputs).replace(" ", "")
    variable_inputs = variable_inputs.split(',')
    
user_selection()

In [21]:
def get_demographics_for_selection():
# calculate percentile_input_per_variable
    clean_combine_census_and_geographic_data()
#     user_selection()
    
    global percentile_input, data

    data = pd.merge(acs_site_sum.loc[acs_site_sum['variables'].isin(variable_inputs)], \
                   selection_filter, how="outer", on="variables")    
    data["sum_in_area"] = data["sum_in_area"].astype(int)
    data.sort_values("sum_in_area", axis = 0, ascending = True, inplace = True)

    percentile_input = selected_percentile.value / 100
    
# split these up into the diff bins for different types of variable groups 
    global language, education, school_enrollment, family_household_income, nonfamily_household_income, household_type

    for item,i in enumerate(data):       
        language = data[(data["variable_group"].str.contains('Language'))]
        education = data[(data["variable_group"].str.contains('Educational Attainment'))]
        school_enrollment = data[(data["variable_group"].str.contains('School'))]
        family_household_income = data[(data["variable_group"].str.contains('Family'))]
        nonfamily_household_income = data[(data["variable_group"].str.contains('Nonfamily'))]
        household_type = data[(data["variable_group"].str.contains('Households'))]
#       travel_time_to_work = data[(data["variable_group"].str.contains('Travel Time'))].sort_values(by='sum_in_area')
        # means_of_transportation = data[(data["variable_group"].str.contains('Means of Transportation'))].sort_values(by='sum_in_area')
        
#Calculate individual percentile values
        global sum_for_percentile_language,sum_for_percentile_education,sum_for_percentile_school_enrollment,\
                sum_for_percentile_family_household_income,\
                sum_for_percentile_nonfamily_household_income, sum_for_percentile_household_type
        
        sum_for_percentile_language = language.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
        sum_for_percentile_language = sum_for_percentile_language.replace(sum_for_percentile_language, \
                                language.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str))
        
        sum_for_percentile_education = education.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
        sum_for_percentile_education = sum_for_percentile_education.replace(sum_for_percentile_education, \
                                education.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str))
        
        sum_for_percentile_school_enrollment = school_enrollment.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)    
        sum_for_percentile_school_enrollment = sum_for_percentile_school_enrollment.replace(sum_for_percentile_school_enrollment, \
                                school_enrollment.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str))
        
        sum_for_percentile_family_household_income = family_household_income.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
        sum_for_percentile_family_household_income = sum_for_percentile_family_household_income.replace(sum_for_percentile_family_household_income, \
                                family_household_income.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str))
        
        sum_for_percentile_nonfamily_household_income = nonfamily_household_income.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
        sum_for_percentile_nonfamily_household_income = sum_for_percentile_nonfamily_household_income.replace(sum_for_percentile_nonfamily_household_income, \
                                nonfamily_household_income.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str))        
        
        sum_for_percentile_household_type = household_type.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
        sum_for_percentile_household_type = sum_for_percentile_household_type.replace(sum_for_percentile_household_type, \
                                household_type.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str))
        
get_demographics_for_selection()

Received 12 entries, 12 total
Received 1 entries, 13 total


In [22]:
def parse_tables_for_percentile_value():
#generating new transposed table with only the two fields needed : variables and sum in area.
#using other variables makes transposition weird
    get_demographics_for_selection()
    
    global household_type_transposed, language_transposed, education_transposed, family_household_income_transposed, nonfamily_household_income_transposed,household_type_transposed, school_enrollment_transposed

    language_transposed = language.filter(["variables", "sum_in_area"]).T
    language_transposed.columns = language_transposed.iloc[0]
    language_transposed = language_transposed[1:]

    education_transposed = education.filter(["variables", "sum_in_area"]).T
    education_transposed.columns = education_transposed.iloc[0]
    education_transposed = education_transposed[1:]

    household_type_transposed = household_type.filter(["variables", "sum_in_area"]).T
    household_type_transposed.columns = household_type_transposed.iloc[0]
    household_type_transposed = household_type_transposed[1:]
    
    family_household_income_transposed = family_household_income.filter(["variables", "sum_in_area"]).T
    family_household_income_transposed.columns = family_household_income_transposed.iloc[0]
    family_household_income_transposed = family_household_income_transposed[1:]    
    
    nonfamily_household_income_transposed = nonfamily_household_income.filter(["variables", "sum_in_area"]).T
    nonfamily_household_income_transposed.columns = nonfamily_household_income_transposed.iloc[0]
    nonfamily_household_income_transposed = nonfamily_household_income_transposed[1:]  
    
    school_enrollment_transposed = school_enrollment.filter(["variables", "sum_in_area"]).T
    school_enrollment_transposed.columns = school_enrollment_transposed.iloc[0]
    school_enrollment_transposed = school_enrollment_transposed[1:]    

parse_tables_for_percentile_value()

Received 12 entries, 12 total
Received 1 entries, 13 total


In [23]:
def get_range_for_each_variable():
    parse_tables_for_percentile_value()
    
    global range_table, range_table_all, ranges, first_range, other_ranges
    transposed = [education_transposed, family_household_income_transposed, household_type_transposed, \
            language_transposed, nonfamily_household_income_transposed, school_enrollment_transposed]
    data.sort_values(by=['variable_group', 'sum_in_area'], ascending=[True, True], inplace=True)
    data_sorted = data.reset_index()
    
    ranges=[]
#     ranges.empty
    
    for df in transposed:
        for item, i in enumerate(df.columns):
            if item == 0:
                first_range = np.arange(df.max()[item]+1).astype(int)
                ranges.append([first_range])
            else:
                other_ranges = np.arange(df.min()[item-1]+1, \
                                       df.max()[item]+1).astype(int)
                ranges.append([other_ranges])

            range_table = pd.DataFrame(data=ranges, index=None, columns=["range_per_variable"])
            range_table = range_table.reset_index(drop=True)

    range_table_all = pd.merge(range_table, data_sorted, left_index=True, right_index=True, on=None)
    range_table_all["range_per_variable"] = range_table_all["range_per_variable"].astype(str)

get_range_for_each_variable()

Received 12 entries, 12 total
Received 1 entries, 13 total


In [24]:
# result_df
# result_df['range_per_variable'].replace(r'\s+|\\n', ' ', regex=True, inplace=True) 
# result_df

In [25]:
def generate_info_for_text():
    get_range_for_each_variable()
    
    global descriptors, result_df
    descriptors = []
    result_df = pd.DataFrame(columns=None)
    for i in range_table_all['range_per_variable']:
        if '\n' in range_table_all:
            range_table_all['range_per_variable'].replace(r'\s+|\\n', ' ', regex=True, inplace=True) 
            
    sum_for_percentile_language = language.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
    sum_for_percentile_education = education.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
    sum_for_percentile_school_enrollment = school_enrollment.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)    
    sum_for_percentile_family_household_income = family_household_income.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
    sum_for_percentile_nonfamily_household_income = nonfamily_household_income.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
    sum_for_percentile_household_type = household_type.sum_in_area.quantile(selected_percentile.value / 100).astype(int).astype(str)
    
    for item,i in enumerate(range_table_all.index):
        if sum_for_percentile_education.astype(int) > 0 :
            education_only = range_table_all[(range_table_all["variable_group"].str.contains('Educational'))]
            result = education_only[education_only["range_per_variable"].str.contains(sum_for_percentile_education)]
            result_df = result_df.append(result, ignore_index = True)
    # #         result_variable = result["variable_name"].values[0]
            descriptors.append(result["variable_name"].values[0])

        if sum_for_percentile_language.astype(int) > 0 :
            language_only = range_table_all[(range_table_all["variable_group"].str.contains('Language'))]
            result = language_only[language_only["range_per_variable"].str.contains(sum_for_percentile_language)]
            descriptors.append(result["variable_name"].values[0])
            result_df = result_df.append(result, ignore_index = True)

        if sum_for_percentile_household_type.astype(int) > 0:
            household_type_only = range_table_all[(range_table_all["variable_group"].str.contains('Households'))]
            result = household_type_only[household_type_only["range_per_variable"].str.contains(sum_for_percentile_household_type)]
            descriptors.append(result["variable_name"].values[0])
            result_df = result_df.append(result, ignore_index = True)

        if sum_for_percentile_family_household_income.astype(int) > 0:
            family_household_income_only = range_table_all[(range_table_all["variable_group"].str.contains('Family'))]
            result = family_household_income_only[family_household_income_only["range_per_variable"].str.contains(sum_for_percentile_family_household_income)]
            descriptors.append(result["variable_name"].values[0])
            result_df = result_df.append(result, ignore_index = True)

        if sum_for_percentile_nonfamily_household_income.astype(int) > 0:
            nonfamily_household_income_only = range_table_all[(range_table_all["variable_group"].str.contains('Nonfamily'))]
            result = nonfamily_household_income_only[nonfamily_household_income_only["range_per_variable"].str.contains(sum_for_percentile_nonfamily_household_income)]
            descriptors.append(result["variable_name"].values[0])
            result_df = result_df.append(result, ignore_index = True)

        if sum_for_percentile_school_enrollment.astype(int) > 0:
            school_enrollment_only = range_table_all[(range_table_all["variable_group"].str.contains('School'))]
            result = school_enrollment_only[school_enrollment_only["range_per_variable"].str.contains(sum_for_percentile_school_enrollment)]
            descriptors.result_df(result["variable_name"].values[0])
            result_df = result_df.append(result, ignore_index = True)

    descriptors = np.unique(descriptors)
    result_df = result_df.drop_duplicates()

generate_info_for_text()

Received 12 entries, 12 total
Received 1 entries, 13 total


In [67]:
def construct_narrative():
    generate_info_for_text()
    
    global result_df, resident_text, percentile_text, income_range, gender_text, subject_text, age_text,\
            household_type_text, income_text, school_text, language_text, education_text
    resident_text = []
    percentile_text = []
    income_range = []
    gender_text = []
    
    
    percentile_text = " is representative of the top "+ str(selected_percentile.value) + "% of this area's residents. "

    if (selected_gender.value == "Female"):
        gender_text = "she"
        subject_text = "woman"
    if (selected_age.value >= 5) & (selected_age.value <= 17):
            subject_text = "girl"
    elif (selected_gender.value == "Male"):
        gender_text = "he"
        subject_text = "man"
        if (selected_age.value >= 5) & (selected_age.value <= 17):
            subject_text = "boy"
    elif (selected_gender.value == "ALL"):
        gender_text = "they"
        subject_text = "person"


    for i in result_df['variable_name']:
        if 'Nonfamily Household Income In ' in i:
            household_type_text = ' household consisting of non-family members'
            result_df=result_df.loc[~result_df["variable_name"].str.contains('Family Household Income')]
        elif 'Family Household Income In ' in i:
            result_df=result_df.loc[~result_df["variable_name"].str.contains('Nonfamily Household Income')]
            household_type_text = ' household consisting of family members'
        else: 
            household_type_text=' household '

        if '10,999' or '14,999' in i:
            income_text = " lower income"
        if '24,999' or '34,999' or '49,999' in i:
            income_text = " middle income"
        if '74,999' or '94,999' or '149,999' in i:
            income_text = " wealthy"
        if '199,999' or '200,000' in i:
            income_text = " very wealthy"

        income = result_df.loc[result_df["variable_name"].str.contains('Income')]
        income_range = income["variable_name"].str.split('$').str[1]


        if 'Enrolled in college or graduate school' in i:
            school_text = gender_text.capitalize() + " is also a college or graduate school student."
        else:
            school_text = ''

        if 'English' in i:
            language_text = gender_text.capitalize() + " speaks only English at home. "
        if 'Spanish' in i:
            language_text = "In addition to English, " + gender_text + " speaks Spanish at home. "
        if 'Indo-European' in i:
            language_text = "In addition to English, " + gender_text + " speaks an Indo-European language at home. "            
        if 'Asian' in i:
            language_text = "In addition to English, " + gender_text + " speaks an Asian or Pacific Island language at home. "            
        if 'Other languages' in i:
            language_text = "In addition to English, " + gender_text + " speaks other languages at home. "

        if 'Less than high school graduate' in i:
            education_text = gender_text.capitalize() + ' does not have a high school degree.'
        if 'High school graduate (includes equivalency)' in i:
            education_text = gender_text.capitalize() + ' is a high school graduate.'
        if 'Some college' in i:
            education_text = gender_text.capitalize() + ' has attended some form of college but does not have a degree.'
        if 'Bachelor' in i:
            education_text = gender_text.capitalize() + " is well-educated and has at least a bachelor's degree."
        else:
            education_text = ''

    if (selected_age.value >= 5) & (selected_age.value <= 12):
        age_text = ''
        age_text = age_text.replace(age_text, age_text)
    elif (selected_age.value >= 13 ) & (selected_age.value <= 17):
        age_text = 'teenage'
        age_text = age_text.replace(age_text, age_text)
    elif (selected_age.value >= 18 ) & (selected_age.value <= 34):
        age_text = 'young'        
        age_text = age_text.replace(age_text, age_text)
    elif (selected_age.value >= 35 ) & (selected_age.value <= 64):
        age_text = 'middle-aged'        
        age_text = age_text.replace(age_text, age_text)
    elif (selected_age.value >= 65 ):
        age_text = 'senior'        
        age_text = age_text.replace(age_text, age_text)
    
    resident_text = "This " + age_text + " " + subject_text + percentile_text + \
                    gender_text.capitalize() + " lives in a" + income_text + household_type_text + \
                    "with an income in the $" + income_range + " range. " + \
                    language_text + \
                    education_text + \
                    school_text 

    with output:
        display(resident_text.values[0])

construct_narrative()

Received 9 entries, 9 total
Received 1 entries, 10 total


In [68]:
age_text

'young'

In [69]:
text_generation_button = Button(description="Generate Text")

In [70]:
#TEXT GENERATION
def text_generation(b):
    output.clear_output()
    data_output.clear_output()
#     selection_filtering(age_group, sex)
    user_selection()
#     get_demographics_for_selection()
#     parse_tables_for_percentile_value()
#     get_range_for_each_variable()
#     generate_info_for_text()
    
#     construct_narrative()    
#     get_data()
    display_dashboard()

# text_generation_button.on_click(text_generation)

In [71]:
# def get_data():
# # user_selection()
#     construct_narrative() 
#     global item_layout, explore_data
#     output.clear_output()
#     data_output.clear_output()

#     item_layout = widgets.Layout(margin='0 0 10px 0')

#     explore_data = range_table_all.filter(['sum_in_area', 'sex',\
#        'age_group', 'variable_group', 'variable_name'])
#     explore_data['sum_in_area'] = explore_data['sum_in_area'].astype(int)

In [77]:
def display_dashboard():
#     user_selection()
    output.clear_output()
    data_output.clear_output()
    
    item_layout = widgets.Layout(margin='0 0 10px 0')
    
    explore_data = range_table_all.filter(['sum_in_area', 'sex',\
       'age_group', 'variable_group', 'variable_name'])
    explore_data['sum_in_area'] = explore_data['sum_in_area'].astype(int)
    
    with output:
        display(resident_text.values[0])
    with data_output:
        display(explore_data)
        
    input_widgets = widgets.VBox(
        [selected_age, selected_gender, selected_percentile, text_generation_button],
        layout=item_layout)
    
    tab = widgets.Tab([output, data_output],
        layout=item_layout)
    tab.set_title(0, 'Narrative')
    tab.set_title(1, 'Dataset Exploration')
    
    dashboard = widgets.VBox([input_widgets, tab])
    display(dashboard)

display_dashboard()
m

VBox(children=(VBox(children=(BoundedIntText(value=25, description='Age:', max=99, min=5), ToggleButtons(descr…

Map(center=[40.73710578427421, -74.00188493571478], controls=(ZoomControl(options=['position', 'zoom_in_text',…

In [78]:
# text_generation_button.on_click(text_generation)

# input_widgets = widgets.VBox(
#     [selected_age, selected_gender, selected_percentile, text_generation_button],
#     layout=item_layout)

# tab = widgets.Tab([output, data_output],
#     layout=item_layout)
# tab.set_title(0, 'Narrative')
# tab.set_title(1, 'Dataset Exploration')

# dashboard = widgets.VBox([input_widgets, tab])

# display(dashboard)
# m

In [74]:
selected_percentile.value

90

In [79]:
resident_text.values

array(["This young man is representative of the top 50% of this area's residents. He lives in a very wealthy household with an income in the $50,000 to 74,999 range. He speaks only English at home. ",
       "This young man is representative of the top 50% of this area's residents. He lives in a very wealthy household with an income in the $25,000 to 34,999 range. He speaks only English at home. "],
      dtype=object)

In [80]:
resident_text.values[0]

"This young man is representative of the top 50% of this area's residents. He lives in a very wealthy household with an income in the $50,000 to 74,999 range. He speaks only English at home. "

In [81]:
selected_age.value

25

In [42]:
percentile_text = []
percentile_text = " is representative of the top "+ str(selected_percentile.value) + "% of this area's residents. "
percentile_text

" is representative of the top 80% of this area's residents. "

In [37]:
# widget_control = WidgetControl(widget=text_generation_button, position='topright')
# m.add_control(widget_control)

# m