In [2]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt
from fiona.crs import from_epsg

In [3]:
import urllib, json, requests 
import geojson

In [4]:
from ipyleaflet import Map, GeoData, GeoJSON, basemaps, basemap_to_tiles, Icon, Circle, Marker, LayerGroup, WidgetControl
import ipywidgets as widgets
from ipywidgets import Button 
from IPython.display import display, clear_output

In [5]:
#need this to stop numpy from returning truncated arrays 
import sys
np.set_printoptions(threshold=sys.maxsize)

In [6]:
#drawing basic map
center = (40.7210907,-73.9877836)
basemap = basemap_to_tiles(basemaps.CartoDB.Positron)

m = Map(layers=(basemap, ), center=center, zoom=15, min_zoom = 7, max_zoom = 20)

In [7]:
def extract_location():       
    global gdf, lat, lon
    
    lat = str(markerlocation[0])
    lon = str(markerlocation[1])
    
    df2 = pd.DataFrame(markerlocation)
    df=df2.transpose()
    df.columns=['Latitude','Longitude']

    gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(df.Longitude, df.Latitude), crs='epsg:4326')
    
    return gdf

In [8]:
draggable=False
marker_opacity=1
icon = Icon(icon_url='icon.png', icon_size=[15, 15])

marker = Marker(location=center, draggable=draggable, icon=icon, opacity=marker_opacity)

markerlocation = marker.location 

layer_group = LayerGroup(layers=(marker, ))
m.add_layer(layer_group)
   
def update_marker(**kwargs):
    
    if kwargs.get('type') == 'click':
        layer_group.clear_layers();
        
        marker = Marker(location=kwargs.get('coordinates'), draggable=draggable, icon=icon, opacity=marker_opacity, options=['rise_on_hover'])  
        
        global markerlocation
        markerlocation = marker.location 
        
        layer_group.add_layer(marker)
    
        draw_update_buffer(**kwargs)
    
m.on_interaction(update_marker)

In [9]:
def draw_update_buffer(**kwargs):     
    m.on_interaction(update_marker)
    extract_location()
    
    global half_mi
    half_mi=gdf.copy()
    half_mi['geometry'] = half_mi.geometry.buffer(.004,  cap_style=1, join_style=1)

    map_extent = gdf.copy()
    map_extent['geometry'] = map_extent.buffer(1,  cap_style=1, join_style=1)

    diff = gpd.overlay(map_extent, half_mi, how='difference')
    
    half_mi_difference = GeoData(geo_dataframe = diff,
                       style={'color': "black", \
                              'fillColor': "#000000", \
                              'fillOpacity': .2, \
                              'opacity': 1, \
                              'weight': 2},
                       name = "Test", crs='epsg:4326')

    layer_group.add_layer(half_mi_difference) 

In [10]:
# m

In [11]:
def import_censustracts():
    draw_update_buffer()
    
    bounding_box = half_mi.envelope
    df = gpd.GeoDataFrame(gpd.GeoSeries(bounding_box), columns=['geometry'])
    minx, miny, maxx, maxy = df.geometry.total_bounds
    bounds = minx, miny, maxx, maxy

    # census tracts link
    endpoint = 'https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/Tracts_Blocks/MapServer/4/query'
    s = requests.session()
    s.params = {
        'geometry': str(bounds),
        'geometryType': 'esriGeometryEnvelope',
        'inSR': 4326,
        'spatialRel': 'esriSpatialRelIntersects',
        'outFields': 'GEOID,STATE,COUNTY,TRACT,NAME,STGEOMETRY,OBJECTID',
        'returnGeometry': True,
        'f': 'geojson',        
    }
    start = 0
    done = False
    features = []
    crs = None
    while not done:
        r = s.get(endpoint, params={
            'resultOffset': start,
            'resultRecordCount': 32,
        })
        censusgeo = geojson.loads(r.text)
        newfeats = censusgeo.__geo_interface__['features']
        if newfeats:
            features.extend(newfeats)
            crs=censusgeo.__geo_interface__['crs']
            start += len(newfeats)
            print("Received", len(newfeats), "entries,", start, "total")
        else:
            done = True
    
    global tracts
    tracts = gpd.GeoDataFrame.from_features(features, crs=crs)
    return tracts

import_censustracts()

Received 12 entries, 12 total
Received 1 entries, 13 total


Unnamed: 0,geometry,GEOID,STATE,COUNTY,TRACT,NAME,OBJECTID
0,"POLYGON ((-73.98986 40.72053, -73.98962 40.720...",36061003001,36,61,3001,Census Tract 30.01,10843
1,"POLYGON ((-73.99326 40.72235, -73.99352 40.721...",36061003601,36,61,3601,Census Tract 36.01,10846
2,"POLYGON ((-73.99155 40.72709, -73.99179 40.726...",36061003800,36,61,3800,Census Tract 38,10847
3,"POLYGON ((-73.98788 40.71741, -73.98837 40.716...",36061001402,36,61,1402,Census Tract 14.02,26519
4,"POLYGON ((-73.98845 40.72328, -73.98864 40.722...",36061003002,36,61,3002,Census Tract 30.02,40986
5,"POLYGON ((-73.98454 40.71639, -73.98501 40.715...",36061001200,36,61,1200,Census Tract 12,45321
6,"POLYGON ((-73.99233 40.72491, -73.99260 40.724...",36061003602,36,61,3602,Census Tract 36.02,52962
7,"POLYGON ((-73.98705 40.72520, -73.98750 40.724...",36061003200,36,61,3200,Census Tract 32,56775
8,"POLYGON ((-73.99750 40.71407, -73.99744 40.714...",36061001600,36,61,1600,Census Tract 16,71680
9,"POLYGON ((-73.99442 40.71939, -73.99481 40.718...",36061001800,36,61,1800,Census Tract 18,71681


In [258]:
def download_acs():  
    state = tracts["STATE"].unique().tolist()
    state = ', '.join(map(str, state)).replace(" ", "")

    tract = tracts["TRACT"].unique().tolist()
    tract = ', '.join(map(str, tract)).replace(" ", "") 

    county = tracts["COUNTY"].unique().tolist()
    county = ', '.join(map(str, county)).replace(" ", "") 

    api_key = '9330dc4bf086a84f19fb412bb15f232507301de6'
    acs_url = f'https://api.census.gov/data/2018/acs/acs5/subject/'
    
    global acs_variables
    acs_variables_initial = 'S1603_C02_002E,S1603_C02_003E,S1603_C02_004E,S1603_C04_002E,S1603_C04_003E,S1603_C04_004E,S1601_C01_005E,S1601_C01_006E,S1601_C01_007E,S1601_C01_009E,S1601_C01_010E,S1601_C01_011E,S1601_C01_013E,S1601_C01_014E,S1601_C01_015E,S1601_C01_017E,S1601_C01_018E,S1601_C01_019E,S1901_C01_002E,S1901_C01_003E,S1901_C01_004E,S1901_C01_005E,S1901_C01_006E,S1901_C01_007E,S1901_C01_008E,S1901_C01_009E,S1901_C01_010E,S1901_C01_011E,S1901_C04_002E,S1901_C04_003E,S1901_C04_004E,S1901_C04_005E,S1901_C04_006E,S1901_C04_007E,S1901_C04_008E,S1901_C04_009E,S1901_C04_010E,S1901_C04_011E'
    acs_variables_additional = 'S1501_C01_002E,S1501_C01_004E,S1501_C01_003E,S1501_C01_005E,S1501_C01_017E,S1501_C01_018E,S1501_C01_020E,S1501_C01_021E,S1501_C01_023E,S1501_C01_024E,S1501_C01_025E,S1501_C01_026E,S1501_C03_002E,S1501_C03_003E,S1501_C03_004E,S1501_C03_005E,S1501_C03_017E,S1501_C03_018E,S1501_C03_020E,S1501_C03_021E,S1501_C03_023E,S1501_C03_024E,S1501_C03_026E,S1501_C03_027E,S1501_C05_002E,S1501_C05_003E,S1501_C05_004E,S1501_C05_005E,S1501_C05_017E,S1501_C05_018E,S1501_C05_020E,S1501_C05_021E,S1501_C05_023E,S1501_C05_024E,S1501_C05_026E,S1501_C05_027E,S1401_C01_030E,S1401_C01_032E,S1401_C01_034E'
    acs_variables = acs_variables_initial + "," + acs_variables_additional
    
    get_acs_initial = f'{acs_url}?&get={acs_variables_initial}&for=tract:{tract}&in=state:{state}%20county:{county}&key={api_key}'
    get_acs_additional = f'{acs_url}?&get={acs_variables_additional}&for=tract:{tract}&in=state:{state}%20county:{county}&key={api_key}'

    data_acs_initial=requests.get(get_acs_initial).json()
    data_acs_additional=requests.get(get_acs_additional).json()
    
    global acs, acs_initial, acs_additional
    acs_initial=pd.DataFrame(data_acs_initial[1:], columns=data_acs_initial[0])
    acs_additional=pd.DataFrame(data_acs_additional[1:], columns=data_acs_additional[0])

    acs=pd.merge(acs_initial, acs_additional, on='tract', how='left')

download_acs()

In [259]:
#any null rows?
test = acs.columns[acs.isnull().any()]
test

Index([], dtype='object')

In [289]:
def clean_combine_census_and_geographic_data():
    global acs_site_sum, acs_site
    tracts["area"]=tracts.area
    acs_tracts = pd.merge(tracts, acs, left_on='TRACT', right_on='tract', how='left')
    
    acs_site = gpd.overlay(half_mi, acs_tracts, how='intersection')
    acs_site["area_clipped"]=tracts_clipped.area 
    acs_site["ratio"] = acs_site["area_clipped"]/acs_site["area"]
    
    cols = acs_variables.split(",")
    acs_site[cols] = acs_site[cols].apply(pd.to_numeric, errors='coerce', axis=1)
    
    # if 'area_clipped' not in cols:
    #     cols.append("area_clipped")
    
    temp_df = acs_site[cols]    
    temp_df = temp_df.mul(acs_site.ratio, 0)
    acs_site.update(temp_df)

    acs_site_sum = pd.DataFrame(acs_site[cols].sum())

    acs_site_sum.reset_index(inplace=True)
    acs_site_sum.columns = ['variables', 'sum_in_area']
    
clean_combine_census_and_geographic_data()

In [15]:
data_dict = pd.read_csv("data-dictionary.csv")
data_dict

Unnamed: 0,sex,age_group,variable_group,variables,variable_name,variable_description,ages
0,Both,5 to 17 years,Language Spoken At Home,S1603_C02_002E,Speak Only English at Home,,"5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17"
1,Both,18 to 64 years,Language Spoken At Home,S1603_C02_003E,Speak Only English at Home,,"18, 19, 20, 21, 22, 23, 24, 25,26, 27, 28, 29,..."
2,Both,65 years and over,Language Spoken At Home,S1603_C02_004E,Speak Only English at Home,,"64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75..."
3,Both,5 to 17 years,Language Spoken At Home,S1603_C04_002E,Speak a Language other than English at Home,,"5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17"
4,Both,18 to 64 years,Language Spoken At Home,S1603_C04_003E,Speak a Language other than English at Home,,"18, 19, 20, 21, 22, 23, 24, 25,26, 27, 28, 29,..."
...,...,...,...,...,...,...,...
77,Male,18 to 24 years,School Enrollment,S1401_C01_032E,Enrolled in college or graduate school,,"18, 19, 20, 21, 22, 23, 24"
78,Female,18 to 24 years,School Enrollment,S1401_C01_034E,Enrolled in college or graduate school,,"18, 19, 20, 21, 22, 23, 24"
79,Both,15 to 19 years,School Enrollment,S0902_C01_003E,Enrolled in public school,,"15, 16, 17, 18, 19"
80,Both,15 to 19 years,School Enrollment,S0902_C01_004E,Enrolled in private school,,"15, 16, 17, 18, 19"


In [16]:
# only use this if i decide to add in the option to select all
ALL = 'ALL'
def user_options_sorted_values_plus_ALL(array):
    unique = array.unique().tolist()
    unique.sort()
    unique.insert(0, ALL)
    return unique
# 

In [17]:
def user_options_sorted_values(array):
    unique = array.unique().tolist()
    unique.sort()
    return unique

In [18]:
output = widgets.Output()

In [19]:
data_output = widgets.Output()

In [20]:
def user_selection():
    
    output.clear_output()
    data_output.clear_output() 
    
    global selected_age, selected_gender, selected_percentile, text_generation_button, selection_filter, variable_inputs
#     selected_age = widgets.Dropdown(options = user_options_sorted_values(data_dict.age_group),\
#                                     value = "5 to 17 Years")
    selected_age = widgets.BoundedIntText(min=5, max=99, value=25, step=1, description='Age:')
    selected_gender = widgets.ToggleButtons(options = user_options_sorted_values(data_dict.sex),\
                                            value = "Male",\
                                            description='Sex:', \
                                            disabled=False, button_style='', )
    selected_percentile = widgets.IntSlider(min=0, max=100, step=10, value=50, description='Percentile:',)
    text_generation_button = Button(description="Generate Text")
    
#     display(selected_age, selected_gender, selected_percentile, output)
    
    selected_age_str= str(selected_age.value)

    selection_filter = data_dict[(data_dict.ages.str.contains(selected_age_str)) & 
                              ((data_dict.sex == selected_gender.value) |
                              (data_dict.sex == "Both"))]   
    
    with data_output:
            display(selection_filter)
    
    def selection_filtering(age_group, sex):
        output.clear_output()
        data_output.clear_output()

        selected_age_str= str(selected_age.value)
        
        selection_filter = data_dict[(data_dict.ages.str.contains(selected_age_str)) & 
                                  ((data_dict.sex == selected_gender.value) |
                                    (data_dict.sex == "Both"))]   
        with data_output:
            display(selection_filter)

    def selected_age_eventhandler(change):
        selection_filtering(change.new, selected_age.value)
    def selected_gender_eventhandler(change):
        selection_filtering(selected_gender.value, change.new)
    def selected_percentile_eventhandler(change):
        selection_filtering(selected_percentile.value, change.new)

    selected_age.observe(selected_age_eventhandler, names='value')
    selected_gender.observe(selected_gender_eventhandler, names='value')
    selected_percentile.observe(selected_percentile_eventhandler, names='value')

    list_of_variable_inputs = selection_filter["variables"].values[0:]
    variable_inputs = ', '.join(list_of_variable_inputs).replace(" ", "")
    variable_inputs = variable_inputs.split(',')
    
user_selection()

In [85]:
def descriptor_generation():
# eventually would need to split these up into the diff bins for different types of variable groups 
    global df
    df = pd.merge(acs_site_sum.loc[acs_site_sum['variables'].isin(variable_inputs)], \
                   selection_filter, how="right", on="variables")    

# calculate percentile_input_per_variable
    global sum_for_percentile, percentile_input, percentile_table, percentile_table_transposed

#adding in 0 value to dataframe...to have complete table for the percentiles 
    percentile_table = df.append({'sum_in_area' : 0, 'variables':'Baseline_Value',\
                                  'variable_group':'Baseline_Group'}, ignore_index=True)
    percentile_table.sort_values("sum_in_area", axis = 0, ascending = True, inplace = True)

#10th percentile = 0.1
    percentile_input = selected_percentile.value / 100
    sum_for_percentile = percentile_table.sum_in_area.quantile(percentile_input).astype(int).astype(str)

# split these up into the diff bins for different types of variable groups 
    global language, education, household_income, school_enrollment

    for item,i in enumerate(percentile_table):       
        language = percentile_table[(percentile_table["variable_group"].str.contains('Language'))]
        education = percentile_table[(percentile_table["variable_group"].str.contains('Educational Attainment'))]
        school_enrollment = percentile_table[(percentile_table["variable_group"].str.contains('School'))]
        household_income = percentile_table[(percentile_table["variable_group"].str.contains('Income'))]
#       travel_time_to_work = percentile_table[(percentile_table["variable_group"].str.contains('Travel Time'))]
        # means_of_transportation = percentile_table[(percentile_table["variable_group"].str.contains('Means of Transportation'))]

#generating new transposed table with only the two fields needed : variables and sum in area.
#using other variables makes transposition weird
    global language_transposed, education_transposed, household_income_transposed, school_enrollment_transposed

    language_transposed = language.filter(["variables", "sum_in_area"]).T
    language_transposed.columns = language_transposed.iloc[0]
    language_transposed = language_transposed[1:]

    education_transposed = education.filter(["variables", "sum_in_area"]).T
    education_transposed.columns = education_transposed.iloc[0]
    education_transposed = education_transposed[1:]
    
    household_income_transposed = household_income.filter(["variables", "sum_in_area"]).T
    household_income_transposed.columns = household_income_transposed.iloc[0]
    household_income_transposed = household_income_transposed[1:]    
    
#     school_enrollment_transposed = school_enrollment.filter(["variables", "sum_in_area"]).T
#     school_enrollment_transposed.columns = school_enrollment_transposed.iloc[0]
#     school_enrollment_transposed = school_enrollment_transposed[1:]    
    
descriptor_generation()

In [195]:
def range_per_variable_calculation():
    global range_table, ranges, language_range, education_range, household_income_range
    ranges=[]
    
    for item, i in enumerate(language_transposed.columns):
        language_range = np.arange(language_transposed.min()[item-1]+1, \
                               language_transposed.max()[item]+1).astype(int)
        ranges.append([language_range])
        
    for item, i in enumerate(education_transposed.columns):
        education_range = np.arange(education_transposed.min()[item-1]+1, \
                               education_transposed.max()[item]+1).astype(int)
        ranges.append([education_range])
        
    for item, i in enumerate(household_income_transposed.columns):
        household_income_range = np.arange(household_income_transposed.min()[item-1]+1, \
                               household_income_transposed.max()[item]+1).astype(int)
        ranges.append([household_income_range])
        
#     for item, i in enumerate(school_enrollment_transposed.columns):
#         each_range = np.arange(school_enrollment_transposed.min()[item-1]+1, \
#                                school_enrollment_transposed.max()[item]+1).astype(int)
#         ranges.append([each_range])
#NEED TO MAKE IT SO THE TABLE IS A COMBO OF 
    range_table = pd.DataFrame(data=ranges, index=None, columns=["sum_in_area_ranges"])
    test = range_table.loc[range_table["sum_in_area_ranges"].str.contains(household_income_range.astype(str))]
#     for item, i in enumerate(household_income_range):
#         print(range_table.loc[range_table["sum_in_area_ranges"].isin(household_income_range)])
#     df = pd.merge(acs_site_sum.loc[acs_site_sum['variables'].isin(variable_inputs)], \
#                    selection_filter, how="right", on="variables")            
#     if range_table["sum_in_area_ranges"] == household_income_range:
#         range_table["range_type"] = "household_income_range"
#     range_table['TaxStatus'] = np.where(df.Public == 1, True, np.where(df.Public == 2, False))

    global percentile_table, range_table_cumulative
    
    #do for each
#     percentile_table = percentile_table.reset_index()
#     range_table = range_table.reset_index()

#     range_table_cumulative = pd.merge(range_table, percentile_table, left_index=True, right_index=True, on=None)
#     range_table_cumulative["sum_ranges"] = range_table_cumulative["sum_in_area_x"].astype(str)
#     range_table_cumulative

range_per_variable_calculation()

TypeError: unhashable type: 'numpy.ndarray'

In [154]:
household_income_range_string = ",".join(household_income_range.astype(str))
# test = range_table.loc[range_table["sum_in_area_ranges"].str.contains(household_income_range_string)]
range_table["sum_in_area_ranges"].astype(str).str.contains(household_income_range_string)
# type(household_income_range_string)

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
Name: sum_in_area_ranges, dtype: bool

In [158]:
# range_table["sum_in_area_ranges"].astype(str)
household_income_range_string

'175,176,177,178,179,180,181,182,183,184,185,186'

In [157]:
range_table

Unnamed: 0,sum_in_area_ranges
0,[]
1,"[613, 614, 615, 616, 617, 618, 619, 620, 621, ..."
2,"[2694, 2695, 2696, 2697, 2698, 2699, 2700, 270..."
3,"[5219, 5220, 5221, 5222, 5223, 5224, 5225, 522..."
4,"[5842, 5843, 5844, 5845, 5846, 5847, 5848, 584..."
5,"[14365, 14366, 14367, 14368, 14369, 14370, 143..."
6,[]
7,"[6782, 6783, 6784, 6785, 6786, 6787, 6788, 678..."
8,"[8284, 8285, 8286, 8287, 8288, 8289, 8290, 829..."
9,"[13123, 13124, 13125, 13126, 13127, 13128, 131..."


In [105]:
def generate_base_text():
    for item,i in enumerate(range_table_cumulative.index):
        if range_table_cumulative["sum_ranges"].str.contains(sum_for_percentile).any():
            global descriptors, result, result_variable, result_text
            
            result = range_table_cumulative[range_table_cumulative['sum_ranges'].str.contains(sum_for_percentile)]
            result_variable = result["variables"].values[0]

            #So returning the data dictionary for this 
            dict_result = data_dict[data_dict["variables"].str.contains(result_variable)]
            result_text = dict_result["variable_name"].values[0]
            
            #then append to empty dataframe that will have the data for text generation
            descriptors=[]
            descriptors.append([result_text])

#     else:
#         print("Error!")
        
generate_base_text()

In [50]:
# descriptors ... so need to fix this. 
# need to actually make this loop

In [25]:
text_generation_button = Button(description="Generate Text")

In [39]:
def replace_text():
    global percentile_string
    percentile_string = "in the " + str(selected_percentile.value) + "th percentile"
    print(percentile_string)
    
    if (selected_gender.value == "Total Population"):
        gender_text = "she"
    elif (selected_gender.value == "Total Population"):
        gender_text = "he"
        
    if (selected_age.value == "5 to 17 Years") & (selected_gender.value == "Total Population"):
        subject_description = "As a young woman, you "
    else:
        subject_description = "As a young woman, you "
    
# #     for loop 
#     if (result_text == "Speak a Language other than English at Home"):
#         language_text = gender_text + " is at least bilingual, as they speak a language other than english at home."
    
    replaceable_string = """
        Oh look I'm doing a 
        thing
        For 
    """
    global resident_string
    resident_string = replaceable_string
    
    with output:
        display(resident_string)
    
replace_text()

in the 50th percentile


In [40]:
# resident_string = "help"

#TEXT GENERATION
def text_generation(b):
    output.clear_output()
    data_output.clear_output()
    
    user_selection()
    descriptor_generation()
    range_per_variable_calculation()
    generate_base_text()
    
    replace_text()    
    
text_generation_button.on_click(text_generation)

In [42]:
def display_dashboard():
    user_selection()
    
    item_layout = widgets.Layout(margin='0 0 10px 0')
    
    with output:
        display(resident_string)

    input_widgets = widgets.VBox(
        [selected_age, selected_gender, selected_percentile, text_generation_button],
        layout=item_layout)
    
    tab = widgets.Tab([output, data_output],
        layout=item_layout)
    tab.set_title(0, 'Narrative')
    tab.set_title(1, 'Dataset Exploration')
    
    dashboard = widgets.VBox([input_widgets, tab])
    display(dashboard)

display_dashboard()

VBox(children=(VBox(children=(BoundedIntText(value=25, description='Age:', max=99, min=5), ToggleButtons(descr…

In [None]:
# widget_control = WidgetControl(widget=text_generation_button, position='topright')
# m.add_control(widget_control)

# m