In [1]:
import geopandas as gpd
import pandas as pd
import numpy as np
from shapely.geometry import Point, Polygon
import matplotlib.pyplot as plt
from fiona.crs import from_epsg

In [2]:
import urllib, json, requests 
import geojson

In [3]:
from ipyleaflet import Map, GeoData, GeoJSON, basemaps, basemap_to_tiles, Icon, Circle, Marker, LayerGroup, WidgetControl
import ipywidgets as widgets
from ipywidgets import Button 
from IPython.display import display, clear_output

In [4]:
# Stop NumPy from abbreviating long arrays with "..." when they are turned into
# text. Later cells convert arrays of integers to strings (.astype(str)) and
# search that text, so an abbreviated repr would silently drop values.
import sys
np.set_printoptions(threshold=sys.maxsize)

In [5]:
def import_censustracts():
    """Download the study-area census tracts from TIGERweb as a GeoDataFrame.

    Pages through the Tracts_Blocks layer-4 query endpoint, 32 records per
    request, until a page comes back with no features. The six tract GEOIDs
    are hard-coded in the ``where`` clause.

    Side effect: the result is also stored in the module-level name
    ``tracts``, which the later cells read.

    Returns:
        geopandas.GeoDataFrame: one row per returned tract, holding the
        properties of each feature plus its ``geometry``.

    Raises:
        requests.HTTPError: if the service answers with a 4xx/5xx status.
    """
    global tracts

    # census tracts link
    endpoint = 'https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/Tracts_Blocks/MapServer/4/query'
    query = {
        'where': 'GEOID=36061003001 OR GEOID=36061002900 OR GEOID=36061003601 OR GEOID=36061004300 OR GEOID=36061000800 OR GEOID=36061000600',
        'inSR': 4326,
        'spatialRel': 'esriSpatialRelIntersects',
        'outFields': 'GEOID,STATE,COUNTY,TRACT,NAME,STGEOMETRY,OBJECTID',
        'returnGeometry': True,
        'f': 'geojson',
    }
    page_size = 32
    start = 0
    features = []
    crs = None

    # Using the session as a context manager closes it (and its pooled
    # connections) even when a request or the parsing below raises.
    with requests.Session() as s:
        s.params = query
        while True:
            r = s.get(endpoint, params={
                'resultOffset': start,
                'resultRecordCount': page_size,
            })
            # Fail here with a clear HTTP error instead of trying to parse an
            # error page as GeoJSON.
            r.raise_for_status()

            page = geojson.loads(r.text).__geo_interface__
            newfeats = page.get('features')
            if not newfeats:
                # An empty page marks the end of the result set.
                break

            features.extend(newfeats)
            # Keep the last CRS seen; tolerate a page that omits the member.
            crs = page.get('crs', crs)
            start += len(newfeats)
            print("Received", len(newfeats), "entries,", start, "total")

    tracts = gpd.GeoDataFrame.from_features(features, crs=crs)
    return tracts

import_censustracts()

Received 6 entries, 6 total


Unnamed: 0,geometry,GEOID,STATE,COUNTY,TRACT,NAME,OBJECTID
0,"POLYGON ((-74.00559 40.71217, -74.00544 40.712...",36061002900,36,61,2900,Census Tract 29,4534
1,"POLYGON ((-73.98986 40.72053, -73.98962 40.720...",36061003001,36,61,3001,Census Tract 30.01,10843
2,"POLYGON ((-73.99326 40.72235, -73.99352 40.721...",36061003601,36,61,3601,Census Tract 36.01,10846
3,"POLYGON ((-73.99881 40.72302, -73.99798 40.722...",36061004300,36,61,4300,Census Tract 43,10850
4,"POLYGON ((-73.99744 40.71407, -73.99750 40.714...",36061000800,36,61,800,Census Tract 8,19841
5,"POLYGON ((-73.99256 40.71439, -73.99247 40.713...",36061000600,36,61,600,Census Tract 6,29413


In [6]:
def download_acs():
    """Fetch 2018 ACS 5-year subject-table estimates for the loaded tracts.

    Builds a single Census API request covering every state / county / tract
    code found in the module-level ``tracts`` GeoDataFrame and stores the
    outcome in two module-level names:

    * ``acs_variables`` -- the comma-separated variable codes that were requested
    * ``acs`` -- a DataFrame built from the JSON reply; the reply's first row
      supplies the column names. Values are kept exactly as returned (no
      numeric conversion happens here).

    The API key is taken from the ``CENSUS_API_KEY`` environment variable when
    it is set and non-empty; otherwise the key that was already embedded in
    this notebook is used.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    import os

    global acs_variables, acs

    def joined_codes(column):
        # Unique values of one ``tracts`` column as a comma-separated list
        # with every space removed.
        return ','.join(map(str, tracts[column].unique().tolist())).replace(" ", "")

    state = joined_codes("STATE")
    tract = joined_codes("TRACT")
    county = joined_codes("COUNTY")

    # NOTE(review): the fallback key is committed to the notebook. Prefer
    # supplying CENSUS_API_KEY from the environment, then rotate this key and
    # delete the literal.
    api_key = os.environ.get('CENSUS_API_KEY') or '9330dc4bf086a84f19fb412bb15f232507301de6'
    acs_url = 'https://api.census.gov/data/2018/acs/acs5/subject/'

    acs_variables = 'S1603_C01_001E,S1603_C01_012E,S1603_C01_013E,S1603_C01_014E,S1603_C01_015E,S1603_C01_016E,S1603_C02_001E,S1603_C02_002E,S1603_C02_003E,S1603_C02_004E,S1603_C04_001E,S1603_C04_002E,S1603_C04_003E,S1603_C04_004E'

    get_acs = f'{acs_url}?&get={acs_variables}&for=tract:{tract}&in=state:{state}%20county:{county}&key={api_key}'

    response = requests.get(get_acs)
    # Surface HTTP failures directly rather than as a JSON decoding error.
    response.raise_for_status()
    data_acs = response.json()

    # Row 0 of the reply is the header row; the remaining rows are the data.
    acs = pd.DataFrame(data_acs[1:], columns=data_acs[0])

download_acs()

In [7]:
def clean_combine_census_and_geographic_data():
    """Join the ACS estimates onto the tracts and total each variable.

    Reads the module-level ``tracts``, ``acs`` and ``acs_variables`` and
    writes the module-level ``acs_site_sum``, a two-column DataFrame:

    * ``variables`` -- ACS variable code
    * ``sum_in_area`` -- sum of that variable over all tracts; values that
      cannot be parsed as numbers become NaN and are skipped by the sum

    Side effect: adds an ``area`` column to ``tracts``.
    """
    global acs_site_sum

    # NOTE(review): the area is expressed in the units of whatever CRS
    # ``tracts`` carries, and no visible cell reads this column -- confirm it
    # is still needed.
    tracts["area"] = tracts.area

    # NOTE(review): the join uses the tract code alone. That is safe while all
    # tracts share one state/county (true for the hard-coded GEOIDs, which all
    # start with 36061); add state and county to the keys if that changes.
    acs_site = pd.merge(tracts, acs, left_on='TRACT', right_on='tract', how='left')

    cols = acs_variables.split(",")
    # The API delivers every value as text. Convert column by column so each
    # variable gets its own numeric dtype.
    acs_site[cols] = acs_site[cols].apply(pd.to_numeric, errors='coerce')

    acs_site_sum = (
        acs_site[cols]
        .sum()
        .rename_axis('variables')
        .reset_index(name='sum_in_area')
    )

clean_combine_census_and_geographic_data()

In [8]:
# Data dictionary: links each ACS variable code ("variables") to a readable
# label ("variable_name") together with the sex, age_group and variable_group
# it belongs to. The widget and descriptor cells below filter on these columns.
data_dict = pd.read_csv("data-dictionary.csv")
data_dict

Unnamed: 0,sex,age_group,variable_group,variables,variable_name
0,both,5 to 17 Years,Language Spoken At Home,S1603_C02_002E,Speak Only English at Home
1,both,18 to 64 Years,Language Spoken At Home,S1603_C02_003E,Speak Only English at Home
2,both,65 Years and Over,Language Spoken At Home,S1603_C02_003E,Speak Only English at Home
3,both,5 to 17 Years,Language Spoken At Home,S1603_C04_002E,Speak a Language other than English at Home
4,both,18 to 64 Years,Language Spoken At Home,S1603_C04_003E,Speak a Language other than English at Home
5,both,65 Years and Over,Language Spoken At Home,S1603_C04_004E,Speak a Language other than English at Home
6,both,5 to 17 Years,Language Spoken At Home,S1601_C01_005E,Spanish
7,both,18 to 64 Years,Language Spoken At Home,S1601_C01_006E,Spanish
8,both,65 Years and Over,Language Spoken At Home,S1601_C01_007E,Spanish
9,both,5 to 17 Years,Language Spoken At Home,S1601_C01_009E,Other Indo-European languages


In [9]:
# Only use this if I decide to add an option to select all.
#     ALL = 'ALL'
#     def user_options_sorted_values_plus_ALL(array):
    #     unique = array.unique().tolist()
    #     unique.sort()
    #     unique.insert(0, ALL)
    #     return unique
# 

In [10]:
def user_options_sorted_values(array):
    """Return the distinct values of ``array`` as a list in ascending order."""
    return sorted(array.unique().tolist())

In [11]:
def user_selection():
    """Build and show the selection widgets and keep the filtered rows in sync.

    Creates, as module-level names, the age dropdown ``selected_age``, the sex
    toggle ``selected_gender``, the ``selected_percentile`` slider and the
    ``text_generation_button`` (created here, not displayed here). The three
    selection widgets are displayed together with an output area that shows
    the rows of ``data_dict`` matching the chosen age group and sex.

    Two further module-level results are set on creation and refreshed on
    every widget change:

    * ``selection_filter`` -- the matching rows of ``data_dict``
    * ``variable_inputs`` -- list of their ``variables`` codes, spaces removed
    """
    global selected_age, selected_gender, selected_percentile, text_generation_button, selection_filter, variable_inputs

    output = widgets.Output()

    selected_age = widgets.Dropdown(options=user_options_sorted_values(data_dict.age_group),
                                    value="5 to 17 Years")
    selected_gender = widgets.ToggleButtons(options=user_options_sorted_values(data_dict.sex),
                                            value="both",
                                            description='Sex:',
                                            disabled=False, button_style='')
    selected_percentile = widgets.IntSlider(min=0, max=100, step=10, value=50, description='Percentile:')
    text_generation_button = Button(description="Generate Text")

    display(selected_age, selected_gender, selected_percentile, output)

    def selection_filtering(age_group, sex):
        # Recompute the filtered rows for the given choices and redraw them.
        # The ``global`` statement is required: without it the assignments
        # would create locals of this nested function and the module-level
        # results would stay frozen at their initial values.
        global selection_filter, variable_inputs

        # TODO: widen this filter if an ALL option is ever added to the widgets.
        selection_filter = data_dict[(data_dict.age_group == age_group) &
                                     (data_dict.sex == sex)]
        variable_inputs = [str(code).replace(" ", "") for code in selection_filter["variables"]]

        output.clear_output()
        with output:
            display(selection_filter)

    def selected_age_eventhandler(change):
        # New age comes from the event; sex from the other widget.
        selection_filtering(change.new, selected_gender.value)

    def selected_gender_eventhandler(change):
        # New sex comes from the event; age from the other widget.
        selection_filtering(selected_age.value, change.new)

    def selected_percentile_eventhandler(change):
        # The percentile does not affect the filter; just redraw the table.
        selection_filtering(selected_age.value, selected_gender.value)

    selected_age.observe(selected_age_eventhandler, names='value')
    selected_gender.observe(selected_gender_eventhandler, names='value')
    selected_percentile.observe(selected_percentile_eventhandler, names='value')

    # Populate the results and the output area for the initial widget values.
    selection_filtering(selected_age.value, selected_gender.value)

user_selection()

Dropdown(index=1, options=('18 to 64 Years', '5 to 17 Years', '65 Years and Over '), value='5 to 17 Years')

ToggleButtons(description='Sex:', options=('both',), value='both')

IntSlider(value=50, description='Percentile:', step=10)

Output()

In [13]:
def descriptor_generation():
    """Prepare the per-variable totals from which a descriptor is picked.

    Reads the module-level ``acs_site_sum``, ``variable_inputs``,
    ``selection_filter`` and ``selected_percentile``. Writes module-level:

    * ``language`` -- selected rows whose ``variable_group`` contains "Language"
    * ``percentile_table`` -- the selected variables' totals plus a zero
      ``Baseline_Value`` row, sorted ascending by ``sum_in_area``
    * ``percentile_input`` -- the slider value as a fraction (50 -> 0.5)
    * ``sum_for_percentile`` -- that quantile of ``sum_in_area``, truncated to
      an integer and rendered as a string
    * ``percentile_table_transposed`` -- a single ``sum_in_area`` row with one
      column per variable, in the same ascending order
    """
    global language, education, income, household_type, household_income, \
                          travel_time_to_work, means_of_transportation
    global sum_for_percentile, percentile_input, percentile_table, percentile_table_transposed

    df = pd.merge(acs_site_sum.loc[acs_site_sum['variables'].isin(variable_inputs)],
                  selection_filter, how="left", on="variables")

    # eventually would need to split these up into the diff bins for different types of variable groups
    # TODO: same split for 'Educational Attainment', 'Household Type',
    # 'Household Income', 'Travel Time' and 'Means of Transportation'.
    # na=False keeps a row without a variable_group from breaking the mask.
    language = df[df["variable_group"].str.contains('Language', na=False)]

    # Add a 0 baseline row so the table of totals starts at zero.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    baseline_row = pd.DataFrame([{'sum_in_area': 0, 'variables': 'Baseline_Value'}])
    percentile_table = (
        pd.concat([df, baseline_row], ignore_index=True)
        .sort_values("sum_in_area", axis=0, ascending=True)
    )

    # 10th percentile = 0.1
    percentile_input = selected_percentile.value / 100
    sum_for_percentile = str(int(percentile_table.sum_in_area.quantile(percentile_input)))

    # Transpose only the two fields needed (variables and sum in area);
    # carrying the other columns along makes the transposition awkward.
    percentile_table_transposed = percentile_table.filter(["variables", "sum_in_area"]).T
    # First transposed row holds the variable codes: use it as the header...
    percentile_table_transposed.columns = percentile_table_transposed.iloc[0]
    # ...and drop it from the data, leaving just the sum_in_area row.
    percentile_table_transposed = percentile_table_transposed[1:]

descriptor_generation()

In [14]:
def range_per_variable_calculation():
    """Give every variable the run of integers between its neighbour's total and its own.

    Reads the module-level ``percentile_table_transposed`` (columns ordered by
    ascending total) and ``percentile_table``. For the column at position k
    the range is the integers from (total at position k-1) + 1 up to and
    including (total at position k).

    Writes module-level:

    * ``percentile_table`` -- replaced by ``percentile_table.reset_index()``
    * ``range_table_cumulative`` -- the ranges merged position-by-position
      with ``percentile_table``. The merge suffixes the clashing columns, so
      the integer arrays end up in ``sum_in_area_x``; ``sum_ranges`` holds the
      text form of each array.
    """
    global percentile_table, range_table_cumulative

    # Computed once, outside the loop.
    column_minimums = percentile_table_transposed.min()
    column_maximums = percentile_table_transposed.max()

    ranges = []
    for position in range(len(percentile_table_transposed.columns)):
        # .iloc makes the positional lookup explicit (plain [] with an integer
        # on a label-indexed Series is deprecated). For the first column,
        # position - 1 is -1 and wraps round to the last, largest total, so
        # that column receives an empty range.
        lower = column_minimums.iloc[position - 1] + 1
        upper = column_maximums.iloc[position] + 1
        ranges.append([np.arange(lower, upper).astype(int)])

    range_table = pd.DataFrame(data=ranges, index=None, columns=["sum_in_area"])

    # Both frames get a fresh 0..n-1 index so that rows pair up by position.
    percentile_table = percentile_table.reset_index()
    range_table = range_table.reset_index()

    range_table_cumulative = pd.merge(range_table, percentile_table, left_index=True, right_index=True, on=None)
    range_table_cumulative["sum_ranges"] = range_table_cumulative["sum_in_area_x"].astype(str)

range_per_variable_calculation()

In [40]:
def generate_descriptors():
    """Find the variable whose range contains the percentile value and print its label.

    Reads the module-level ``range_table_cumulative``, ``sum_for_percentile``
    and ``data_dict``. Writes module-level:

    * ``result`` -- rows of ``range_table_cumulative`` whose ``sum_ranges``
      text contains ``sum_for_percentile`` as a whole number
    * ``result_variable`` -- ``variables`` code of the first such row, or None
    * ``result_text`` -- that variable's ``variable_name`` from ``data_dict``,
      or None
    * ``descriptors`` -- ``[[result_text]]`` when a row matched, else ``[]``
    """
    global descriptors, result, result_variable, result_text
    import re

    # Match the value as a whole number. A plain substring search would let
    # "0" hit "10" or "100".
    whole_number = rf"\b{re.escape(sum_for_percentile)}\b"
    matches = range_table_cumulative["sum_ranges"].str.contains(whole_number, regex=True)
    result = range_table_cumulative[matches]

    # List that will collect the data for text generation.
    descriptors = []

    if result.empty:
        # Reset the outputs so a label from an earlier run is never reported
        # for the current selection.
        result_variable = None
        result_text = None
        print("No variable range contains", sum_for_percentile)
        return

    result_variable = result["variables"].values[0]

    # Look the variable up in the data dictionary to get its readable label.
    dict_result = data_dict[data_dict["variables"] == result_variable]
    result_text = dict_result["variable_name"].values[0]

    descriptors.append([result_text])

    print(result_variable,":", result_text)

generate_descriptors()

S1603_C02_002E : Speak Only English at Home


In [None]:
display(text_generation_button)

#TEXT GENERATION
def text_generation(b):
    """Click handler for ``text_generation_button``: print a short resident text.

    Reads the current values of the module-level widgets
    ``selected_percentile``, ``selected_gender`` and ``selected_age`` and the
    module-level ``result_text``.

    Args:
        b: the clicked button, supplied by ipywidgets (unused).
    """
    # user_selection() is deliberately not called here: it builds brand-new
    # widgets at their default values, so every click would describe the
    # defaults rather than what the user picked.

    percentile_string = "in the " + str(selected_percentile.value) + "th percentile"

    # Neutral default so gender_text is always bound, whatever sex is selected.
    gender_text = "this resident"
    if selected_gender.value == "Total Population":
        gender_text = "she"
    # TODO: the original followed this with an elif testing the very same
    # value and yielding "he", which could never run. Fill in the real option
    # values for each sex once the data dictionary has them.

    # TODO: vary this by selected_age / selected_gender; every selection
    # currently yields the same phrase.
    subject_description = "As a young woman, you "

    # TODO: language_text is prepared but not yet part of the printed text.
    language_text = ""
    if result_text == "Speak a Language other than English at Home":
        language_text = gender_text + " is at least bilingual, as they speak a language other than english at home."

    replaceable_string = """
        Oh look I'm doing a 
        thing
        For 
    """
    resident_string = percentile_string + replaceable_string + subject_description

    print(resident_string )

# text_generation()
text_generation_button.on_click(text_generation)

In [None]:
# Show the label most recently stored in result_text by generate_descriptors().
result_text

In [None]:
# NOTE(review): this cell looks like a leftover from an earlier version of the
# notebook. `button`, `output`, `find_location` and `join_acs` are not defined
# in any visible cell, and download_acs() as defined above has no return
# statement, so the six-way unpacking below would fail. Confirm whether the
# cell can be deleted.
display(button, output)

def on_button_clicked(b):
    """Click handler: re-run the location lookup and the data downloads.

    Stores the results in module-level names (``state``, ``county``,
    ``acs1``..``acs6``, ``acs``, ``tracts_df``) and prints a confirmation
    into ``output``.

    Args:
        b: the clicked button (unused).
    """
    output.clear_output()
    
    global state, county
    # find_location is not defined in the visible notebook -- TODO confirm source.
    state, county = find_location()
    
    global acs1, acs2, acs3, acs4, acs5, acs6
    # Expects six return values; the download_acs() defined above returns none.
    acs1, acs2, acs3, acs4, acs5, acs6 = download_acs()
    
    global acs
    # join_acs is not defined in the visible notebook -- TODO confirm source.
    acs=join_acs()
    
    global tracts_df
    tracts_df = import_censustracts()
    
    with output:
        print("Button clicked.")

button.on_click(on_button_clicked)

In [None]:
#duplicate basically
# NOTE(review): scratch cell. `df` is not defined at notebook level in any
# visible cell (it only exists as a local inside descriptor_generation), and
# no visible code creates an `onlyEnglish` entry -- this presumably predates
# the current pipeline; verify before keeping.

# Smallest and largest value of the `onlyEnglish` entry of the transposed frame.
df_array = np.array(df.T.onlyEnglish)
# df_array = np.array(df)
range_min = df_array.min()
range_max = df_array.max()
range_max