In [70]:
import json
import folium
import datetime
import requests
import numpy as np
import pandas as pd
import geopandas as gpd
from bs4 import BeautifulSoup
from geopandas.tools import sjoin
from geopy.geocoders import Nominatim
from shapely.geometry import Point, Polygon

## Dealing with GeoJson

In [2]:
geo_df = gpd.read_file("data/chicago_boundaries.geojson")
geo_df.sort_values("pri_neigh").head(10)

Unnamed: 0,pri_neigh,sec_neigh,shape_area,shape_len,geometry
52,Albany Park,"NORTH PARK,ALBANY PARK",53542230.819,39339.016439,(POLYGON ((-87.70403771340104 41.9735515838182...
42,Andersonville,ANDERSONVILLE,9584592.89906,12534.092625,(POLYGON ((-87.66114249176968 41.9763032707800...
78,Archer Heights,"ARCHER HEIGHTS,WEST ELSDON",55922505.7212,31880.02103,(POLYGON ((-87.71436934735939 41.8260405636423...
8,Armour Square,"ARMOUR SQUARE,CHINATOWN",17141468.6356,24359.189625,(POLYGON ((-87.62920071904188 41.8471270613852...
21,Ashburn,ASHBURN,135460337.208,54818.154632,(POLYGON ((-87.71254775561138 41.7573373338274...
70,Auburn Gresham,AUBURN GRESHAM,105065353.602,46757.721716,(POLYGON ((-87.63990005778429 41.7561457832194...
94,Austin,AUSTIN,170037750.826,55473.345911,(POLYGON ((-87.75619515209075 41.9154695475812...
9,Avalon Park,"AVALON PARK,CALUMET HEIGHTS",34852737.7366,27630.822534,(POLYGON ((-87.58565529833413 41.7515019433031...
12,Avondale,"IRVING PARK,AVONDALE",55290595.482,34261.933404,(POLYGON ((-87.68798678784357 41.9361039411813...
93,Belmont Cragin,"BELMONT CRAGIN,HERMOSA",109099407.211,43311.706886,(POLYGON ((-87.74142999502229 41.9169848303604...


In [3]:
neighs = geo_df.pri_neigh.tolist()

In [4]:
def plot_map(geo_df, data=None, column_name=None, map_name="map.html"):
    m = folium.Map(location=[41.8755616, -87.6244212], tiles='Mapbox Bright', zoom_start=11) ## This location is Chicago
    
    if data is None:
        folium.GeoJson(
            geo_df,
            name='geojson'
        ).add_to(m)
    else:
        bins = list(data.quantile(np.linspace(0, 1, 15))) ## to use bins, gotta check fill_color
        folium.Choropleth(
            geo_data=geo_df,
            name='choropleth',
            data=data,
            columns=["pri_neigh", column_name],
            key_on='feature.properties.pri_neigh',
            fill_color='YlGnBu',
            fill_opacity=0.7,
            line_opacity=0.2
            #bins=bins
        ).add_to(m)
        
    m.save(map_name)

In [5]:
plot_map(geo_df)

## Dealing with Food Inspections

In [19]:
inspections = pd.read_csv("data/food-inspections.csv")
inspections = inspections.dropna(subset=["Latitude", "Longitude"]) ## we drop if we do not know the location
inspections["date"] = pd.to_datetime(inspections["Inspection Date"])
inspections.columns

Index(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type',
       'Risk', 'Address', 'City', 'State', 'Zip', 'Inspection Date',
       'Inspection Type', 'Results', 'Violations', 'Latitude', 'Longitude',
       'Location', 'Historical Wards 2003-2015', 'Zip Codes',
       'Community Areas', 'Census Tracts', 'Wards', 'date'],
      dtype='object')

In [20]:
## key: key that will be used to join
def assign_neigh(geo_df, data, key, latitude="Latitude", longitude="Longitude", verbose=False):
    geometry = [Point(x, y) for x, y in zip(data[longitude], data[latitude])]
    crs = {'init': 'epsg:4326'}
    data_to_join = gpd.GeoDataFrame(data[[key, latitude, longitude]], 
                                       crs=crs,
                                       geometry=geometry)
    points_to_neigh = sjoin(data_to_join, geo_df, how='left')
    
    def get_nearest_neigh(point, geo_df):
        idx = geo_df.geometry.distance(point).idxmin()
        return geo_df.loc[idx, 'pri_neigh']
    
    neigh_not_found = points_to_neigh[pd.isna(points_to_neigh.pri_neigh)]
    
    if verbose:
        print("There are {} points without an exact neighborhood".format(len(neigh_not_found)))
    
    neigh_not_found.pri_neigh = neigh_not_found.copy().geometry.apply(get_nearest_neigh, geo_df=geo_df)
    points_to_neigh.loc[neigh_not_found.index] = neigh_not_found
    
    if verbose:
        print("There are {} points without an exact neighborhood"\
              .format(len(points_to_neigh[pd.isna(points_to_neigh.pri_neigh)])))
    
    return points_to_neigh[[key, "pri_neigh", "sec_neigh"]]
    

In [21]:
inspection_location = assign_neigh(geo_df=geo_df, data=inspections, key="Inspection ID", verbose=True)

There are 2819 points without an exact neighborhood
There are 0 points without an exact neighborhood


In [22]:
inspections = inspections.merge(inspection_location, on="Inspection ID")

In [24]:
neigh_inspections = inspections.groupby("pri_neigh").count()["Inspection ID"]
plot_map(geo_df, data=neigh_inspections, column_name="Inspection ID", map_name="map.html")

In [31]:
inspections.Results.unique()

[Pass, Pass w/ Conditions, Out of Business, Fail, No Entry, Not Ready, Business Not Located]
Categories (7, object): [Pass, Pass w/ Conditions, Out of Business, Fail, No Entry, Not Ready, Business Not Located]

In [44]:
neigh_results = inspections.groupby(["pri_neigh", "Results"]).count()["Inspection ID"]
neigh_results

pri_neigh     Results             
Albany Park   Business Not Located      NaN
              Fail                    670.0
              No Entry                128.0
              Not Ready                39.0
              Out of Business         474.0
                                      ...  
Wrigleyville  No Entry                 53.0
              Not Ready                26.0
              Out of Business          75.0
              Pass                    631.0
              Pass w/ Conditions      194.0
Name: Inspection ID, Length: 686, dtype: float64

In [57]:
percentage_results = (neigh_results.div(neigh_inspections) * 100).reset_index()
percentage_results

Unnamed: 0,pri_neigh,Results,Inspection ID
0,Albany Park,Business Not Located,
1,Albany Park,Fail,19.493745
2,Albany Park,No Entry,3.724178
3,Albany Park,Not Ready,1.134711
4,Albany Park,Out of Business,13.791097
...,...,...,...
681,Wrigleyville,No Entry,4.424040
682,Wrigleyville,Not Ready,2.170284
683,Wrigleyville,Out of Business,6.260434
684,Wrigleyville,Pass,52.671119


In [59]:
percentage_pass = percentage_results[percentage_results.Results == "Pass"]
percentage_pass

Unnamed: 0,pri_neigh,Results,Inspection ID
5,Albany Park,Pass,49.636311
12,Andersonville,Pass,50.544662
19,Archer Heights,Pass,54.689042
26,Armour Square,Pass,64.571429
33,Ashburn,Pass,61.634435
...,...,...,...
656,West Ridge,Pass,48.276578
663,West Town,Pass,55.406270
670,Wicker Park,Pass,52.262864
677,Woodlawn,Pass,52.019231


In [60]:
plot_map(geo_df, data=percentage_pass, column_name="Inspection ID", map_name="map.html")

## Dealing with Income - for some reason geolocator is not working

In [75]:
income_df = pd.read_csv("data/economic-info.csv")
geolocator = Nominatim(user_agent="Lala")
#income_df.assign(latitude=lambda x: geolocator.geocode(x.["COMMUNITY ARE NAME"] + " Chicago").latitude,
#                 longitude=lambda x: geolocator.geocode(x.["COMMUNITY ARE NAME"] + " Chicago").longitude)
income_df.head()

Unnamed: 0,Community Area Number,COMMUNITY AREA NAME,PERCENT OF HOUSING CROWDED,PERCENT HOUSEHOLDS BELOW POVERTY,PERCENT AGED 16+ UNEMPLOYED,PERCENT AGED 25+ WITHOUT HIGH SCHOOL DIPLOMA,PERCENT AGED UNDER 18 OR OVER 64,PER CAPITA INCOME,HARDSHIP INDEX
0,1.0,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39.0
1,2.0,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0
2,3.0,Uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0
3,4.0,Lincoln Square,3.4,10.9,8.2,13.4,25.5,37524,17.0
4,5.0,North Center,0.3,7.5,5.2,4.5,26.2,57123,6.0


In [76]:
#income_location = assign_neigh(geo_df=geo_df, data=income_df, key="Community Area Name", verbose=True)
geolocator.geocode["Rogers Park Chicago"]

TypeError: 'method' object is not subscriptable