In [7]:
#from geopy.geocoders import Nominatim
import pandas as pd
#import folium
import json
import datetime
#import geopandas as gpd
#from shapely.geometry import Point, Polygon
import requests
from bs4 import BeautifulSoup
#from geopandas.tools import sjoin

### Basic Map with Boundaries

In [6]:
geolocator = Nominatim(user_agent="NMKG")
chicago_location = geolocator.geocode("Andersonville Chicago")
chicago_location.address

'Andersonville Park, 5233, Summerdale, Edgewater, Chicago, Cook County, Illinois, 60640, United States'

In [3]:
## reasing geojson
geo_df = gpd.read_file("data/chicago_boundaries.geojson")
print(geo_df["pri_neigh"].nunique() == len(geo_df))
geo_df.sort_values("pri_neigh").head()

True


Unnamed: 0,pri_neigh,sec_neigh,shape_area,shape_len,geometry
52,Albany Park,"NORTH PARK,ALBANY PARK",53542230.819,39339.016439,(POLYGON ((-87.70403771340104 41.9735515838182...
42,Andersonville,ANDERSONVILLE,9584592.89906,12534.092625,(POLYGON ((-87.66114249176968 41.9763032707800...
78,Archer Heights,"ARCHER HEIGHTS,WEST ELSDON",55922505.7212,31880.02103,(POLYGON ((-87.71436934735939 41.8260405636423...
8,Armour Square,"ARMOUR SQUARE,CHINATOWN",17141468.6356,24359.189625,(POLYGON ((-87.62920071904188 41.8471270613852...
21,Ashburn,ASHBURN,135460337.208,54818.154632,(POLYGON ((-87.71254775561138 41.7573373338274...


In [35]:
m = folium.Map(location=[chicago_location.latitude, chicago_location.longitude], zoom_start=11)

In [36]:
geojson_data = json.load(open("data/chicago_boundaries.geojson"))
folium.GeoJson(
    geojson_data,
    name='geojson'
).add_to(m)

<folium.features.GeoJson at 0x1a1d79ac508>

### Get Neighbourhood <-> Community Area - Maybe will be useful?

In [131]:
url = "https://en.wikipedia.org/wiki/List_of_neighborhoods_in_Chicago"
response = requests.get(url)
html_soup = BeautifulSoup(response.text, 'html.parser')
table_content = html_soup.find("tbody").find_all("tr")[1:]
map_list = []
i = 0
for entry in table_content:
    neigh_community = entry.find_all("td")
    if len(neigh_community) > 0:
        neigh = neigh_community[0].text if "<a>" not in neigh_community[0].text else neigh_community.find_all(a)[0].text
        community = neigh_community[1].text if "<a>" not in neigh_community[1].text else neigh_community.find_all(a)[0].text
        map_list.append([neigh.strip(), community.strip()])

map_list = pd.DataFrame(map_list, columns=["neigh", "community"])
map_list.head()

Unnamed: 0,neigh,community
0,Albany Park,Albany Park
1,Altgeld Gardens,Riverdale
2,Andersonville,Edgewater
3,Archer Heights,Archer Heights
4,Armour Square,Armour Square


<h3>Inspection -- out of order businesses</h3>

In [38]:
food_df = pd.read_csv("data/food-inspections.csv")
food_df.head()
food_df = food_df.dropna(subset=["Latitude", "Longitude"])
food_df['date'] = pd.to_datetime(food_df["Inspection Date"])

In [None]:
#food_df.groupby('Results').count()
pd.set_option('display.max_columns', None)

numShutDown = 0
numShutDownwithFail = 0
out_of_business_df = food_df[food_df.Results == 'Out of Business']
out_of_business_companies = out_of_business_df['DBA Name'].unique()
for company in out_of_business_companies:
    numShutDown += 1
    company_results = food_df[food_df['DBA Name'] == company]['Results']
    if 'Fail' in company_results.values:
        numShutDownwithFail += 1

In [72]:
print(numShutDownwithFail, numShutDown, numShutDownwithFail / numShutDown)

6817 13997 0.48703293562906336


So 48% of the out of business restaurants have had at least 1 fail in their past. Now let's look at it the other way around. What percentage of restaurants got out of order if they have had at least 1 fail in their past.

In [75]:
numFail = 0
numFailwithShutDown = 0
fail_df = food_df[food_df.Results == 'Fail']
fail_companies = fail_df['DBA Name'].unique()
for company in fail_companies:
    numFail += 1
    company_results = food_df[food_df['DBA Name'] == company]['Results']
    if 'Out of Business' in company_results.values:
        numFailwithShutDown += 1


In [76]:
print(numFailwithShutDown, numFail, numFailwithShutDown / numFail)

6817 15462 0.440887336696417


44% of the restaurants got out of business after having received at least 1 fail.

### Map with Inspections

In [8]:
inspections = pd.read_csv("data/food-inspections.csv")
inspections.head()
inspections = inspections.dropna(subset=["Latitude", "Longitude"])

In [67]:
inspections.columns

Index(['Inspection ID', 'DBA Name', 'AKA Name', 'License #', 'Facility Type',
       'Risk', 'Address', 'City', 'State', 'Zip', 'Inspection Date',
       'Inspection Type', 'Results', 'Violations', 'Latitude', 'Longitude',
       'Location', 'Historical Wards 2003-2015', 'Zip Codes',
       'Community Areas', 'Census Tracts', 'Wards'],
      dtype='object')

In [68]:
inspections["date"] = pd.to_datetime(inspections["Inspection Date"])

In [69]:
## getting only 2019 to make it easier
inspections_2019 = inspections[inspections["date"].dt.year == 2019]
inspections_2019["date"].tail()

14130   2019-01-02
14131   2019-01-02
14132   2019-01-02
14133   2019-01-02
14134   2019-01-02
Name: date, dtype: datetime64[ns]

In [71]:
## create a geodataframe with the location info of inspections
geometry = [Point(x, y) for x, y in zip(inspections_2019.Longitude, inspections_2019.Latitude)]
crs = {'init': 'epsg:4326'}
inspections_to_join = gpd.GeoDataFrame(inspections_2019[["Inspection ID", "Latitude", "Longitude"]], 
                                       crs=crs,
                                       geometry=geometry)
inspections_to_join.head()

Unnamed: 0,Inspection ID,Latitude,Longitude,geometry
0,2320315,41.714168,-87.655291,POINT (41.7141680989703 -87.65529116028439)
1,2320342,41.913588,-87.682203,POINT (41.9135877900482 -87.6822028354253)
2,2320328,41.808025,-87.720037,POINT (41.80802515275297 -87.72003743037237)
3,2320319,41.808025,-87.720037,POINT (41.80802515275297 -87.72003743037237)
4,2320228,41.807662,-87.73148,POINT (41.8076619936005 -87.7314802731113)


In [72]:
## merging locations and polygons to find the neighbourhood

points_to_neigh = sjoin(inspections_to_join, geo_df, how='left')
inspections_2019 = inspections_2019.merge(points_to_neigh, left_on="Inspection ID", right_on="Inspection ID")
inspections_2019.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,...,Wards,date,Latitude_y,Longitude_y,geometry,index_right,pri_neigh,sec_neigh,shape_area,shape_len
0,2320315,SERENDIPITY CHILDCARE,SERENDIPITY CHILDCARE,2216009.0,Daycare Above and Under 2 Years,Risk 1 (High),1300 W 99TH ST,CHICAGO,IL,60643.0,...,,2019-10-23,41.714168,-87.655291,POINT (41.7141680989703 -87.65529116028439),,,,,
1,2320342,YOLK TEST KITCHEN,YOLK TEST KITCHEN,2589655.0,Restaurant,Risk 1 (High),1767 N MILWAUKEE AVE,CHICAGO,IL,60647.0,...,,2019-10-23,41.913588,-87.682203,POINT (41.9135877900482 -87.6822028354253),,,,,
2,2320328,LAS ASADAS MEXICAN GRILL,LAS ASADAS MEXICAN GRILL,2583309.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,...,,2019-10-23,41.808025,-87.720037,POINT (41.80802515275297 -87.72003743037237),,,,,
3,2320319,LA PALAPITA,LA PALAPITA,2694702.0,Restaurant,Risk 1 (High),3834 W 47TH ST,CHICAGO,IL,60632.0,...,,2019-10-23,41.808025,-87.720037,POINT (41.80802515275297 -87.72003743037237),,,,,
4,2320228,47TH ST CANTINA,47TH ST CANTINA,2678250.0,Liquor,Risk 3 (Low),4311 W 47TH ST,CHICAGO,IL,60632.0,...,,2019-10-22,41.807662,-87.73148,POINT (41.8076619936005 -87.7314802731113),,,,,


In [74]:
inspection_per_neigh = inspections_2019.groupby("pri_neigh").count()["Inspection ID"]
inspection_per_neigh.head()

Series([], Name: Inspection ID, dtype: int64)

In [48]:
m = folium.Map(location=[chicago_location.latitude, chicago_location.longitude], tiles='Mapbox Bright', zoom_start=11)

folium.Choropleth(
    geo_data=geo_df,
    name='choropleth',
    data=inspection_per_neigh,
    columns=["pri_neigh", "Inspection ID"],
    key_on='feature.properties.pri_neigh',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2
).add_to(m)

<folium.features.Choropleth at 0x1a1ddd50bc8>

In [49]:
set(geo_df["pri_neigh"].unique()) - set(inspections_2019["pri_neigh"].unique())
## These are the neighborhood without inspections in 2019

{'Burnside', 'Grant Park'}

In [50]:
set(inspections_2019["pri_neigh"].unique()) - set(geo_df["pri_neigh"].unique())
## There are NaN because they are outside the boundary given by the city

{nan}

In [51]:
not_in_map = inspections_2019[inspections_2019["pri_neigh"].isna()]
for idx, row in not_in_map.iterrows():
    folium.Marker([row.Latitude_x, row.Longitude_x], popup=idx).add_to(m)

In [52]:
m.save("map.html")

In [182]:
(not_in_map.Latitude_y.unique(), not_in_map.Longitude_y.unique())

(array([42.0085364 , 41.89224916, 41.89233781]),
 array([-87.91442844, -87.60951805, -87.60404476]))

#### If you check this map, you will see two things that look weird
1) If we intersect the polygons provided by the government and the points of inspections, there are 3 points (and 263 inspections) that are "not anywhere". However the problem is because of really small erorrs in measurement, it is actually quite easy to set it by hand. Have a look at the map.
2) The black neighbourhood actually did not have any inspections lol

### Map with Income

In [217]:
eco_info = pd.read_csv("data/economic-info.csv")
eco_info.head()

Unnamed: 0,area_number,area_name,housing_crowded,household_below_poverty,unemployment_16,wo_hs_25,aged_18_64,per_capita_income,hardship_index
0,1,Rogers Park,7.7,23.6,8.7,18.2,27.5,23939,39
1,2,West Ridge,7.8,17.2,8.8,20.8,38.5,23040,46
2,3,Uptown,3.8,24.0,8.9,11.8,22.2,35787,20
3,4,Lincoln Square,3.4,10.9,8.2,13.4,25.5,37524,17
4,5,North Center,0.3,7.5,5.2,4.5,26.2,57123,6


In [218]:
m = folium.Map(location=[chicago_location.latitude, chicago_location.longitude], zoom_start=11)

folium.Choropleth(
    geo_data=geo_df,
    name='choropleth',
    data=eco_info,
    columns=["area_name", "per_capita_income"],
    key_on='feature.properties.pri_neigh',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2
).add_to(m)

<folium.features.Choropleth at 0x1de3489c988>

In [219]:
m.save("map.html")

In [220]:
set(eco_info["area_name"]) - set(geo_df["pri_neigh"].unique())

{'East Garfield Park',
 'Forest Glen',
 'Greater Grand Crossing',
 'Humboldt park',
 'McKinley Park',
 'Montclaire',
 'Near North Side',
 'Near West Side',
 'South Lawndale',
 'Washington Height',
 'West Englewood',
 'West Garfield Park'}

In [221]:
set(geo_df["pri_neigh"].unique()) - set(eco_info["area_name"])

{'Andersonville',
 'Boystown',
 'Bucktown',
 'Chinatown',
 'East Village',
 'Galewood',
 'Garfield Park',
 'Gold Coast',
 'Grand Crossing',
 'Grant Park',
 'Greektown',
 'Humboldt Park',
 'Jackson Park',
 'Little Italy, UIC',
 'Little Village',
 'Magnificent Mile',
 'Mckinley Park',
 'Millenium Park',
 'Montclare',
 'Museum Campus',
 'Old Town',
 'Printers Row',
 'River North',
 'Rush & Division',
 'Sauganash,Forest Glen',
 'Sheffield & DePaul',
 'Streeterville',
 'Ukrainian Village',
 'United Center',
 'Washington Heights',
 'West Loop',
 'Wicker Park',
 'Wrigleyville'}

In [224]:
geolocator = Nominatim(user_agent="NMKG")
geolocator.geocode("Jackson Park Chicago")

Location(Jackson Park, Woodlawn, Chicago, Cook County, Illinois, United States of America, (41.78323175, -87.5804432439724, 0.0))

In [228]:
geolocator = Nominatim(user_agent="NMKG")
geolocator.geocode("River North Chicago")

Location(North Chicago, Lake County, Illinois, USA, (42.325578, -87.8411818, 0.0))

See that there are some areas that are actually in other regions but it is not necessarily a neighborhood. Gotta clean it somehow.

### How I imagine working with it: we can define the neighbourhood we have in the geojson. With these neighbourhoods set, whenever we have a dataset we clean it and fix it so it fits the neighbourhoods we have. If we use another as reference, we will never be able to plot it.

### Map with Crime

In [4]:
crime = pd.read_csv("data/crime-info-2019.csv")
crime.columns = list(map(lambda x: x.lower(), crime.columns))
crime["date"] = pd.to_datetime(crime["date"])
crime = crime.dropna(subset=["location"])
crime.head()

Unnamed: 0,id,case number,date,block,iucr,primary type,description,location description,arrest,domestic,...,ward,community area,fbi code,x coordinate,y coordinate,year,updated on,latitude,longitude,location
1,11863954,JC476438,2019-01-01 00:01:00,062XX W FLETCHER ST,266,CRIM SEXUAL ASSAULT,PREDATORY,RESIDENCE,False,False,...,36.0,19.0,2,1134271.0,1920282.0,2019,10/20/2019 03:56:02 PM,41.937458,-87.781946,"(41.937457995, -87.781946088)"
2,11857593,JC468729,2019-01-01 00:01:00,011XX N KEDZIE AVE,810,THEFT,OVER $500,OTHER,False,False,...,26.0,23.0,6,1154827.0,1907315.0,2019,10/16/2019 04:05:26 PM,41.901488,-87.706747,"(41.901487821, -87.706746857)"
3,11847094,JC456079,2019-01-01 00:01:00,081XX S HARPER AVE,1753,OFFENSE INVOLVING CHILDREN,SEX ASSLT OF CHILD BY FAM MBR,RESIDENCE,False,True,...,8.0,45.0,2,1187889.0,1851369.0,2019,10/04/2019 03:59:59 PM,41.747242,-87.587093,"(41.747241844, -87.587093052)"
4,11846507,JC455304,2019-01-01 00:01:00,012XX N MARION CT,1582,OFFENSE INVOLVING CHILDREN,CHILD PORNOGRAPHY,RESIDENCE,False,False,...,1.0,24.0,17,1163773.0,1908489.0,2019,10/04/2019 03:59:59 PM,41.904525,-87.673854,"(41.90452536, -87.6738541)"
5,11848145,JC457348,2019-01-01 00:01:00,003XX S HAMLIN BLVD,1752,OFFENSE INVOLVING CHILDREN,AGG CRIM SEX ABUSE FAM MEMBER,APARTMENT,False,True,...,28.0,26.0,17,1151093.0,1898340.0,2019,10/05/2019 04:04:25 PM,41.876934,-87.720698,"(41.876933506, -87.72069769)"


In [5]:
crime_location = crime[["id", "latitude", "longitude"]]
geometry = [Point(x, y) for x, y in zip(crime_location.longitude, crime_location.latitude)]
crs = {'init': 'epsg:4326'}
crime_location = gpd.GeoDataFrame(crime_location, crs=crs, geometry=geometry)
crime_to_neigh = sjoin(crime_location, geo_df, how="left")
crime_to_neigh.head()

Unnamed: 0,id,latitude,longitude,geometry,index_right,pri_neigh,sec_neigh,shape_area,shape_len
1,11863954,41.937458,-87.781946,POINT (-87.781946088 41.937457995),93.0,Belmont Cragin,"BELMONT CRAGIN,HERMOSA",109099407.211,43311.706886
2,11857593,41.901488,-87.706747,POINT (-87.70674685700001 41.901487821),4.0,Humboldt Park,HUMBOLDT PARK,125010425.593,46126.751351
3,11847094,41.747242,-87.587093,POINT (-87.58709305200001 41.747241844),9.0,Avalon Park,"AVALON PARK,CALUMET HEIGHTS",34852737.7366,27630.822534
4,11846507,41.904525,-87.673854,POINT (-87.6738541 41.90452536),88.0,Wicker Park,"WICKER PARK,WEST TOWN",26853193.0926,21992.660946
5,11848145,41.876934,-87.720698,POINT (-87.72069768999999 41.876933506),5.0,Garfield Park,GARFIELD PARK,89976069.5947,44460.91922


In [6]:
crime_to_neigh.isnull().any()

id             False
latitude       False
longitude      False
geometry       False
index_right     True
pri_neigh       True
sec_neigh       True
shape_area      True
shape_len       True
dtype: bool

In [7]:
crime_wo_region = crime_to_neigh[pd.isna(crime_to_neigh.pri_neigh)]
len(crime_wo_region)

562

In [8]:
crime_to_region = crime_to_neigh.dropna()
len(crime_to_region)

206181

In [9]:
crime = crime.merge(crime_to_region, left_on="id", right_on="id")
n_crimes_neigh = crime.groupby("pri_neigh").count()["id"]
n_crimes_neigh.head()

pri_neigh
Albany Park       1770
Andersonville      268
Archer Heights     639
Armour Square      438
Ashburn           1759
Name: id, dtype: int64

In [10]:
m = folium.Map(location=[chicago_location.latitude, chicago_location.longitude], tiles='Mapbox Bright', zoom_start=11)

folium.Choropleth(
    geo_data=geo_df,
    name='choropleth',
    data=n_crimes_neigh,
    columns=["pri_neigh", "crime"],
    key_on='feature.properties.pri_neigh',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2
).add_to(m)

<folium.features.Choropleth at 0x190abf5a0c8>

In [11]:
for idx, row in crime_wo_region.iterrows():
    folium.Marker([row.latitude, row.longitude], popup="id").add_to(m)

In [12]:
m.save("map.html")

### In this dataset, there many things in the boundary

In [13]:
geo_df.head()

Unnamed: 0,pri_neigh,sec_neigh,shape_area,shape_len,geometry
0,Grand Boulevard,BRONZEVILLE,48492503.1554,28196.837157,(POLYGON ((-87.60670812560372 41.8168137713739...
1,Printers Row,PRINTERS ROW,2162137.97139,6864.247156,(POLYGON ((-87.62760697485348 41.8743709778537...
2,United Center,UNITED CENTER,32520512.7053,23101.363745,(POLYGON ((-87.66706868914602 41.8888518776954...
3,Sheffield & DePaul,SHEFFIELD & DEPAUL,10482592.2987,13227.049745,(POLYGON ((-87.65833494805533 41.9216614422918...
4,Humboldt Park,HUMBOLDT PARK,125010425.593,46126.751351,(POLYGON ((-87.74059567509266 41.8878231689323...


In [14]:
crime_wo_region.head()

Unnamed: 0,id,latitude,longitude,geometry,index_right,pri_neigh,sec_neigh,shape_area,shape_len
344,11554021,42.019399,-87.675049,POINT (-87.675049485 42.01939923699999),,,,,
631,11553241,41.703805,-87.720821,POINT (-87.72082053700001 41.703805075),,,,,
996,11553871,42.019399,-87.675049,POINT (-87.675049485 42.01939923699999),,,,,
2009,11555751,41.865519,-87.769873,POINT (-87.769873088 41.865519477),,,,,
2182,11556182,41.803033,-87.752816,POINT (-87.752815958 41.803032525),,,,,


In [33]:
geo_df.geometry.distance(crime_wo_region.iloc[0].geometry).idxmin()

48

### We can get the nearest neighbourhood as follows

In [48]:
def get_nearest_neigh(point, geo_df):
    idx = geo_df.geometry.distance(point).idxmin()
    return geo_df.loc[idx, 'pri_neigh']

na_regions = crime_wo_region.copy().geometry.apply(get_nearest_neigh, geo_df=geo_df)
na_regions.head()

344         Rogers Park
631     Mount Greenwood
996         Rogers Park
2009             Austin
2182     Garfield Ridge
Name: geometry, dtype: object

In [51]:
crime_wo_region["location"] = na_regions
crime_wo_region.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,id,latitude,longitude,geometry,index_right,pri_neigh,sec_neigh,shape_area,shape_len,location
344,11554021,42.019399,-87.675049,POINT (-87.675049485 42.01939923699999),,Rogers Park,,,,Rogers Park
631,11553241,41.703805,-87.720821,POINT (-87.72082053700001 41.703805075),,Mount Greenwood,,,,Mount Greenwood
996,11553871,42.019399,-87.675049,POINT (-87.675049485 42.01939923699999),,Rogers Park,,,,Rogers Park
2009,11555751,41.865519,-87.769873,POINT (-87.769873088 41.865519477),,Austin,,,,Austin
2182,11556182,41.803033,-87.752816,POINT (-87.752815958 41.803032525),,Garfield Ridge,,,,Garfield Ridge


In [52]:
n_crimes_na_neigh = crime_wo_region.groupby("location").count()["id"]
n_crimes_na_neigh.head()

location
Ashburn     54
Austin      91
Beverly     23
Clearing    16
Dunning     19
Name: id, dtype: int64

In [57]:
n_crimes_total = n_crimes_na_neigh.add(n_crimes_neigh, fill_value=0)

In [58]:
m = folium.Map(location=[chicago_location.latitude, chicago_location.longitude], tiles='Mapbox Bright', zoom_start=11)

folium.Choropleth(
    geo_data=geo_df,
    name='choropleth',
    data=n_crimes_total,
    columns=["pri_neigh", "id"],
    key_on='feature.properties.pri_neigh',
    fill_color='YlGn',
    fill_opacity=0.7,
    line_opacity=0.2
).add_to(m)

<folium.features.Choropleth at 0x190abe79708>

In [59]:
m.save("map.html")