In [1]:
'''
File name: project.ipynb
Author: ...Jose, Mohamed Ndoye, Raphael Strebel
Date created: 03/11/2019
Date last modified: ...
Python Version: 3.7.4
''';

<a id="up"></a>
# Food Inspections in Chicago

 - [Load Databases](#load-databases)
 - [Complete Datasets](#complete-datasets)
 - [Basic Statistics](#basic-stats)

In [2]:
# useful : https://www.sustainabilist.com/blog/chicago-data-analysis-a-internship-project

import pandas as pd

# TODO : Add to README "download libraries geopandas, vincent,..." with a short description to explain its use
import geopandas as gpd

import vincent
vincent.core.initialize_notebook() 

from utils import constants as cst
from utils import clean_database
from utils import web_scraping_google_maps as ws
from utils import areas_handler


import folium
import json
import math

# Set auto-reload 
%load_ext autoreload
%autoreload 2

<a id = 'load-databases'></a>
## Load Databases

In this section we load and clean the databases.

[Table of Contents](#up)

In [None]:
# Load the raw food inspections CSV. `header=0` together with `names=` replaces
# the file's own header row with the project's column names; malformed lines
# are skipped rather than raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"` and removed in pandas 2.0 - revisit when upgrading.
food_inspections_DF = pd.read_csv(
    cst.FOOD_INSPECTIONS_PATH,
    sep=',',
    header=0,
    names=cst.FOOD_INSPECTIONS_COL_NAMES,
    index_col=None,
    error_bad_lines=False,
)

In [None]:
# Clean the dataframe with the project helper drop_columns_with_one_value
# (presumably drops columns that hold a single constant value - confirm in utils/clean_database)
food_inspections_DF = clean_database.drop_columns_with_one_value(food_inspections_DF)

# TODO: rows with missing lat/lng values still have to be completed

food_inspections_DF.head()

In [None]:
# Load the raw socio-economic indicators CSV. `header=0` together with `names=`
# replaces the file's own header row with the project's column names; malformed
# lines are skipped rather than raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"` and removed in pandas 2.0 - revisit when upgrading.
socio_economic_DF = pd.read_csv(
    cst.SOCIO_ECONOMIC_INDICATORS_PATH,
    sep=',',
    header=0,
    names=cst.SOCIO_ECONOMIC_COL_NAMES,
    index_col=None,
    error_bad_lines=False,
)

In [None]:
# Clean the socio-economic dataframe with the project helper, then preview the first rows
socio_economic_DF = clean_database.clean_socio_economic_df(socio_economic_DF)

socio_economic_DF.head()

In [None]:
# Load the raw life expectancy CSV. `header=0` together with `names=` replaces
# the file's own header row with the project's column names; malformed lines
# are skipped rather than raising.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines="skip"` and removed in pandas 2.0 - revisit when upgrading.
life_expectancy_DF = pd.read_csv(
    cst.LIFE_EXPECTANCY_PATH,
    sep=',',
    header=0,
    names=cst.LIFE_EXPECTANCY_COL_NAMES,
    index_col=None,
    error_bad_lines=False,
)

In [None]:
# Clean the life expectancy dataframe, then preview the first rows.
# NOTE(review): this reuses clean_socio_economic_df on the life expectancy data -
# confirm the helper is meant to handle both dataframes.
life_expectancy_DF = clean_database.clean_socio_economic_df(life_expectancy_DF)

life_expectancy_DF.head()

<a id = 'complete-datasets'></a>
## Complete Datasets

### 2 problems : 
1. We only have the area name in the life expectancy and socio-economic dataframes -> find, for each area, the sequence of lat/lng pairs that corresponds to its boundaries. We can then determine the area of each facility in the food_inspections dataframe and work only with areas for the rest of the project (thoughts?).
2. some entries in food_inspections_DF have no lat/lng pair -> must find it given their address

[Table of Contents](#up)

In [None]:
# Left-join the life expectancy data onto the socio-economic indicators,
# matching rows on both the community area number and the community area name.
socio_life_merged_DF = pd.merge(
    socio_economic_DF,
    life_expectancy_DF,
    how="left",
    on=["community_area_num", "community_area_name"],
)

In [None]:
# Preview the first rows of the merged socio-economic / life expectancy dataframe
socio_life_merged_DF.head()

In [None]:
# Load the areas file into a GeoDataFrame
# (presumably the community area boundaries - confirm what cst.AREAS_PATH points to)
areas_DF = gpd.read_file(cst.AREAS_PATH)

In [None]:
# Clean the areas dataframe with the project helper, then preview the first rows
areas_DF = clean_database.clean_areas_df(areas_DF)
areas_DF.head()

In [None]:
# Keep only the inspections whose latitude is missing
missing_lat_mask = food_inspections_DF['lat'].isna()
food_unknown_loc = food_inspections_DF[missing_lat_mask]

In [43]:
# Get the unknown locations via the project helper.
# The result is used below as a dataframe with 'address', 'lat' and 'lng' columns.
# NOTE(review): presumably this geocodes each address through an external service - confirm in utils/areas_handler.
unknown_locations = areas_handler.get_unknown_locations(food_unknown_loc)

In [44]:
# Preview the first looked-up addresses and their coordinates
unknown_locations.head()

Unnamed: 0,address,lat,lng
0,3455-3459 S OGDEN AVE,41.804064,-88.05254
1,4000 N O'HARE FIELD,41.973101,-87.906768
2,6237 S HALSTED PKWY,40.517733,-88.940193
3,2009 S LAFIN ST,41.854799,-87.663454
4,7141 S Morgan (1000W) ST,41.763645,-87.648939


In [31]:
# Check the locations that were not found by OpenStreetMaps.
#
# For some locations the results are questionable even when looking them up manually:
#   - 7911 S WOODS BLDG: no idea
#   - 2300 N Childrens Plaza PLZ BLDG: technically not the same building, but the same area
#   - 9513 S RIDGELAND AVE STE 3E: not sure about this one
#   - 2249 S 22ND AVE: shows a private house, whereas SUGARPIE BAKERY AND CAFE LLC
#     is in a completely different place
not_found_mask = unknown_locations['lat'].isna()
unknown_locations[not_found_mask]

Unnamed: 0,address,lat,lng
9,2011 N GRIFFIN BLVD,,
10,65 CARMINE ST,,
14,N2660 HAYTON RD,,
22,39063 N OGDEN LN,,
76,7911 S WOODS BLDG,,
115,9513 S RIDGELAND AVE STE 3E,,
123,2249 S 22ND AVE,,


In [32]:
# Display the looked-up ("uncertain") locations next to a sample of inspections
# whose coordinates were already known ("certain"), on top of the area outlines.
chicago = [41.8333925, -87.7121486]
# Named `locations_map` rather than `map` so the builtin `map` is not shadowed.
locations_map = folium.Map(location=chicago)

# Read the GeoJSON through a context manager so the file handle is closed.
with open(cst.AREAS_GEOJSON_PATH) as geojson_file:
    regiondata = json.load(geojson_file)
folium.GeoJson(regiondata).add_to(locations_map)

# Two layers that can be toggled from the layer control; both start hidden.
uncertain_locations_feature = folium.FeatureGroup(name='Uncertain Points', show=False)
locations_map.add_child(uncertain_locations_feature)
certain_locations_feature = folium.FeatureGroup(name='Certain Points', show=False)
locations_map.add_child(certain_locations_feature)
folium.LayerControl().add_to(locations_map)

# Skip rows where either coordinate is missing: a Marker needs both values.
uncertain_points = unknown_locations.dropna(subset=['lat', 'lng'])
for lat, lng in zip(uncertain_points['lat'], uncertain_points['lng']):
    folium.Marker([lat, lng]).add_to(uncertain_locations_feature)

# Just as a comparison, also show the first 1000 inspections that have known locations
certain_points = food_inspections_DF.head(1000).dropna(subset=['lat', 'lng'])
for lat, lng in zip(certain_points['lat'], certain_points['lng']):
    folium.Marker([lat, lng], icon=folium.Icon(color='red')).add_to(certain_locations_feature)

locations_map

FileNotFoundError: [Errno 2] No such file or directory: '../data/Boundaries - Community Areas (current).json'

In [17]:
# Use unknown_locations to fill lat and lng in the original dataframe food_inspections_DF.

# Join the looked-up coordinates on the address. The original row labels travel
# through the merge as the 'index' column and are restored afterwards; the empty
# lat/lng columns from the left side (suffix _x) are replaced by the looked-up
# ones (suffix _y).
food_unknown_loc = (
    food_unknown_loc
    .reset_index()
    .merge(unknown_locations, on="address", how='left')
    .set_index('index')
    .drop(columns=['lat_x', 'lng_x'])
    .rename(columns={'lat_y': 'lat', 'lng_y': 'lng'})
)

# update() modifies food_inspections_DF in place, aligning on the index and
# taking only the non-NA values of food_unknown_loc.
food_inspections_DF.update(food_unknown_loc)

# Rows whose latitude is still missing after the update
still_missing_mask = food_inspections_DF['lat'].isna()
food_inspections_DF[still_missing_mask]


Unnamed: 0,inspection_id,DBA_name,AKA_name,license_num,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,result,violations,lat,lng,location
200,2320829.0,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689756.0,Grocery Store,Risk 3 (Low),3455-3459 S OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,License,Not Ready,,,,
209,2320813.0,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689757.0,Grocery Store,Risk 3 (Low),3455-3459 S OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,,,
1146,2315463.0,VEENO,VEENO,2578370.0,Restaurant,Risk 2 (Medium),2009 S LAFIN ST,CHICAGO,IL,60608.0,2019-10-08T00:00:00.000,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",,,
1191,2315467.0,VEENO,VEENO,2578371.0,Restaurant,Risk 3 (Low),2009 S LAFIN ST,CHICAGO,IL,60608.0,2019-10-08T00:00:00.000,License,Fail,,,,
4940,2301197.0,ICE CREAM ON WHEELS,ICE CREAM ON WHEELS,2678115.0,Mobile Food Dispenser,Risk 3 (Low),2011 N GRIFFIN BLVD,GRIFFITH,IN,46319.0,2019-07-16T00:00:00.000,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,,,
5665,2300474.0,BUSCIA'S,BUSCIA'S,2659186.0,,Risk 1 (High),65 CARMINE ST,NEW YORK,NY,10014.0,2019-06-28T00:00:00.000,License,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",,,
6337,2293796.0,THE COOKIE CRATE,THE COOKIE CRATE,2652809.0,Shared Kitchen User (Long Term),Risk 2 (Medium),N2660 HAYTON RD,NEW HOLSTEIN,WI,53061.0,2019-06-18T00:00:00.000,Canvass,Pass w/ Conditions,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",,,
10611,2282467.0,HADLEY HAPPENINGS,HADLEY HAPPENINGS,2652353.0,Shared Kitchen User (Long Term),Risk 1 (High),39063 N OGDEN LN,WADSWORTH,IL,60083.0,2019-04-03T00:00:00.000,Canvass,Pass,49. NON-FOOD/FOOD CONTACT SURFACES CLEAN - Com...,,,
83348,1575717.0,EMANUEL EARLY CHILDHOOD CENTER,,1979967.0,Daycare Combo 1586,Risk 1 (High),7911 S WOODS BLDG,CHICAGO,IL,60620.0,2015-09-17T00:00:00.000,Canvass,Out of Business,,,,
159684,659681.0,MEXICANDY DISTRIBUTOR INC,MEXICANDY DISTRIBUTOR INC,2130311.0,Shared Kitchen User (Long Term),Risk 3 (Low),9513 S RIDGELAND AVE STE 3E,OAK LAWN,IL,60453.0,2011-12-13T00:00:00.000,License,Pass,,,,


In [19]:
# Resolve the area number of every inspection and drop the rows for which no
# area number was found. This takes a while: the lookup runs row by row.
# NOTE(review): the helper is named get_area_num_from_lng_lat but is called with
# (lat, lng) - confirm the argument order against utils/areas_handler.
area_nums = food_inspections_DF.apply(
    lambda row: areas_handler.get_area_num_from_lng_lat(row['lat'], row['lng'], areas_DF),
    axis=1,
)

# assign()/astype() return new dataframes, so no value is ever written into a
# frame that may be a slice of another one (the cause of SettingWithCopyWarning).
food_inspections_DF = food_inspections_DF.assign(**{cst.AREA_NUM: area_nums})
print("Number of locations: " + str(food_inspections_DF.shape[0]))
food_inspections_DF = food_inspections_DF.dropna(subset=[cst.AREA_NUM])
print("Number of locations in the city of chicaco: " + str(food_inspections_DF.shape[0]))
food_inspections_DF = food_inspections_DF.astype({cst.AREA_NUM: int})

Number of locations: 192646
Number of locations in the city of chicaco: 192646


In [21]:
# TODO: a SettingWithCopyWarning was reported around here
# ("A value is trying to be set on a copy of a slice from a DataFrame.
#  Try using .loc[row_indexer,col_indexer] = value instead").

# Create a new dataframe with the number of inspections per area:
#   'index'      -> the area number, converted to a string
#   cst.AREA_NUM -> the number of inspections in that area
# The column names are set explicitly (rename_axis / reset_index(name=...)), so
# they do not depend on how the installed pandas version names the output of
# value_counts().
# value_counts() already orders the rows by descending count; the original
# un-assigned sort_values() call had no effect and was dropped.
inspection_counts = (
    food_inspections_DF[cst.AREA_NUM]
    .value_counts()
    .rename_axis('index')
    .reset_index(name=cst.AREA_NUM)
    .astype({'index': str})
)
inspection_counts.head()

Unnamed: 0,index,community_area_num
0,8,14393
1,32,12660
2,28,10867
3,6,10149
4,24,8860


In [22]:
# Plot the number of inspections per area as a choropleth over the area outlines
chicago = [41.8333925, -87.7121486]
# Named `inspections_map` rather than `map` so the builtin `map` is not shadowed.
inspections_map = folium.Map(location=chicago)

# Read the GeoJSON through a context manager so the file handle is closed.
with open(cst.AREAS_GEOJSON_PATH) as geojson_file:
    regiondata = json.load(geojson_file)
folium.GeoJson(regiondata).add_to(inspections_map)

# Each row of inspection_counts is matched to a GeoJSON feature through the
# feature's 'area_numbe' property (first entry of `columns` is the key column,
# the second one the value to colour by).
folium.Choropleth(
    geo_data=regiondata,
    data=inspection_counts,
    columns=['index', cst.AREA_NUM],
    key_on='feature.properties.area_numbe',
    fill_color='YlOrRd',
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Number of inspections per region',
).add_to(inspections_map)
inspections_map

<a id = 'basic-stats'></a>
## Basic Statistics

We report some statistics on the various dataframes.

[Table of Contents](#up)

In [None]:
# Pairwise correlation matrix of the socio-economic metrics
socioeconomic_metrics_DF = socio_life_merged_DF[cst.SOCIOECONOMIC_METRICS]
corr = socioeconomic_metrics_DF.corr()
corr

In [None]:
# Check the sign of every pairwise correlation against the expected one:
#   - two "bad" metrics, or two "good" metrics, should be positively correlated;
#   - a "good" and a "bad" metric should be negatively correlated.
# Metrics that belong to neither set are ignored.
bad_metrics = {
    'housing_crowded_perc',
    'housholds_below_poverty_perc',
    'aged_16_or_more_unemployed_perc',
    'aged_25_or_more_without_high_school_diploma_perc',
    'hardship_idx',
    'aged_under_18_or_over_64_perc',
}
good_metrics = {'per_capita_income', 'life_exp_2010'}
sign_kept = True

for c1 in cst.SOCIOECONOMIC_METRICS:
    for c2 in cst.SOCIOECONOMIC_METRICS:
        # Look up the correlation of the pair being examined (column c1, row c2).
        # The original indexed corr[c][c] with a name `c` that is not defined in this cell.
        pair_corr = corr.loc[c2, c1]
        same_group = (c1 in bad_metrics and c2 in bad_metrics) or (c1 in good_metrics and c2 in good_metrics)
        opposite_groups = (c1 in bad_metrics and c2 in good_metrics) or (c1 in good_metrics and c2 in bad_metrics)
        if same_group and pair_corr < 0:
            sign_kept = False
        elif opposite_groups and pair_corr > 0:
            sign_kept = False
print(sign_kept)

In [None]:
#set correlation between each variable and itself to None in order to ignore it later
for c in corr.columns:
    corr[c][c] = None 
    
corrmax =pd.DataFrame(corr.idxmax()).rename({0: 'Strongest positive correlation'}, axis = 1)
corrmax['Correlation value'] = corr.max()
corrmax

In [None]:
# For every metric: the metric it is most negatively correlated with, and that correlation value
strongest_negative = corr.idxmin().to_frame('Strongest negative correlation')
corrmin = strongest_negative.assign(**{'Correlation value': corr.min()})
corrmin

From the above correlations, we notice certain things. Firstly, we can classify the indicators as either good (life expectancy and per capita income) or bad (percentage of crowded houses, percentage of households below the poverty line, percentage of unemployed people aged 16 or more, percentage of people aged 25 or more without a high school diploma, the hardship index, and the percentage of people under 18 or over 64). The correlation between two indicators that are both good or both bad is always positive, whereas the correlation between a good and a bad indicator is always negative.

We also notice that the percentage of people under 18 or over 64 is a strong negative indicator: it is more negatively correlated to per capita income than, for example, the percentage of households living below the poverty line.

It is indeed quite surprising that per capita income is not more strongly correlated to the percentage of households living below the poverty line (the correlation is -0.56). We plot the 2 metrics in order to see this:

One reason the linear correlation is so low is that the relationship is exponential. Also, the 5 neighbourhoods with the highest per capita income are not among the 15 neighbourhoods with the lowest percentage of poor households. TODO: why does this happen? Where are these neighbourhoods (very downtown)? What are some other indicators in these 'mixed' (rich and poor people) neighbourhoods? This is a cool direction to go in, I think.

In [None]:
# Scatter plot of per capita income against the percentage of households below
# the poverty line; the poverty column is passed as `iter_idx`.
income_vs_poverty_DF = socio_life_merged_DF[['per_capita_income', 'housholds_below_poverty_perc']]
scatter = vincent.Scatter(income_vs_poverty_DF, iter_idx='housholds_below_poverty_perc')

In [None]:
# Label the axes of the scatter plot; as the last expression of the cell, the result is displayed
scatter.axis_titles(x='Percentage of households below the poverty line', y='per_capita_income')


TODO: You cannot click in the above plot. It would be cool to be able to click on a point and see the name of the neighbourhood. How can we do this with vincent? I know how to do it with plotly, but they said to use vincent.

CONTINUATION: finish the socioeconomic things (average, standard deviation, max, min; how many people live in poor areas, how many people live in bad areas, etc.). Maybe classify the areas into 4 groups manually. Plot where all this is.

Once socioeconomic things are done, let's look at food things (inspections: where are they happening)?

Finally, look at correlation between the 2.
