In [1]:
'''
File name: project.ipynb
Author: ..., Mohamed Ndoye, Raphael Strebel
Date created: 03/11/2019
Date last modified: ...
Python Version: 3.7.4
''';

<a id="up"></a>
# Food Inspections in Chicago

 - [Load Databases](#load-databases)
 - [Complete Datasets](#complete-datasets)
 - [Basic Statistics](#basic-stats)

In [2]:
# useful : https://www.sustainabilist.com/blog/chicago-data-analysis-a-internship-project

import pandas as pd
import geopandas as gpd

from utils import constants as cst
from utils import clean_database
from utils import web_scraping as ws
from utils import areas_handler

# Set auto-reload 
%load_ext autoreload
%autoreload 2

<a id = 'load-databases'></a>
## Load Databases

In this section we load and clean the databases.

[Table of Contents](#up)

In [3]:
# Read the raw food-inspections CSV. `header=0` makes pandas discard the file's
# own header row so that the project's column names are used instead; rows with
# too many fields are dropped instead of raising.
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 and removed in
# 2.0 (replacement: on_bad_lines="skip") -- revisit when upgrading pandas.
food_inspections_DF = pd.read_csv(
    cst.FOOD_INSPECTIONS_PATH,
    header=0,
    names=cst.FOOD_INSPECTIONS_COL_NAMES,
    error_bad_lines=False,
)

In [4]:
# Clean the dataframe with the project helper from utils.clean_database; going
# by its name it removes columns that hold a single distinct value -- TODO confirm
food_inspections_DF = clean_database.drop_columns_with_one_value(food_inspections_DF)

# TODO: rows without coordinates still need their lat/lng values filled in

# Preview the cleaned frame
food_inspections_DF.head()

Unnamed: 0,inspection_id,DBA_name,AKA_name,license_num,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,result,violations,lat,lng,location
0,2320830,"THE HOXTON, CHICAGO","THE HOXTON, CHICAGO",2694640.0,Restaurant,Risk 2 (Medium),200 N GREEN ST,CHICAGO,IL,60607.0,2019-10-31T00:00:00.000,License,Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,41.885699,-87.648789,"{'latitude': '-87.64878908937915', 'longitude'..."
1,2320831,OGDEN PLAZA INC.,OGDEN PLAZA INC.,2475982.0,Grocery Store,Risk 3 (Low),3459 W OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,Canvass,Out of Business,,41.855266,-87.712402,"{'latitude': '-87.71240156240032', 'longitude'..."
2,2320829,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689756.0,Grocery Store,Risk 3 (Low),3455-3459 S OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,License,Not Ready,,,,
3,2320813,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689757.0,Grocery Store,Risk 3 (Low),3455-3459 S OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,,,
4,2320757,GADS HILL CENTER,GADS HILL CENTER,2698627.0,Daycare Above and Under 2 Years,Risk 1 (High),4255-4259 S ARCHER AVE,CHICAGO,IL,60632.0,2019-10-30T00:00:00.000,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.816005,-87.700893,"{'latitude': '-87.70089338917239', 'longitude'..."


In [5]:
# Read the socio-economic indicators CSV. `header=0` makes pandas discard the
# file's own header row so that the project's column names are used instead;
# rows with too many fields are dropped instead of raising.
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 and removed in
# 2.0 (replacement: on_bad_lines="skip") -- revisit when upgrading pandas.
socio_economic_DF = pd.read_csv(
    cst.SOCIO_ECONOMIC_INDICATORS_PATH,
    header=0,
    names=cst.SOCIO_ECONOMIC_COL_NAMES,
    error_bad_lines=False,
)

In [6]:
# Clean the dataframe with the project's socio-economic cleaning helper
# (implemented in utils.clean_database, not visible in this notebook)
socio_economic_DF = clean_database.clean_socio_economic_df(socio_economic_DF)

# Preview the cleaned frame
socio_economic_DF.head()

Unnamed: 0,community_area_num,community_area_name,housing_crowded_perc,housholds_below_poverty_perc,aged_16_or_more_unemployed_perc,aged_25_or_more_without_high_school_diploma_perc,aged_under_18_or_over_64_perc,per_capita_income,hardship_idx
0,1,rogers park,7.7,23.6,8.7,18.2,27.5,23939,39.0
1,2,west ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0
2,3,uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0
3,4,lincoln square,3.4,10.9,8.2,13.4,25.5,37524,17.0
4,5,north center,0.3,7.5,5.2,4.5,26.2,57123,6.0


In [7]:
# Read the life-expectancy CSV. `header=0` makes pandas discard the file's own
# header row so that the project's column names are used instead; rows with too
# many fields are dropped instead of raising.
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 and removed in
# 2.0 (replacement: on_bad_lines="skip") -- revisit when upgrading pandas.
life_expectancy_DF = pd.read_csv(
    cst.LIFE_EXPECTANCY_PATH,
    header=0,
    names=cst.LIFE_EXPECTANCY_COL_NAMES,
    error_bad_lines=False,
)

In [8]:
# Clean the dataframe.
# NOTE(review): this reuses the *socio-economic* cleaning helper on the
# life-expectancy frame -- confirm this is intentional (e.g. shared
# community-area clean-up) and not a copy-paste from the cell above.
life_expectancy_DF = clean_database.clean_socio_economic_df(life_expectancy_DF)

# Preview the cleaned frame
life_expectancy_DF.head()

Unnamed: 0,community_area_num,community_area_name,life_exp_1990,lower_95_perc_CI_1990,upper_95_perc_CI_1990,life_exp_2000,lower_95_perc_CI_2000,upper_95_perc_CI_2000,life_exp_2010,lower_95_perc_CI_2010,upper_95_perc_CI_2010
0,1,rogers park,70.9,69.9,71.9,73.1,72.2,74.1,77.3,76.3,78.2
1,2,west ridge,76.9,76.1,77.8,78.1,77.3,78.8,80.3,79.5,81.1
2,3,uptown,64.0,63.1,64.9,71.7,70.8,72.7,76.0,75.1,76.9
3,4,lincoln square,74.2,73.1,75.4,76.8,75.8,77.8,80.5,79.3,81.6
4,5,north center,73.4,72.1,74.7,77.9,76.6,79.1,81.5,80.1,82.8


<a id = 'complete-datasets'></a>
## Complete Datasets

### Two problems:
1. The life-expectancy and socio-economic DataFrames only give the area name, not coordinates -> for each area we need the sequence of lat/lng pairs that corresponds to its boundaries. We can then determine the area of each facility in the food_inspections DataFrame and work only with areas for the rest of the project (thoughts?).
2. Some entries in food_inspections_DF have no lat/lng pair -> we must find it from their address.

[Table of Contents](#up)

In [9]:
# Left-join the life-expectancy columns onto the socio-economic frame, matching
# rows on both the community area number and the community area name
socio_life_merged_DF = socio_economic_DF.merge(
    life_expectancy_DF,
    on=["community_area_num", "community_area_name"],
    how="left",
)

In [17]:
# Preview the merged frame (socio-economic columns followed by life-expectancy columns)
socio_life_merged_DF.head()

Unnamed: 0,community_area_num,community_area_name,housing_crowded_perc,housholds_below_poverty_perc,aged_16_or_more_unemployed_perc,aged_25_or_more_without_high_school_diploma_perc,aged_under_18_or_over_64_perc,per_capita_income,hardship_idx,life_exp_1990,lower_95_perc_CI_1990,upper_95_perc_CI_1990,life_exp_2000,lower_95_perc_CI_2000,upper_95_perc_CI_2000,life_exp_2010,lower_95_perc_CI_2010,upper_95_perc_CI_2010
0,1,rogers park,7.7,23.6,8.7,18.2,27.5,23939,39.0,70.9,69.9,71.9,73.1,72.2,74.1,77.3,76.3,78.2
1,2,west ridge,7.8,17.2,8.8,20.8,38.5,23040,46.0,76.9,76.1,77.8,78.1,77.3,78.8,80.3,79.5,81.1
2,3,uptown,3.8,24.0,8.9,11.8,22.2,35787,20.0,64.0,63.1,64.9,71.7,70.8,72.7,76.0,75.1,76.9
3,4,lincoln square,3.4,10.9,8.2,13.4,25.5,37524,17.0,74.2,73.1,75.4,76.8,75.8,77.8,80.5,79.3,81.6
4,5,north center,0.3,7.5,5.2,4.5,26.2,57123,6.0,73.4,72.1,74.7,77.9,76.6,79.1,81.5,80.1,82.8


In [15]:
# Load the areas dataframe with geopandas from the path configured in
# utils.constants (cst.AREAS_PATH)
areas_DF = gpd.read_file(cst.AREAS_PATH)

In [16]:
# Clean the dataframe with the project's areas cleaning helper
# (implemented in utils.clean_database, not visible in this notebook)
areas_DF = clean_database.clean_areas_df(areas_DF)

# Preview the cleaned frame, including its `geometry` column
areas_DF.head()

Unnamed: 0,community_area_num,community_area_name,shape_area,shape_len,geometry
0,35,douglas,46004620.0,31027.05451,"POLYGON ((-87.60914 41.84469, -87.60915 41.844..."
1,36,oakland,16913960.0,19565.506153,"POLYGON ((-87.59215 41.81693, -87.59231 41.816..."
2,37,fuller park,19916700.0,25339.08975,"POLYGON ((-87.62880 41.80189, -87.62879 41.801..."
3,38,grand boulevard,48492500.0,28196.837157,"POLYGON ((-87.60671 41.81681, -87.60670 41.816..."
4,39,kenwood,29071740.0,23325.167906,"POLYGON ((-87.59215 41.81693, -87.59215 41.816..."


In [18]:
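# Step 1: complete the missing lat/lng pairs from the address.

# TODO

# Step 2: insert the area number of every lat/lng pair into a new column.
# NOTE(review): the helper is named ..._from_lng_lat but is passed row['lat'] first -- check the argument order before enabling this line.
#food_inspections_DF[cst.AREA_NUM] = food_inspections_DF.apply(lambda row: areas_handler.get_area_num_from_lng_lat(row['lat'], row['lng'], areas_DF), axis=1)

# The new column will probably need an astype(int) afterwards.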

In [23]:
# Keep only the inspections whose latitude is missing; these are the rows that
# still need coordinates
food_unknown_loc = food_inspections_DF.loc[lambda df: df['lat'].isna()]
food_unknown_loc.head()

Unnamed: 0,inspection_id,DBA_name,AKA_name,license_num,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,result,violations,lat,lng,location
2,2320829,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689756.0,Grocery Store,Risk 3 (Low),3455-3459 S OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,License,Not Ready,,,,
3,2320813,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689757.0,Grocery Store,Risk 3 (Low),3455-3459 S OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,,,
212,2320308,O'HARE GASLIGHT CLUB,GASLIGHT CLUB (HILTON O'HARE LOBBY),43114.0,Restaurant,Risk 1 (High),4000 N O'HARE FIELD,CHICAGO,IL,60666.0,2019-10-22T00:00:00.000,Canvass,Pass,43. IN-USE UTENSILS: PROPERLY STORED - Comment...,,,
712,2315467,VEENO,VEENO,2578371.0,Restaurant,Risk 3 (Low),2009 S LAFIN ST,CHICAGO,IL,60608.0,2019-10-08T00:00:00.000,License,Fail,,,,
744,2315463,VEENO,VEENO,2578370.0,Restaurant,Risk 2 (Medium),2009 S LAFIN ST,CHICAGO,IL,60608.0,2019-10-08T00:00:00.000,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",,,


In [44]:
# Try the address lookup on a single hand-written address
addr = "3459 S OGDEN AVE"
# Disabled experiment (clean_address is not defined in the visible part of the notebook):
#clean_addr = clean_address(addr)
#print(clean_addr)

# NOTE(review): the value originally noted on this line (41.863365, -87.6899948)
# differs from the output recorded below; it matches what the lookup returned
# for the ranged form "3455-3459 S OGDEN AVE" in the table further down.
areas_handler.get_lat_lng_from_address(addr)

[41.8551952, -87.7121941821811]

In [51]:
# Reference: the true lat/lng on google maps is 41.866913, -87.683636,
# so the result from the google maps web parsing is not exact...


# Idea: try to remove "S", "AVE", "N",... from the address and request it on
# openstreetmap. The token list below is not used yet in this cell.
unknown_str = ['S', 'N', 'W', 'E', 'AVE', 'ST']
test = food_unknown_loc.copy()
# Only the first 10 addresses are looked up; because the assignment aligns on the
# index, every other row of `test` gets NaN in 'new_address'. Despite its name,
# the column holds the [lat, lng] result of the lookup.
test['new_address'] = food_unknown_loc['address'].head(10).apply(areas_handler.get_lat_lng_from_address)

In [53]:
# Show the first 10 rows, i.e. the ones for which a lookup was attempted above
test[:10]

Unnamed: 0,inspection_id,DBA_name,AKA_name,license_num,facility_type,risk,address,city,state,zip,inspection_date,inspection_type,result,violations,lat,lng,location,new_address
2,2320829,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689756.0,Grocery Store,Risk 3 (Low),3455-3459 S OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,License,Not Ready,,,,,"[41.863365, -87.6899948]"
3,2320813,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689757.0,Grocery Store,Risk 3 (Low),3455-3459 S OGDEN AVE,CHICAGO,IL,60623.0,2019-10-31T00:00:00.000,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,,,,"[41.863365, -87.6899948]"
212,2320308,O'HARE GASLIGHT CLUB,GASLIGHT CLUB (HILTON O'HARE LOBBY),43114.0,Restaurant,Risk 1 (High),4000 N O'HARE FIELD,CHICAGO,IL,60666.0,2019-10-22T00:00:00.000,Canvass,Pass,43. IN-USE UTENSILS: PROPERLY STORED - Comment...,,,,"[None, None]"
712,2315467,VEENO,VEENO,2578371.0,Restaurant,Risk 3 (Low),2009 S LAFIN ST,CHICAGO,IL,60608.0,2019-10-08T00:00:00.000,License,Fail,,,,,"[None, None]"
744,2315463,VEENO,VEENO,2578370.0,Restaurant,Risk 2 (Medium),2009 S LAFIN ST,CHICAGO,IL,60608.0,2019-10-08T00:00:00.000,License,Fail,"3. MANAGEMENT, FOOD EMPLOYEE AND CONDITIONAL E...",,,,"[None, None]"
1512,2312470,Carson (Old),Carson (Old),22601.0,School,Risk 1 (High),5516 S Maplewood (2532W) AVE,chicago,IL,60629.0,2019-09-18T00:00:00.000,Canvass Re-Inspection,Pass,"55. PHYSICAL FACILITIES INSTALLED, MAINTAINED ...",,,,"[None, None]"
1608,2312357,"PACINO'S RC, LLC",PACINOS,2683667.0,Restaurant,Risk 1 (High),1010 S DELANO CT,CHICAGO,IL,60605.0,2019-09-16T00:00:00.000,License Re-Inspection,Pass,,,,,"[None, None]"
1635,2312360,"PACINO'S RC, LLC",PACINOS,2683669.0,Restaurant,Risk 3 (Low),1010 S DELANO CT,CHICAGO,IL,60605.0,2019-09-16T00:00:00.000,License Re-Inspection,Pass,,,,,"[None, None]"
1637,2312358,"PACINO'S RC, LLC",PACINOS,2683668.0,Restaurant,Risk 3 (Low),1010 S DELANO CT,CHICAGO,IL,60605.0,2019-09-16T00:00:00.000,License Re-Inspection,Pass,,,,,"[None, None]"
1841,2312072,Carson (Old),Carson (Old),22601.0,School,Risk 1 (High),5516 S Maplewood (2532W) AVE,chicago,IL,60629.0,2019-09-11T00:00:00.000,Canvass,Fail,10. ADEQUATE HANDWASHING SINKS PROPERLY SUPPLI...,,,,"[None, None]"


In [None]:
# "4000 N O'HARE FIELD" is not recognized by the address lookup, but "4000 O'HARE" is.
# "2009 S LAFIN ST" looks like a typo in the data: it should be LAFLIN street, not LAFIN street.

<a id = 'basic-stats'></a>
## Basic Statistics

We report some statistics on the various dataframes.

[Table of Contents](#up)