In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
from requests import get
from bs4 import BeautifulSoup
from scipy import stats
import warnings
# NOTE(review): this silences every warning category for the whole kernel
# session, including pandas warnings raised by later cells. Consider narrowing
# it to specific categories once the notebook is stable.
warnings.filterwarnings('ignore')

**Dataframe Preparation**

In [6]:
# Load the raw inspection records from the comma-delimited CSV export.
food = pd.read_csv(
    'data/food-inspections.csv',
    sep=',',
)

In [7]:
# Discard the address and administrative-boundary columns; Latitude and
# Longitude are not in this list and therefore stay in the frame.
food = food.drop(
    columns=[
        'Address', 'City', 'State', 'Zip', 'Zip Codes',
        'Historical Wards 2003-2015', 'Community Areas',
        'Census Tracts', 'Wards', 'Location',
    ]
)

In [8]:
# Preview the first five rows of the trimmed frame.
food.head(n=5)

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2320830,"THE HOXTON, CHICAGO","THE HOXTON, CHICAGO",2694640.0,Restaurant,Risk 2 (Medium),2019-10-31T00:00:00.000,License,Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,41.885699,-87.648789
1,2320831,OGDEN PLAZA INC.,OGDEN PLAZA INC.,2475982.0,Grocery Store,Risk 3 (Low),2019-10-31T00:00:00.000,Canvass,Out of Business,,41.855266,-87.712402
2,2320829,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689756.0,Grocery Store,Risk 3 (Low),2019-10-31T00:00:00.000,License,Not Ready,,,
3,2320813,PLAZA FOOD AND LIQUOR,PLAZA FOOD AND LIQUOR,2689757.0,Grocery Store,Risk 3 (Low),2019-10-31T00:00:00.000,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,,
4,2320757,GADS HILL CENTER,GADS HILL CENTER,2698627.0,Daycare Above and Under 2 Years,Risk 1 (High),2019-10-30T00:00:00.000,License,Fail,5. PROCEDURES FOR RESPONDING TO VOMITING AND D...,41.816005,-87.700893


**Cleaning steps**

1) Drop rows without location.

2) Filter only the 4 facility types we are going to analyse: restaurants, grocery stores, schools and hospitals.

3) Convert the inspection date to an analysis-friendly format by keeping only the year.

4) For restaurants, we are going to focus only on the 5 most inspected chains of different types: McDonald's, Subway, Taco Bell, Starbucks and Dunkin' Donuts.

**Step 1**

In [9]:
# True when at least one row is missing its latitude or its longitude.
food[["Latitude", "Longitude"]].isna().any().any()

True

In [10]:
# Remove every row that lacks a latitude or a longitude, then re-run the
# missing-coordinate check to confirm none remain.
food = food.dropna(subset=["Latitude", "Longitude"])
food[["Latitude", "Longitude"]].isna().any().any()

False

**Step 2**

In [26]:
# Keep only the four facility types under study. The explicit .copy() makes
# the filtered result an independent frame, so the later assignment to the
# "Inspection Date" column does not operate on a slice of the previous frame
# (which pandas reports as SettingWithCopyWarning).
food = food[
    food["Facility Type"].isin(["Restaurant", "Grocery Store", "School", "Hospital"])
].copy()

# Number of remaining inspections per facility type.
food["Facility Type"].value_counts()

Restaurant       129891
Grocery Store     24838
School            11808
Hospital            537
Name: Facility Type, dtype: int64

**Step 3**

In [27]:
# Keep only the text before the first "-" of each inspection timestamp
# (the year in the "YYYY-MM-DD..." values shown above); the column stays a
# string column.
inspection_year = food["Inspection Date"].str.split("-", n=1).str[0]
food["Inspection Date"] = inspection_year
food.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude
0,2320830,"THE HOXTON, CHICAGO","THE HOXTON, CHICAGO",2694640.0,Restaurant,Risk 2 (Medium),2019,License,Pass,36. THERMOMETERS PROVIDED & ACCURATE - Comment...,41.885699,-87.648789
1,2320831,OGDEN PLAZA INC.,OGDEN PLAZA INC.,2475982.0,Grocery Store,Risk 3 (Low),2019,Canvass,Out of Business,,41.855266,-87.712402
6,2320795,TBD,THE EXCHANGE,2698572.0,Restaurant,Risk 1 (High),2019,License,Pass,,41.887529,-87.632647
7,2320768,The Manor,The Manor,22971.0,Restaurant,Risk 1 (High),2019,Canvass,No Entry,,41.807924,-87.728164
9,2320719,"4884 S ARCHER INC,.","4884 S ARCHER INC,.",2678088.0,Grocery Store,Risk 3 (Low),2019,License,Fail,,41.804621,-87.719907


In [28]:
# Verify that no row ended up without an inspection year.
food["Inspection Date"].isnull().any()

False

**Step 4**