In [10]:
import pandas as pd
import numpy as np
import scipy.stats as stat
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import math
import textdistance
import string

In [3]:
from research.strings import VIOLATIONS, MONTHS

In [37]:
# Load the canvass inspection records, then add a per-row summary column:
# "critical_found" is the row-wise maximum across the VIOLATIONS columns.
canvass_path = "data/canvass_records.csv.gz"
df_canvass = pd.read_csv(canvass_path)
violation_flags = df_canvass[VIOLATIONS]
df_canvass["critical_found"] = violation_flags.max(axis=1)
record_count = len(df_canvass)
print("Read {} canvass records.".format(record_count))

Read 50462 canvass records.


In [38]:
def clean_name(original, backup, fallback="nullname"):
    """Build a normalized, underscore-separated key from a business name.

    The first non-missing value among ``original``, ``backup`` and
    ``fallback`` (checked in that order) is case-folded, stripped of every
    character in ``string.punctuation``, and has each space replaced by an
    underscore.

    Args:
        original: Preferred name; skipped when ``pd.isna`` reports it missing.
        backup: Name used when ``original`` is missing.
        fallback: Value used when both names are missing.

    Returns:
        str: The cleaned name. Runs of spaces are NOT collapsed, so
        ``"A / B"`` becomes ``"a__b"`` (two underscores). Underscores already
        present in the input are removed, because ``_`` is punctuation.
    """
    name = fallback
    if not pd.isna(original):
        name = original
    elif not pd.isna(backup):
        name = backup
    # Non-string values (e.g. an all-digit name parsed as a number) have no
    # ``casefold``; coerce so they are cleaned instead of raising.
    folded = str(name).casefold()
    # Single pass over the text: delete punctuation, map spaces to "_".
    # The table is keyed on input characters, so the "_" produced for a space
    # is not deleted again even though "_" is itself punctuation.
    table = str.maketrans(" ", "_", string.punctuation)
    return folded.translate(table)

In [39]:
# Pair up each row's two candidate names, then derive one cleaned key per row
# (aka_name is passed as the preferred name, dba_name as the backup).
name_cols = ["aka_name", "dba_name"]
name_opts = df_canvass[name_cols].to_dict(orient="records")
df_canvass["clean_name"] = [
    clean_name(*(record[col] for col in name_cols)) for record in name_opts
]

In [172]:
# Count inspection_id entries per cleaned name and rank names from most to
# least frequent; all_names keeps the names in that same ranked order.
inspections_per_name = df_canvass.groupby("clean_name")["inspection_id"].count()
top_names = inspections_per_name.sort_values(ascending=False)
all_names = top_names.index.tolist()
top_names.head(70)

clean_name
subway                           1822
dunkin_donuts                     589
jimmy_johns                       230
mcdonalds                         204
chipotle_mexican_grill            192
starbucks_coffee                  176
potbelly_sandwich_works           168
corner_bakery_cafe                141
au_bon_pain                       141
subway_sandwiches                 126
potbelly_sandwich_works_llc       118
dunkin_donutsbaskin_robbins       116
starbucks                         113
dominos_pizza                     102
freshii                           100
popeyes                            88
dunkin_donuts_baskin_robbins       80
mc_donalds                         76
burger_king                        73
taco_bell                          69
see_thru_chinese_kitchen           69
protein_bar                        68
wendys                             67
dunkin_donuts__baskin_robbins      61
pizza_hut                          61
pockets                            60
p

In [171]:
# Print every cleaned name whose Jaro-Winkler similarity to the search term
# exceeds 0.85, in the order the names appear in all_names.
search = "afc_sushi"
close_matches = [
    candidate
    for candidate in all_names
    if textdistance.jaro_winkler(search, candidate) > 0.85
]
for match in close_matches:
    print(match)

afc_sushi
afc_sushijewelosco_3241
afc_sushi__walgreens
afc_sushi__dominicks
kai_sushi
afc_sushi__jewel_osco
ra_sushi
afc_sushi__marshall_fields
afc_sushi__university_center
afc_sushidominicks_1113
afc_sushi__dominicks_1100
afc_sushi__cafe_300
afc_sushiuniversity_of_chicago
afc_sushi__jewel_osco_3224


In [49]:
# Spot-check: Jaro similarity between a name and the same name with "_llc" appended.
base_name = "potbelly_sandwich_works"
llc_variant = "potbelly_sandwich_works_llc"
textdistance.jaro(base_name, llc_variant)

0.9506172839506174

In [168]:
# Show every canvass record whose cleaned name is exactly "main_kitchen__great".
name_filter = "clean_name == 'main_kitchen__great'"
df_canvass.query(name_filter)

Unnamed: 0,inspection_id,address,aka_name,city,dba_name,facility_type,inspection_date,inspection_type,latitude,license_id,...,V7,V8,V9,V10,V11,V12,V13,V14,critical_found,clean_name
11014,251222,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2010-06-23T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,0,main_kitchen__great
15781,2099137,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2017-10-19T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,0,main_kitchen__great
20170,1578216,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2015-09-28T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,1,main_kitchen__great
32753,1995219,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2017-03-08T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,0,main_kitchen__great
36171,1138777,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2013-01-22T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,0,main_kitchen__great
44493,1372362,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2014-01-29T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,0,main_kitchen__great
44837,1689252,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2016-03-08T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,0,main_kitchen__great
46029,567309,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2011-03-25T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,1,main_kitchen__great
49705,2160126,1 W WACKER DR,MAIN KITCHEN / GREAT,CHICAGO,RENAISSANCE CHICAGO HOTEL,Restaurant,2018-04-16T00:00:00.000,Canvass,41.886704,48221,...,0,0,0,0,0,0,0,0,1,main_kitchen__great
