In [12]:
import pandas as pd
import numpy as np
import scipy.stats as stat
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import math
from collections import defaultdict

In [13]:
from research.strings import TARGET, INSPECTION_ID, DATE, PREDICTORS, VIOLATIONS, MONTHS
from research.evaluation import evaluate_model
from research.utils import get_crit_mat
from research.metrics import to_inspection_daynum, get_days_by_score

In [14]:
# Load the canvass and complaint inspection records.
df_canvass = pd.read_csv("data/canvass_records.csv.gz")
df_complaint = pd.read_csv("data/complaint_records.csv.gz")

# Apply the same clean-up to both frames:
#   * missing establishment names become the sentinel "NULL_NAME";
#   * critical_found is the row-wise maximum over the VIOLATIONS columns.
for records in (df_canvass, df_complaint):
    records["aka_name"] = records["aka_name"].fillna("NULL_NAME")
    records["critical_found"] = records[VIOLATIONS].max(axis=1)

print("Read {} canvass records.".format(df_canvass.shape[0]))
print("Read {} complaint records.".format(df_complaint.shape[0]))

Read 50462 canvass records.
Read 17088 complaint records.


In [24]:
# Test window is the half-open interval [start_date, end_date).
# The records were read without date parsing, so this filter compares
# inspection_date as strings; that orders correctly only while the column
# keeps its ISO-8601 formatting.
start_date = "2014-09-01"
end_date = "2014-11-01"
date_query = "inspection_date >= '{}' and inspection_date < '{}'".format(start_date, end_date)

# Apply the identical filter to both record sets.
test_canv, test_comp = (
    records.query(date_query) for records in (df_canvass, df_complaint)
)

print("Found {} canvass records during test set.".format(test_canv.shape[0]))
print("Found {} complaint records during test set.".format(test_comp.shape[0]))

Found 1368 canvass records during test set.
Found 352 complaint records during test set.


In [16]:
# Read the original test set from disk.
test_info = pd.read_csv("data/inspections_test.csv")
print("Read {} records from the original test set.".format(test_info.shape[0]))

# Build the critical-violation matrix with the project helper; the column
# count is reported before the inspection-ID column is attached below.
crit_mat = get_crit_mat(test_info)
print("Found {} critical violation labels.".format(crit_mat.shape[1]))

# Attach the inspection ID to the matrix, then join both frames on that key.
inspection_id = INSPECTION_ID
crit_mat[inspection_id] = test_info[inspection_id]
test_by_id = test_info.set_index(inspection_id)
crit_by_id = crit_mat.set_index(inspection_id)
merged = test_by_id.join(crit_by_id)

# Restore the ID as an ordinary column and make license IDs integers so they
# can be compared with the license IDs of the other record sets.
d_test = merged.reset_index()
d_test["license_id"] = d_test["license_id"].astype(int)
print("Merged Test Set: {} records.".format(d_test.shape[0]))

Read 1637 records from the original test set.
Found 14 critical violation labels.
Merged Test Set: 1637 records.


In [17]:
# Score the original test set with the pickled classifier and report metrics.
y_test = d_test[TARGET]
X_test = d_test[PREDICTORS]

# Open the model file in a context manager so the handle is closed
# deterministically (the bare open() call leaked it).
# SECURITY: pickle.load can execute arbitrary code from the file; only load
# model files that come from a trusted source.
with open("models/sklearn-glm.pkl", "rb") as model_file:
    clf = pickle.load(model_file)

y_pred = clf.predict(X_test)
# Column 1 of predict_proba is used as the ranking score.
# NOTE(review): presumably the positive class — confirm against clf.classes_.
y_score = clf.predict_proba(X_test)[:, 1]
evaluate_model(y_test, y_pred)

F1 Score = 0.08664
Precision = 0.63158
Recall = 0.04651


Unnamed: 0,Predicted +,Predicted -
Actual +,12,246
Actual -,7,1372


In [18]:
# Convert the test-set inspection dates into day numbers with the project helper.
# NOTE(review): the output looks like integer day offsets — confirm the
# reference day in research.metrics.to_inspection_daynum.
days = to_inspection_daynum(d_test[DATE])
days

array([10, 25, 32, ..., 60, 56, 60])

In [20]:
# Reassign the inspection day numbers according to the model score.
# NOTE(review): presumably the highest-scoring records receive the earliest
# days — confirm in research.metrics.get_days_by_score.
get_days_by_score(days, y_score)

array([14, 25, 37, ..., 59, 60, 37])

In [21]:
# Same reassignment, applied to the raw inspection_date column instead of
# day numbers.
# NOTE(review): ordering semantics live in research.metrics.get_days_by_score — confirm there.
get_days_by_score(d_test["inspection_date"], y_score)

array(['2014-09-15T00:00:00.000', '2014-09-26T00:00:00.000',
       '2014-10-08T00:00:00.000', ..., '2014-10-30T00:00:00.000',
       '2014-10-31T00:00:00.000', '2014-10-08T00:00:00.000'], dtype=object)

In [26]:
# License IDs present in the original test set ...
lids_test = set(d_test["license_id"])
# ... and in the complaint records that fall inside the test window.
lids_comp = set(test_comp["license_id"])

# Establishments that appear in both sets.
lids_ix = lids_test & lids_comp
len(lids_ix)

14

In [27]:
# Re-date every test-set inspection by model score, then compare the original
# and re-assigned dates for the establishments in lids_ix.
# NOTE(review): the ordering rule is defined by the project helper
# get_days_by_score — confirm its semantics in research.metrics.
d_test["new_date"] = get_days_by_score(d_test["inspection_date"], y_score)

# Vectorized membership test instead of a per-row Python loop.
mask = d_test["license_id"].isin(lids_ix)

# Select rows and columns in a single .loc call and take an explicit copy, so
# that adding a column below writes to an independent frame rather than to a
# chained-indexing slice of d_test (which triggers SettingWithCopyWarning).
focus = d_test.loc[
    mask, ["license_id", "aka_name", TARGET, "inspection_date", "new_date"]
].copy()

# pd.to_datetime replaces .astype(np.datetime64): casting to a unit-less
# datetime64 dtype is rejected by current pandas.
old_date = pd.to_datetime(focus["inspection_date"])
new_date = pd.to_datetime(focus["new_date"])

# Positive values mean the re-assigned date falls before the original one.
focus["days_earlier"] = (old_date - new_date).dt.days
focus

Unnamed: 0,license_id,aka_name,criticalFound,inspection_date,new_date,days_earlier
30,1820926,DUNKIN DONUTS,1.0,2014-09-25T00:00:00.000,2014-09-22T00:00:00.000,3
100,2060861,MURRAY'S FISH AND CHICKEN,0.0,2014-09-24T00:00:00.000,2014-10-27T00:00:00.000,-33
254,56642,DUKES,0.0,2014-09-22T00:00:00.000,2014-10-28T00:00:00.000,-36
325,2114331,FRONTERA TORTAS BY RICK BAYLESS (T3 K4),0.0,2014-09-12T00:00:00.000,2014-10-03T00:00:00.000,-21
387,9278,SHAW'S CRAB HOUSE,0.0,2014-09-04T00:00:00.000,2014-09-09T00:00:00.000,-5
400,1650274,JOY YEE NOODLE,1.0,2014-09-15T00:00:00.000,2014-09-08T00:00:00.000,7
565,1168258,THE BEER BISTRO,0.0,2014-09-16T00:00:00.000,2014-09-26T00:00:00.000,-10
760,2088536,CARAVAN,1.0,2014-10-02T00:00:00.000,2014-09-04T00:00:00.000,28
854,1905309,PONCE,0.0,2014-09-08T00:00:00.000,2014-09-29T00:00:00.000,-21
859,33144,MCDONALD'S #11290,1.0,2014-09-08T00:00:00.000,2014-10-02T00:00:00.000,-24


In [28]:
# Complaint inspections in the test window for the establishments in lids_ix,
# ordered by inspection date.
# Series.isin replaces the per-row Python membership loop.
comp_mask = test_comp["license_id"].isin(lids_ix)

# One .loc call selects rows and columns together instead of chained indexing.
test_comp.loc[
    comp_mask, ["license_id", "aka_name", "critical_found", "inspection_date"]
].sort_values("inspection_date")

Unnamed: 0,license_id,aka_name,critical_found,inspection_date
2791,9278,SHAW'S CRAB HOUSE,1,2014-09-19T00:00:00.000
4061,9278,SHAW'S CRAB HOUSE,1,2014-09-23T00:00:00.000
21,2114331,FRONTERA TORTAS BY RICK BAYLESS (T3 K4),0,2014-09-26T00:00:00.000
15369,1168258,THE BEER BISTRO,0,2014-10-03T00:00:00.000
15492,9278,SHAW'S CRAB HOUSE,1,2014-10-09T00:00:00.000
887,9278,SHAW'S CRAB HOUSE,0,2014-10-09T00:00:00.000
10887,1820926,DUNKIN DONUTS,0,2014-10-15T00:00:00.000
12636,82530,MUI'S FEIDA BAKERY,0,2014-10-16T00:00:00.000
3609,1905309,PONCE,0,2014-10-20T00:00:00.000
14389,2088536,CARAVAN,1,2014-10-21T00:00:00.000


In [29]:
# Unique license IDs among canvass and complaint records in the test window.
# NOTE: this rebinds lids_comp and lids_ix; from here on lids_ix is the
# canvass/complaint overlap, not the earlier test-set/complaint overlap.
lids_canv = set(test_canv["license_id"])
lids_comp = set(test_comp["license_id"])
lids_ix = lids_canv & lids_comp

# Report the size of each set with its own message.
for message, lid_set in (
    ("Found {} unique license IDs in test set canvass records.", lids_canv),
    ("Found {} unique license IDs in test set complaint records.", lids_comp),
    ("Intersection = {} unique license IDs.", lids_ix),
):
    print(message.format(len(lid_set)))

Found 1364 unique license IDs in test set canvass records.
Found 330 unique license IDs in test set complaint records.
Intersection = 13 unique license IDs.


In [30]:
# Sum of the critical_found flag over canvass records in the test window.
# NOTE(review): equals the number of inspections with a critical violation
# only if the VIOLATIONS columns are 0/1 indicators — confirm.
test_canv["critical_found"].sum()

222

In [31]:
# Canvass inspections in the test window for the establishments in lids_ix.
# Series.isin replaces the per-row Python membership loop.
mask = test_canv["license_id"].isin(lids_ix)

# One .loc call selects rows and columns together instead of chained indexing.
test_canv.loc[mask, ["license_id", "aka_name", "critical_found", "inspection_date"]]

Unnamed: 0,license_id,aka_name,critical_found,inspection_date
207,33144,MCDONALD'S #11290,1,2014-09-08T00:00:00.000
5396,23051,STARBUCKS COFFEE,0,2014-10-07T00:00:00.000
6118,1168258,THE BEER BISTRO,0,2014-09-16T00:00:00.000
8722,9278,SHAW'S CRAB HOUSE,0,2014-09-04T00:00:00.000
10167,2114331,FRONTERA TORTAS BY RICK BAYLESS (T3 K4),0,2014-09-12T00:00:00.000
16580,1820926,DUNKIN DONUTS,1,2014-09-25T00:00:00.000
18405,2088536,CARAVAN,1,2014-10-02T00:00:00.000
19028,1905309,PONCE,0,2014-09-08T00:00:00.000
20104,82530,MUI'S FEIDA BAKERY,0,2014-09-16T00:00:00.000
20666,1650274,JOY YEE NOODLE,1,2014-09-15T00:00:00.000


In [32]:
# Complaint inspections in the test window for the establishments in lids_ix.
# Series.isin replaces the per-row Python membership loop.
mask = test_comp["license_id"].isin(lids_ix)

# One .loc call selects rows and columns together instead of chained indexing.
test_comp.loc[mask, ["license_id", "aka_name", "critical_found", "inspection_date"]]

Unnamed: 0,license_id,aka_name,critical_found,inspection_date
21,2114331,FRONTERA TORTAS BY RICK BAYLESS (T3 K4),0,2014-09-26T00:00:00.000
887,9278,SHAW'S CRAB HOUSE,0,2014-10-09T00:00:00.000
2791,9278,SHAW'S CRAB HOUSE,1,2014-09-19T00:00:00.000
3609,1905309,PONCE,0,2014-10-20T00:00:00.000
4061,9278,SHAW'S CRAB HOUSE,1,2014-09-23T00:00:00.000
4283,23051,STARBUCKS COFFEE,0,2014-10-24T00:00:00.000
4890,1650274,JOY YEE NOODLE,0,2014-10-30T00:00:00.000
5201,69963,VALLARTA,0,2014-10-31T00:00:00.000
5392,2060861,MURRAY'S FISH AND CHICKEN,0,2014-10-23T00:00:00.000
10887,1820926,DUNKIN DONUTS,0,2014-10-15T00:00:00.000


In [33]:
# Show the current contents of lids_ix.
# NOTE(review): lids_ix is assigned in two different cells of this notebook;
# which set is displayed depends on which of them ran last.
lids_ix

{9278,
 23051,
 33144,
 56642,
 69963,
 82530,
 1168258,
 1650274,
 1820926,
 1905309,
 2060861,
 2088536,
 2114331}

In [34]:
# Violation text recorded for license 9278 in the canvass records.
# Only the first matching row (in frame order) is shown.
test_canv.loc[test_canv["license_id"] == 9278, "violations"].iloc[0]

'34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: OBSERVED GROUT WORN BETWEEN FLOOR TILES IN THE DISH ROOM, SUSHI AND OYSTER BAR COOK LINE. INSTRUCTED TO PROVIDE AND MAINTAIN GROUT BETWEEN  FLOOR TILES. | 35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTED PER CODE: GOOD REPAIR, SURFACES CLEAN AND DUST-LESS CLEANING METHODS - Comments: OBSERVED THE WALL IN DISREPAIR IN THE MOP SINK CLOSET. INSTRUCTED TO REPAIR AND MAINTAIN ALL WALLS. | 36. LIGHTING: REQUIRED MINIMUM FOOT-CANDLES OF LIGHT PROVIDED, FIXTURES SHIELDED - Comments: OBSERVED BLOWN LIGHT BULBS UNDER THE REAR BROILER AND STEAMER HOOD. INSTRUCTED TO PROVIDE AND MAINTAIN ADEQUATE LIGHTING. | 38. VENTILATION: ROOMS AND EQUIPMENT VENTED AS REQUIRED: PLUMBING: INSTALLED AND MAINTAINED - Comments: OBSERVED THE PLUMBING LEAKING AT THE SALAD PREP EXPOSED HANDSINK. INSTRUCTED TO REPAIR AND MAINTAIN ALL PLUMBING. | 42. APPROPRIATE METHOD OF HANDLING OF FOOD (ICE) HAIR 

In [35]:
# Violation text recorded for license 9278 in the complaint records.
# Only the first matching row in frame order is shown; the frame is not
# sorted by date here, so this is not necessarily the earliest inspection.
test_comp.loc[test_comp["license_id"] == 9278, "violations"].iloc[0]

'34. FLOORS: CONSTRUCTED PER CODE, CLEANED, GOOD REPAIR, COVING INSTALLED, DUST-LESS CLEANING METHODS USED - Comments: OBSERVED GROUT WORN BETWEEN FLOOR TILES IN THE DISH ROOM, SUSHI AND OYSTER BAR COOK LINE. INSTRUCTED TO PROVIDE AND MAINTAIN GROUT BETWEEN FLOOR TILES.    | 35. WALLS, CEILINGS, ATTACHED EQUIPMENT CONSTRUCTED PER CODE: GOOD REPAIR, SURFACES CLEAN AND DUST-LESS CLEANING METHODS - Comments: OBSERVED THE WALL IN DISREPAIR IN THE MOP SINK CLOSET. INSTRUCTED TO REPAIR AND MAINTAIN ALL WALLS.   '