In [1]:
import re
import pandas as pd
import string
import nltk
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn import metrics

In [2]:
df_inspect = pd.read_csv("Food_Inspections.csv")

In [3]:
df_inspect = df_inspect.dropna()
df_inspect.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
0,2144506,CATALINA PIZZERIA,CATALINA PIZZERIA,2385764.0,Restaurant,Risk 1 (High),5337 W BELMONT AVE,CHICAGO,IL,60641.0,02/02/2018,Canvass Re-Inspection,Pass,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.938539,-87.760631,"(41.9385394320182, -87.7606308007806)"
1,2144504,TAQUERIA EL RANCHITO,TAQUERIA EL RANCHITO,60184.0,Restaurant,Risk 1 (High),2829 N MILWAUKEE AVE,CHICAGO,IL,60618.0,02/02/2018,Canvass Re-Inspection,Pass,31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...,41.932657,-87.713056,"(41.93265693660558, -87.71305610116421)"
2,2144502,NEW LITTLE CHINA,NEW LITTLE CHINA,1844410.0,Restaurant,Risk 1 (High),1737 E 95TH ST,CHICAGO,IL,60617.0,02/02/2018,Canvass,Pass,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,41.722285,-87.581072,"(41.72228503083544, -87.58107166595508)"
3,2144496,GOLO,GOLO,1974172.0,Grocery Store,Risk 3 (Low),10655 S HALSTED ST,CHICAGO,IL,60628.0,02/02/2018,Complaint,Pass,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,41.699847,-87.642467,"(41.699846928380765, -87.64246730125315)"
5,2144494,DOMINO'S PIZZA,DOMINO'S PIZZA,35407.0,Restaurant,Risk 2 (Medium),3144 W DEVON AVE,CHICAGO,IL,60659.0,02/02/2018,Suspected Food Poisoning,Pass w/ Conditions,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",41.99751,-87.708716,"(41.997509636592596, -87.70871560955086)"


In [4]:
df_inspect.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 129519 entries, 0 to 164705
Data columns (total 17 columns):
Inspection ID      129519 non-null int64
DBA Name           129519 non-null object
AKA Name           129519 non-null object
License #          129519 non-null float64
Facility Type      129519 non-null object
Risk               129519 non-null object
Address            129519 non-null object
City               129519 non-null object
State              129519 non-null object
Zip                129519 non-null float64
Inspection Date    129519 non-null object
Inspection Type    129519 non-null object
Results            129519 non-null object
Violations         129519 non-null object
Latitude           129519 non-null float64
Longitude          129519 non-null float64
Location           129519 non-null object
dtypes: float64(4), int64(1), object(12)
memory usage: 17.8+ MB


In [5]:
# convert label to a numerical variable
df_inspect['Results_flag'] = df_inspect['Results'].map({'Fail':0, 'Pass':1, 'Out of Business':2,'Pass w/ Conditions':3})

In [6]:
df_inspect['Results_flag'].head()

0    1.0
1    1.0
2    1.0
3    1.0
5    3.0
Name: Results_flag, dtype: float64

In [7]:
df_reason = df_inspect[['Violations','Results_flag']]

In [8]:
df_reason = df_reason.loc[df_reason['Results_flag'].isin([1,0])]
df_reason.head(6)

Unnamed: 0,Violations,Results_flag
0,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,1.0
1,31. CLEAN MULTI-USE UTENSILS AND SINGLE SERVIC...,1.0
2,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,1.0
3,33. FOOD AND NON-FOOD CONTACT EQUIPMENT UTENSI...,1.0
6,"11. ADEQUATE NUMBER, CONVENIENT, ACCESSIBLE, D...",1.0
8,2. FACILITIES TO MAINTAIN PROPER TEMPERATURE -...,0.0


In [9]:
def violation2comments(d):
    comments = []
    if not pd.isnull(d): # filter out nan in the series
        items = d.split(' | ') # causes are separated by |
        for text in items: # for each cause
            if re.match(r'^[0-9]+\.', text): # the cause must start with some numbers with a dot
                comment = text.split(' - Comments:')[1] # keep the comments part in the cause
                comment = re.sub(r'\n\w+ VIOLATION.*$', " ", comment.strip())
                comments.append(comment.strip())
    return " ".join(comments)

In [10]:
df_reason["Comments"] = df_reason["Violations"].map(violation2comments)

In [11]:
df_comments = df_reason[['Comments','Results_flag']]
df_comments.head()

Unnamed: 0,Comments,Results_flag
0,CORRECTED (RODENT DROPPINGS HAS BEEN REMOVED)....,1.0
1,OBSERVED CUPS USED TO DISPENSE FOOD IN THE CON...,1.0
2,MUST CLEAN AND MAINTAIN THE FOLLOWING: INTERI...,1.0
3,"CLEAN INTERIOR AND EXTERIOR OF EXPOSED SINK, 3...",1.0
6,VIOLATION CORRECTED\nEXPOSED HAND SINK INSTALL...,1.0


In [12]:
X = df_comments.Comments
y = df_comments.Results_flag
print(X.shape)
print(y.shape)

(113824,)
(113824,)


In [13]:
# split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(85368,)
(28456,)
(85368,)
(28456,)


### Countvectorizer

In [80]:
countvectorizer = CountVectorizer(stop_words='english',min_df=2,max_df=0.5)
countvectorizer_matrix = countvectorizer.fit_transform(X_train)
countvectorizer_matrix.shape

(85368, 13643)

In [81]:
countvectorizer_matrix_df = pd.DataFrame(countvectorizer_matrix.toarray(), columns=countvectorizer.get_feature_names())

In [82]:
countvectorizer_matrix_df.head()

Unnamed: 0,00,000,000362,0005,0005a,001,002,003,0030,003622,...,zebra,zero,zerox,zip,zipcode,ziploc,ziplock,zipping,zone,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [121]:
# valid_col = [col for col in countvectorizer_matrix_df.columns if not col.isdigit()]

In [83]:
# remove column names starting with digits
my_list = countvectorizer_matrix_df.columns
valid_col = [col for col in my_list if not any(c.isdigit() for c in col)]

In [84]:
# keep strings with length greater than three
valid_col = [col for col in valid_col if len(col)>3]

In [85]:
# Lemmation
wnl = nltk.WordNetLemmatizer()
valid_col = [wnl.lemmatize(col) for col in valid_col]

In [86]:
valid_col = [x[1:] for x in valid_col if (x[0] == x[1] and len(x)>2)]
valid_col = [col for col in valid_col if len(col)>3]
valid_col

['area',
 'basement',
 'ceiling',
 'clean',
 'closet',
 'coler',
 'colers',
 'compartment',
 'container',
 'cooler',
 'equipment',
 'evated',
 'evating',
 'fice',
 'from',
 'instructed',
 'light',
 'must',
 'oler',
 'ozing',
 'prep',
 'proper',
 'provide',
 'remove',
 'rodent',
 'rpair',
 'sanitize',
 'serious',
 'service',
 'sink',
 'storage',
 'throiughout',
 'throughout',
 'uper',
 'used',
 'violation']

In [94]:
X_train_clean = countvectorizer_matrix_df.loc[:,valid_col]

In [95]:
X_train_clean = clean_col.fillna(0)
X_train_clean.head()

Unnamed: 0,abated,accumuation,amir,area,aron,around,base,basement,basewall,bhind,...,violations,walk,washing,washrooms,west,with,without,witout,wlls,wshrooms
0,0,0,0.0,1,0.0,0.0,0,0,0,0,...,0,2,0,0,0,0.0,0.0,0,0,0
1,0,0,0.0,4,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0,0,0
2,0,0,0.0,0,0.0,0.0,0,0,0,0,...,0,1,0,0,0,0.0,0.0,0,0,0
3,0,0,0.0,2,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0,0,0
4,0,0,0.0,1,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0,0,0


In [91]:
# transform testing data (using fitted vocabulary) into a document-term matrix
X_test_dtm = countvectorizer.transform(X_test)
X_test_dtm = pd.DataFrame(X_test_dtm.toarray(), columns=countvectorizer.get_feature_names())
X_test_clean = X_test_dtm.loc[:,x_train_clean.columns]

In [93]:
X_test_clean = X_test_clean.fillna(0)
X_test_clean.head()

Unnamed: 0,abated,accumuation,amir,area,aron,around,base,basement,basewall,bhind,...,violations,walk,washing,washrooms,west,with,without,witout,wlls,wshrooms
0,0,0,0.0,0.0,0.0,0.0,0,1,0,0,...,0,2,0,0,0,0.0,0.0,0,0,0.0
1,0,0,0.0,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0,0,0.0
2,0,0,0.0,0.0,0.0,0.0,0,3,0,0,...,0,0,0,0,1,0.0,0.0,0,0,0.0
3,0,0,0.0,0.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0,0,0.0
4,0,0,0.0,0.0,0.0,0.0,0,0,0,0,...,0,1,3,1,0,0.0,0.0,0,0,0.0


### Logistic Regression

In [96]:
# instantiate a logistic regression model
logreg = LogisticRegression()

In [97]:
# train the model using X_train_dtm
%time logreg.fit(X_train_clean, y_train)

CPU times: user 3.21 s, sys: 457 ms, total: 3.67 s
Wall time: 4.25 s


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [98]:
# make class predictions for X_test_dtm
y_pred_class = logreg.predict(X_test_clean)

In [99]:
# calculate accuracy of class predictions
print(metrics.accuracy_score(y_test, y_pred_class))

0.875527129604


In [100]:
# calculate precision and recall
print(classification_report(y_test, y_pred_class))

             precision    recall  f1-score   support

        0.0       0.85      0.61      0.71      7071
        1.0       0.88      0.96      0.92     21385

avg / total       0.87      0.88      0.87     28456



### Random Forest

In [101]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100)
rfc.fit(X_train_clean, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [102]:
rfc_pred = rfc.predict(X_test_clean)

In [103]:
# calculate accuracy of class predictions
print(metrics.accuracy_score(y_test, y_pred_class))

0.875527129604


In [105]:
# calculate precision and recall
print(classification_report(y_test,rfc_pred))

             precision    recall  f1-score   support

        0.0       0.89      0.75      0.81      7071
        1.0       0.92      0.97      0.94     21385

avg / total       0.91      0.91      0.91     28456

