In [1]:
import re
import pandas as pd
import string
import nltk

In [2]:
df_inspect = pd.read_csv("Food_Inspections.csv")

In [4]:
df_fail = df_inspect[df_inspect['Results'] == 'Fail']
df_fail = df_fail.dropna()

In [5]:
df_fail.head()

Unnamed: 0,Inspection ID,DBA Name,AKA Name,License #,Facility Type,Risk,Address,City,State,Zip,Inspection Date,Inspection Type,Results,Violations,Latitude,Longitude,Location
5,2135353,"SQUARE ROOTS KITCHEN, LLC","SQUARE ROOTS KITCHEN, LLC",2574626.0,Restaurant,Risk 1 (High),120 S HALSTED ST,CHICAGO,IL,60661.0,01/12/2018,License,Fail,"16. FOOD PROTECTED DURING STORAGE, PREPARATION...",41.879822,-87.647448,"(41.879821521357705, -87.64744848057362)"
11,2135337,USMANIA RESTAURANT,USMANIA RESTAURANT,1047084.0,Restaurant,Risk 1 (High),2253 W DEVON AVE,CHICAGO,IL,60659.0,01/11/2018,Canvass,Fail,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,41.997669,-87.68707,"(41.99766945661302, -87.68706951063783)"
13,2135314,NORTH GARDEN,NORTH GARDEN,2368358.0,Restaurant,Risk 1 (High),1007 W ARGYLE ST,CHICAGO,IL,60640.0,01/11/2018,Canvass,Fail,18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...,41.973198,-87.655248,"(41.97319805257552, -87.65524806242914)"
16,2135308,DMK BURGER BAR,DMK BURGER BAR,1943629.0,Restaurant,Risk 1 (High),2954 N SHEFFIELD AVE,CHICAGO,IL,60657.0,01/11/2018,Complaint,Fail,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,41.936097,-87.654125,"(41.936096824618446, -87.6541247506545)"
17,2135303,CONVENIENT FOOD MART,CONVENIENT FOOD MART,2074199.0,Grocery Store,Risk 3 (Low),2850 N SHERIDAN RD,CHICAGO,IL,60657.0,01/11/2018,Complaint,Fail,3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...,41.934271,-87.639488,"(41.93427051907201, -87.63948842747087)"


In [6]:
df = df_fail['Violations']
df.head(10)

5     16. FOOD PROTECTED DURING STORAGE, PREPARATION...
11    3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...
13    18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
16    3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...
17    3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...
35    29. PREVIOUS MINOR VIOLATION(S) CORRECTED 7-42...
39    3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...
41    18. NO EVIDENCE OF RODENT OR INSECT OUTER OPEN...
42    3. POTENTIALLY HAZARDOUS FOOD MEETS TEMPERATUR...
46    9. WATER SOURCE: SAFE, HOT & COLD UNDER CITY P...
Name: Violations, dtype: object

In [56]:
cause = ""
for d in df.values:
    if not pd.isnull(d): # filter out nan in the series
        items = d.split(' | ') # causes are separated by |
        for text in items: # for each cause
            if re.match(r'^[0-9]+\.', text): # the cause must start with some numbers with a dot
                cause1 = text.split(' - Comments:')[1] # keep the comments part in the cause
                cause1 = re.sub(r'\n\w+ VIOLATION.*$', " ", cause1.strip())
                cause+=str(cause1)

In [63]:
cause[0:300]

'FOOD NOT PROTECTED. FRONT PREP COUNTER NOT PROTECTED WITH GLASS SHIELDING. INSTRUCTED MANAGER TO PROVIDE TO ENSURE FRONT PREP COUNTER IS PROTECTED. \n OBSERVED NO SPLASH GUARD LOCATED ON LEFT SIDE OF EXPOSED HAND SINK, LOCATED IN FRONT PREP AREA. INSTRUCTED MANAGER TO PROVIDE.OBSERVED RAW UNFINISHED '

In [64]:
raw_words = nltk.word_tokenize(cause)

In [65]:
stopwords = set(nltk.corpus.stopwords.words('english'))

# Remove single-character tokens (mostly punctuation)
raw_words = [word for word in raw_words if len(word) > 1]

# Remove punctuation
raw_words = [word for word in raw_words if word.isalpha()]

# Lowercase all words (default_stopwords are lowercase too)
raw_words = [word.lower() for word in raw_words]

# Lemmation
wnl = nltk.WordNetLemmatizer()
raw_words = [wnl.lemmatize(t) for t in words]

# Remove stopwords
raw_words = [word for word in raw_words if word not in stopwords]

In [70]:
fdist = nltk.FreqDist(raw_words)

fdist.most_common(10)

[('food', 82955),
 ('area', 73922),
 ('must', 71092),
 ('clean', 69695),
 ('floor', 54989),
 ('sink', 52144),
 ('instructed', 50322),
 ('prep', 47933),
 ('wall', 39369),
 ('shall', 38006)]

In [71]:
fdist_df = pd.DataFrame(fdist.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,food,82955
1,area,73922
2,must,71092
3,clean,69695
4,floor,54989
5,sink,52144
6,instructed,50322
7,prep,47933
8,wall,39369
9,shall,38006


### N-grams

In [72]:
#Create bigrams or trigrams
bgs = nltk.bigrams(raw_words)
tgs = nltk.trigrams(raw_words)

In [73]:
#compute frequency distribution for all the bigrams in the text
fdist_2 = nltk.FreqDist(bgs)
fdist_3 = nltk.FreqDist(tgs)

In [74]:
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(prep, area)",23142
1,"(serious, violation)",13486
2,"(compartment, sink)",13386
3,"(pest, control)",12260
4,"(citation, issued)",10333
5,"(detail, clean)",10028
6,"(hand, sink)",9555
7,"(storage, area)",9428
8,"(hazardous, food)",9322
9,"(smooth, easily)",9291


In [75]:
fdist_df3 = pd.DataFrame(fdist_3.most_common(), columns=['Trigrams', 'Frequency'])
fdist_df3.head(n=15)

Unnamed: 0,Trigrams,Frequency
0,"(potentially, hazardous, food)",8667
1,"(shall, good, repair)",5739
2,"(exposed, hand, sink)",5471
3,"(smooth, easily, cleanable)",4619
4,"(three, compartment, sink)",4132
5,"(kept, clean, good)",4042
6,"(constructed, per, code)",3986
7,"(shall, constructed, per)",3983
8,"(floor, shall, constructed)",3982
9,"(per, code, smooth)",3981
