In [104]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation
from sklearn.utils import resample
from sklearn import svm

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
sns.set_style('whitegrid')


### Inspection Data

In [16]:
df = pd.read_pickle('data/inspection.pkl')

### Merged Data

In [17]:
mergeDF = pd.read_pickle('data/merge.pkl')

In [41]:
mergeDF.columns.tolist()

['zipcode',
 'score',
 'grade',
 'contamination',
 'facility',
 'food_handling',
 'hygiene',
 'temperature',
 'vermin',
 'inspection_year',
 'Afghan',
 'African',
 'American',
 'Armenian',
 'Asian',
 'Australian',
 'Bagels/Pretzels',
 'Bakery',
 'Bangladeshi',
 'Barbecue',
 'Bottle Beverages',
 'Brazilian',
 'Cajun',
 'Californian',
 'Caribbean',
 'Chicken',
 'Chilean',
 'Chinese',
 'Chinese/Cuban',
 'Chinese/Japanese',
 'Coffee',
 'Continental',
 'Creole',
 'Creole/Cajun',
 'Czech',
 'Delicatessen',
 'Donuts',
 'Eastern European',
 'Egyptian',
 'English',
 'Ethiopian',
 'Filipino',
 'French',
 'Fruits/Vegetables',
 'German',
 'Greek',
 'Hamburgers',
 'Hawaiian',
 'Hotdogs',
 'Hotdogs/Pretzels',
 'Ice Cream',
 'Indian',
 'Indonesian',
 'Iranian',
 'Irish',
 'Italian',
 'Japanese',
 'Jewish/Kosher',
 'Juice, Smoothies, Fruit Salads',
 'Korean',
 'Latin',
 'Mediterranean',
 'Mexican',
 'Middle Eastern',
 'Mixed Buffet',
 'Moroccan',
 'Not Listed/Not Applicable',
 'Nuts/Confectionary',
 '

In [42]:
mergeDF.drop('business_id',axis=1, inplace=True)

## Predict Health Grade for Inspections

### Dummy Prediction

In [25]:
df['dummy'] = 'A'

In [28]:
accuracy_score(df.grade,df.dummy)

0.86897293421760169

In [29]:
X = df.drop(['grade','score','dummy'],axis=1)
y = df.grade

In [30]:
rf = RandomForestClassifier()
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.3,random_state=1234)
rf.fit(X_train,y_train)
pred = rf.predict(X_test)
print(accuracy_score(y_test,pred))

0.82285287528


In [None]:
# importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
# importances.sort_values(by='Gini-importance').plot(kind='bar', rot=45)

In [35]:
def important_features(model,X):
    feats = {} # a dict to hold feature_name: feature_importance
    for feature, importance in zip(X.columns, model.feature_importances_):
        feats[feature] = importance
    feats = [(k, feats[k]) for k in sorted(feats, key=feats.get, reverse=True)]
    return feats

In [36]:
important_features(rf,X)

[('zipcode', 0.56116937550418255),
 ('inspection_year', 0.084551034641104056),
 ('facility', 0.027286032117059704),
 ('vermin', 0.025707329792599133),
 ('brooklyn', 0.014027426762568618),
 ('manhattan', 0.01402212780502549),
 ('temperature', 0.013316693449029762),
 ('queens', 0.013110341553127135),
 ('contamination', 0.012955972270073413),
 ('Chinese', 0.010223492431279069),
 ('bronx', 0.0099761443506340766),
 ('hygiene', 0.0098124920004093764),
 ('American', 0.008918034388797666),
 ('Bakery', 0.0076797098145991449),
 ('Japanese', 0.0076659437608896817),
 ('Pizza', 0.0072804343363598988),
 ('Latin', 0.0071463620270211543),
 ('Mexican', 0.0071262477695392488),
 ('Italian', 0.006471467630227626),
 ('Spanish', 0.0064358118957950298),
 ('Pizza/Italian', 0.0062800829512630833),
 ('Caribbean', 0.0059189236393798854),
 ('food_handling', 0.0057417883911590577),
 ('Asian', 0.0056951033901271493),
 ('Indian', 0.005592624232414212),
 ('Chicken', 0.0053588763729674862),
 ('Delicatessen', 0.0050403

## Upsample Inspection Data

In [96]:
train_set = pd.concat((X_train,y_train),axis=1)

In [97]:
train_set.grade.value_counts()

A    13573
B     1710
C      338
Name: grade, dtype: int64

In [98]:
df_A = train_set[train_set.grade=='A']
df_B = train_set[train_set.grade=='B']
df_C = train_set[train_set.grade=='C']
upsample_B = resample(df_B,replace=True,n_samples=13573,random_state=123)
upsample_C = resample(df_C,replace=True,n_samples=13573,random_state=123)
upsampleDF = df_A.append(upsample_B)
upsampleDF = upsampleDF.append(upsample_C)

In [99]:
upsampleDF.grade.value_counts()

B    13573
C    13573
A    13573
Name: grade, dtype: int64

In [100]:
upX = upsampleDF.drop('grade',axis=1)
upY = upsampleDF.grade

In [102]:
rfUP2 = RandomForestClassifier()
rfUP2.fit(upX,upY)
pred4 = rfUP2.predict(X_test)
print(accuracy_score(y_test,pred4))
print(classification_report(y_test,pred4))

0.749663928305
             precision    recall  f1-score   support

          A       0.88      0.83      0.86      5819
          B       0.17      0.23      0.19       727
          C       0.03      0.05      0.04       149

avg / total       0.79      0.75      0.77      6695



## Predict Health Grade with Yelp Data

In [43]:
yelpX = mergeDF.drop(['grade','score'],axis=1)
yelpY = mergeDF.grade

In [47]:
rf2 = RandomForestClassifier()
X2_train,X2_test,y2_train,y2_test = train_test_split(yelpX,yelpY,test_size=.3,random_state=1234)
rf2.fit(X2_train,y2_train)
pred2 = rf2.predict(X2_test)
print(accuracy_score(y2_test,pred2))
print(classification_report(y2_test,pred2))

0.899862825789
             precision    recall  f1-score   support

          A       0.91      0.99      0.95      3187
          B       0.77      0.30      0.43       380
          C       0.78      0.23      0.36        78

avg / total       0.89      0.90      0.88      3645



In [60]:
unique_items, counts = np.unique(pred,return_counts=True)
zipped = zip(unique_items,counts)
print(list(zipped))

[('A', 6182), ('B', 450), ('C', 63)]


In [61]:
mergeDF.grade.value_counts().C/len(mergeDF)

0.021069958847736627

In [62]:
mergeDF.grade.value_counts().B/len(mergeDF)

0.10880658436213991

In [63]:
mergeDF.grade.value_counts().A/len(mergeDF)

0.87012345679012348

In [48]:
important_features(rf2,yelpX)

[('review_count', 0.23197454515558932),
 ('zipcode', 0.20260654215050772),
 ('rating', 0.097569923602276745),
 ('inspection_year', 0.059895435004619003),
 ('vermin', 0.025940614623702118),
 ('facility', 0.019688816785327412),
 ('brooklyn', 0.019138216783309993),
 ('American', 0.018174740332887917),
 ('queens', 0.018085624388118456),
 ('Chinese', 0.015627562239785821),
 ('$$', 0.014343056240216617),
 ('contamination', 0.014097279442702521),
 ('Mexican', 0.013848587869768161),
 ('Pizza', 0.013508617313725024),
 ('$', 0.012260523166750779),
 ('bronx', 0.01181722391996764),
 ('temperature', 0.01176563607936645),
 ('Latin', 0.0096391382951042052),
 ('Spanish', 0.0094725600610040556),
 ('hygiene', 0.0093165876755770937),
 ('Japanese', 0.0079015335381926524),
 ('Bakery', 0.0078118952468897692),
 ('Caribbean', 0.0072507254648407184),
 ('is_closed', 0.0071393140044337128),
 ('Asian', 0.0071005655634129696),
 ('Indian', 0.0062386816820634832),
 ('Seafood', 0.0062163372082347937),
 ('Italian', 0.

In [54]:
unique_items, counts = np.unique(pred2,return_counts=True)

In [55]:
zipped = zip(unique_items,counts)

In [58]:
print(list(zipped))

[('A', 3473), ('B', 149), ('C', 23)]


In [59]:
mergeDF.grade.value_counts()

A    10572
B     1322
C      256
Name: grade, dtype: int64

## Upsampling Yelp

In [105]:
train_set = pd.concat((X2_train,y2_train),axis=1)

In [106]:
train_set.grade.value_counts()

A    7385
B     942
C     178
Name: grade, dtype: int64

In [107]:
df_A = train_set[train_set.grade=='A']
df_B = train_set[train_set.grade=='B']
df_C = train_set[train_set.grade=='C']

In [108]:
upsample_B = resample(df_B,replace=True,n_samples=7385,random_state=123)
upsample_C = resample(df_C,replace=True,n_samples=7385,random_state=123)

In [109]:
upsampleDF = df_A.append(upsample_B)

In [110]:
upsampleDF = upsampleDF.append(upsample_C)

In [111]:
upsampleDF.grade.value_counts()

B    7385
C    7385
A    7385
Name: grade, dtype: int64

In [112]:
upX = upsampleDF.drop('grade',axis=1)
upY = upsampleDF.grade

In [113]:
rfUP = RandomForestClassifier()
rfUP.fit(upX,upY)
pred3 = rfUP.predict(X2_test)
print(accuracy_score(y2_test,pred3))
print(classification_report(y2_test,pred3))

0.891632373114
             precision    recall  f1-score   support

          A       0.92      0.97      0.94      3187
          B       0.59      0.38      0.46       380
          C       0.67      0.31      0.42        78

avg / total       0.88      0.89      0.88      3645



run the model on the test set and the train set

## SVM Test

In [115]:
model_svm = svm.SVC(kernel='rbf', gamma = 0.001)
model_svm.fit(upX, upY);
y_pred = model_svm.predict(X2_test)
print(accuracy_score(y2_test,y_pred))
print(classification_report(y2_test,y_pred))

0.430452674897
             precision    recall  f1-score   support

          A       0.91      0.43      0.59      3187
          B       0.13      0.41      0.20       380
          C       0.03      0.40      0.06        78

avg / total       0.81      0.43      0.53      3645

