In [1]:
import pandas

In [2]:
import sklearn
from sklearn import tree
from sklearn import metrics
from sklearn import cluster
from sklearn import feature_extraction
from sklearn import model_selection

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [3]:
%matplotlib inline

In [4]:
BOSTON_DATA = "Food_Establishment_Inspections_(converted).csv"

In [5]:
CHICAGO_DATA = "Food_Inspections_(converted).csv"

In [6]:
vec = DictVectorizer()

In [7]:
# load boston data
boston_df = pandas.read_csv(BOSTON_DATA)

In [8]:
# load chicago data
chicago_df = pandas.read_csv(CHICAGO_DATA)

In [None]:
# take a random sample of 100k records
boston_df2 = boston_df.sample(100000)

# map 'Fail' to 0 and other (generally 'Pass') to 1
boston_result = boston_df2['result'].map(lambda x: 0 if x == "Fail" else 1)

# get a subset of fields for machine learning, make sure none of the field values are 'NaN'
boston_subset = boston_df2[["businessName", "violation", "risk", "city"]]
boston_subset = boston_subset.replace(pandas.np.nan,' ', regex=True)

# convert the dataframe into an array of dictionarys and generate feature matrix
boston_dict = boston_subset.to_dict('records')
boston_fm = vec.fit_transform(boston_dict).toarray()

# load back into a dataframe
df2 = pandas.DataFrame(boston_fm)

# split data into training and test datasets
boston_fm_train, boston_fm_test, target_train, target_test = train_test_split(df2, boston_result, test_size = 0.2)

In [None]:
boston_model = DecisionTreeClassifier()

# generate decision tree from boston training dataset
%time boston_model.fit(boston_fm_train, target_train)

In [None]:
# 
train_predicted = boston_model.predict(boston_fm_train)
print(metrics.classification_report(target_train, train_predicted))
print(metrics.confusion_matrix(target_train, train_predicted))

In [None]:
# generate model evaluation metrics
test_predicted = boston_model.predict(boston_fm_test)
print(metrics.classification_report(target_test, test_predicted))
print(metrics.confusion_matrix(target_test, test_predicted))

In [None]:
# convert violation values to string
chicago_df['violation'] = chicago_df['violation'].astype('str')

# take a random sample of 100k records
chicago_df2 = chicago_df.sample(100000)

# map 'Fail' to 0 and other (generally 'Pass') to 1
chicago_result = chicago_df2['result'].map(lambda x: 0 if x == "Fail" else 1)

# get a subset of fields for machine learning, make sure none of the field values are 'NaN'
chicago_subset = chicago_df2[["businessName", "violation", "risk", "city"]]
chicago_subset = chicago_subset.replace(pandas.np.nan,' ', regex=True)

# convert the dataframe into an array of dictionarys and generate feature matrix
chicago_dict = chicago_subset.to_dict('records')
chicago_fm = vec.fit_transform(chicago_dict).toarray()

# load back into a dataframe
chicago_df3 = pandas.DataFrame(chicago_fm)

# split data into training and test datasets
chicago_fm_train, chicago_fm_test, chicago_target_train, chicago_target_test = train_test_split(chicago_df3, chicago_result, test_size = 0.2)

In [None]:
chicago_model = DecisionTreeClassifier()

# generate decision tree from chicago training dataset
%time chicago_model.fit(chicago_fm_train, chicago_target_train)

In [None]:
# 
chi_train_predicted = chicago_model.predict(chicago_fm_train)
print(metrics.classification_report(chicago_target_train, chi_train_predicted))
print(metrics.confusion_matrix(chicago_target_train, chi_train_predicted))

In [None]:
# generate model evaluation metrics
chi_test_predicted = chicago_model.predict(chicago_fm_test)
print(metrics.classification_report(chicago_target_test, chi_test_predicted))
print(metrics.confusion_matrix(chicago_target_test, chi_test_predicted))

In [9]:
chicago_df['violation'] = chicago_df['violation'].astype('str')

In [10]:
combined_df = boston_df + chicago_df

In [11]:
# take a random sample of 100k records
combined_df2 = combined_df.sample(50000)

# map 'Fail' to 0 and other (generally 'Pass') to 1
combined_result = combined_df2['result'].map(lambda x: 0 if x == "Fail" else 1)

# get a subset of fields for machine learning, make sure none of the field values are 'NaN'
combined_subset = combined_df2[["businessName", "violation", "risk", "city"]]
combined_subset = combined_subset.replace(pandas.np.nan,' ', regex=True)

# convert the dataframe into an array of dictionarys and generate feature matrix
combined_dict = combined_subset.to_dict('records')
combined_fm = vec.fit_transform(combined_dict).toarray()

# split data into training and test datasets
combined_fm_train, combined_fm_test, combined_target_train, combined_target_test = train_test_split(combined_fm, combined_result, test_size = 0.2)

In [12]:
combined_model = DecisionTreeClassifier()

# generate decision tree from chicago training dataset
%time combined_model.fit(combined_fm_train, combined_target_train)

CPU times: user 5.38 s, sys: 10.9 s, total: 16.2 s
Wall time: 20.8 s


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [13]:
combined_train_predicted = combined_model.predict(combined_fm_train)
print(metrics.classification_report(combined_target_train, combined_train_predicted))
print(metrics.confusion_matrix(combined_target_train, combined_train_predicted))

             precision    recall  f1-score   support

          1       1.00      1.00      1.00     40000

avg / total       1.00      1.00      1.00     40000

[[40000]]


In [14]:
# generate model evaluation metrics
combined_test_predicted = combined_model.predict(combined_fm_test)
print(metrics.classification_report(combined_target_test, combined_test_predicted))
print(metrics.confusion_matrix(combined_target_test, combined_test_predicted))

             precision    recall  f1-score   support

          1       1.00      1.00      1.00     10000

avg / total       1.00      1.00      1.00     10000

[[10000]]
