In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor
from sklearn.metrics import roc_auc_score, r2_score, mean_absolute_error, mean_squared_error
import seaborn as sns
import re
import string
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier



Read in Data

In [2]:
df = pd.read_json("recipies.json")

Define Domain Specific Stop Words:

In [3]:
my_stops = [ 'vegetable_oil','all_purpose_flour', 'butter', 'green_onion', 'purple_onion', 'salt', 'chili_powder',
 'red_bell_pepper', 'extra_virgin_olive_oil', 'ginger', 'black_pepper', 'milk', 'oil', 'all',
 'egg', 'scallion', 'grated_parmesan_cheese', 'corn_starch', 'olive_oil', 'water', 'unsalted_butter',
 'soy_sauce', 'baking_powder', 'garlic', 'pepper', 'kosher_salt', 'carrot', 'cinnamon', 'extra', 'sugar', 'onion', 'garlic_clove', 'tomatoe']

Define Helper functions

In [4]:
def remove_dashes(listofstrings):
    return [x.replace("-", " ") for x in listofstrings]

def describe_cat(X):
    """function to show categorical variables of a dataframe"""
    from IPython.display import display, HTML
    display(HTML(X[X.columns[X.dtypes=="object"]].describe().to_html()))
def connect_strings(listofstrings):
    return [x.replace(" ", "_") for x in listofstrings]

def describe_cat(X):
    """function to show categorical variables of a dataframe"""
    from IPython.display import display, HTML
    display(HTML(X[X.columns[X.dtypes=="object"]].describe().to_html()))

def connect_list(listofstrings):
    return " ".join(listofstrings)

def clean_words(listofstrings):
    listofstrings = [x[:-len('s')] if x.endswith('s') else x for x in listofstrings]
    words = [x[len('ground '):] if x.startswith('ground') else x for x in listofstrings]
    words = ['egg' if 'egg' in x else x for x in words]
    return words



Clean Data:
    1. Remove Dashes
    2. Equate similar words (ground cinnamon = cinnamon)
    3. Connect single ingredients with "_"
    4. Connect list of ingredients into single string

In [5]:

df['ingredients'] = df['ingredients'].apply(remove_dashes)

df['ingredients'] = df['ingredients'].apply(clean_words)

df['ingredients'] = df['ingredients'].apply(connect_strings)

df['ingredients'] = df['ingredients'].apply(connect_list)


df_y = df['cuisine']
df_x= df['ingredients']

Get class with smallest number of entries and ballance the data set (each class has same number of entries)

In [6]:
counts = df['cuisine'].value_counts()
cuisine_types = counts.index

size = counts.min()-2       # sample size
replace = False  # with replacement
fn = lambda obj: obj.loc[np.random.choice(obj.index, size, replace),:]
ballanced = df.groupby('cuisine', as_index=False).apply(fn)

Train test split

In [7]:
dfY=ballanced['cuisine']
dfX=ballanced['ingredients']

x_train, x_test, y_train, y_test = train_test_split(dfX,dfY,test_size=0.2,random_state=42)

Vectorize the words using tf idf

In [8]:
tfv2 = TfidfVectorizer(min_df=1,stop_words = my_stops)
x_train_tfv2 = tfv2.fit_transform(x_train.values)
x_test_tfv2 = tfv2.transform(x_test.values)


Label encode target variable classes

In [9]:
le = preprocessing.LabelEncoder()
le.fit(cuisine_types)
y_train_en =  le.transform(y_train) 
y_test_en =  le.transform(y_test) 


Run Random Forest and get accuracy scores

In [10]:
clf = RandomForestClassifier(n_estimators=2000, max_depth=16,min_samples_split=4,min_samples_leaf=4,
                              random_state=0)
clf.fit(x_train_tfv2, y_train_en)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=16, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=4, min_samples_split=4,
            min_weight_fraction_leaf=0.0, n_estimators=2000, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
RF_pred_train = clf.predict(x_train_tfv2)
RF_pred_test = clf.predict(x_test_tfv2)

In [12]:
accuracy_score(y_train_en,RF_pred_train)
#TRAINING ACCURACY

0.5448924731182796

In [13]:
accuracy_score(y_test_en,RF_pred_test)
#TESTING ACCURACY

0.5016129032258064

Run Gaussian Naive Bayes and get accuracy scores

In [14]:
gnb = GaussianNB(var_smoothing=1.)
gbmod= gnb.fit(x_train_tfv2.toarray(), y_train_en)
nbpred = gbmod.predict(x_train_tfv2.toarray())
nbpredT = gbmod.predict(x_test_tfv2.toarray())

In [15]:
NB_pred_train = gbmod.predict(x_train_tfv2.toarray())
accuracy_score(y_train_en,NB_pred_train)
#TRAINING ACCURACY:

0.8133064516129033

In [21]:
NB_pred_test = gbmod.predict(x_test_tfv2.toarray())
accuracy_score(y_test_en,NB_pred_test)
#TESTING ACCURACY

0.6435483870967742

List out most important words for each cuisine

In [20]:
words2 = tfv2.get_feature_names()
cuslist = le.inverse_transform(range(len(cuisine_types)))
for ine in range(len(cuisine_types)):
    cuis_max = np.argpartition(gbmod.theta_[ine], -10)[-10:]
    most_imp = [words2[ii] for ii in cuis_max]
    print(cuslist[ine], " : ", most_imp)

brazilian  :  ['lime_juice', 'chocolate_sprinkle', 'ice', 'ice_cube', 'tapioca_flour', 'black_bean', 'sweetened_condensed_milk', 'lime', 'cachaca', 'coconut_milk']
british  :  ['nutmeg', 'self_rising_flour', 'potatoe', 'worcestershire_sauce', 'baking_soda', 'heavy_cream', 'vanilla_extract', 'flour', 'white_sugar', 'whole_milk']
cajun_creole  :  ['diced_tomatoe', 'hot_sauce', 'bay_leave', 'celery_rib', 'andouille_sausage', 'cayenne_pepper', 'creole_seasoning', 'cajun_seasoning', 'celery', 'green_bell_pepper']
chinese  :  ['light_soy_sauce', 'peanut_oil', 'peeled_fresh_ginger', 'fresh_ginger', 'hoisin_sauce', 'oyster_sauce', 'rice_vinegar', 'dark_soy_sauce', 'white_pepper', 'sesame_oil']
filipino  :  ['evaporated_milk', 'pork_belly', 'pork', 'vinegar', 'cooking_oil', 'coconut_milk', 'peppercorn', 'fish_sauce', 'bay_leave', 'brown_sugar']
french  :  ['leek', 'dijon_mustard', 'frozen_pastry_puff_sheet', 'fresh_lemon_juice', 'shallot', 'whipping_cream', 'heavy_cream', 'whole_milk', 'vanilla

In [None]:
pred_proba = NB_optimal.predict_proba(X_test)
words = np.take(count_vect.get_feature_names(), pred_proba.argmax(axis=1))

In [None]:
from sklearn.naive_bayes import BernoulliNB
optimal_alpha=1
NB_optimal = BernoulliNB(alpha=optimal_alpha)

# fitting the model
NB_optimal.fit(x_train_tfv2.toarray(),y_train_en)

# predict the response
pred = NB_optimal.predict(x_train_tfv2.toarray())

# evaluate accuracy
acc = accuracy_score(y_train_en,pred)
print('\nThe accuracy of the NB classifier for k = %d is %f%%' % (optimal_alpha, acc))

In [None]:
predtest = NB_optimal.predict(x_test_tfv2.toarray())

# evaluate accuracy
acc = accuracy_score(y_test_en,predtest)

In [None]:
acc

In [None]:
df['ingredients'].iloc[0]

In [None]:
sc.score(x_train_tfv2.toarray(),y_train_en)


In [None]:
sc.score(x_test_tfv2.toarray(),y_test_en)

In [None]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *