In [2]:
import keras
keras.__version__

Using TensorFlow backend.


'2.2.4'

In [25]:
import re
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

# Start building the deep learning model
from keras import models
from keras import layers
from keras.layers import Dense, Dropout
from keras import metrics
from keras import losses
from keras import optimizers
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping,ModelCheckpoint

In [4]:
df_test = pd.read_json("test.json")
df_train = pd.read_json("train.json")

In [5]:
df_train.head()

Unnamed: 0,cuisine,id,ingredients
0,italian,0,"[penne, shallots, rice vinegar, fresh basil, g..."
1,greek,1,"[sugar, chopped walnuts, filo dough, chopped a..."
2,italian,2,"[fresh rosemary, chopped fresh thyme, fresh or..."
3,southern_us,3,"[sugar, hot sauce, ramps, vinegar, cream chees..."
4,french,4,"[ground cinnamon, panettone, whipped cream, Am..."


In [6]:
df_train.shape

(29774, 3)

In [40]:
cuisines = df_train.groupby(["cuisine"]).id.count()
cuisines.sort_values(ascending = False)

cuisine
italian         5867
mexican         4819
southern_us     3234
indian          2248
chinese         2001
french          1981
cajun_creole    1157
thai            1152
japanese        1065
greek            880
spanish          740
korean           621
vietnamese       618
moroccan         615
british          602
filipino         565
irish            499
jamaican         394
russian          366
brazilian        350
Name: id, dtype: int64

In [8]:
ingredients = []
for x in range(len(df_train.ingredients)):
    ingredients.extend(df_train.ingredients[x])

ingredients = np.array(ingredients)

In [9]:
ingredients = np.unique(ingredients)
ingredients.size

6199

In [10]:
ingredients

array(['(    oz.) tomato sauce', '(   oz.) tomato paste',
       '(10 oz.) frozen chopped spinach', ..., 'ziti', 'zucchini',
       'zucchini blossoms'], dtype='<U74')

In [11]:
character_exceptions_to_remove = ["\xae", "\xe8vre", "\u2122"]
phrase_exceptions = [("7 up", "sevenup"), ("sun dried", "sundried"), ("bone less", "boneless"), 
                     ("skin less", "skinless")]
map_plural_to_singular = [("steaks", "steak"), ("loins", "loin"), ("inches", "inch"), ("centimeters", "centimeter"),
                          ("ounces", "ounce"), ("liters", "liter"), ("mililiters", "mililiter"), ("grams", "gram"),
                          ("cups", "cup"), ("gallons", "gallon"), ("quarts", "quart"), ("lbs", "lb"),
                          ("pounds", "pound"), ("tablespoons", "tablespoon"), ("teaspoons", "teaspoon"), 
                          ("pints", "pint"), ("fluid ounces", "fluid ounce"), ("onions", "onion"), 
                          ("cloves", "clove"), ("bulbs", "bulb"), ("peppers", "pepper"), ("breasts", "breast"),
                          ("eggs", "egg"), ("carrots", "carrot"), ("mushrooms", "mushroom"),
                          ("tortillas", "tortilla"), ("sausages", "sausage"), ("wedges", "wedge"), 
                          ("tomatoes", "tomato"), ("thighs", "thigh"), ("chilies", "chili"), ("potatoes", "potato"), 
                          ("peppercorns", "peppercorn"), ("spices", "spice"), ("chiles", "chile"), ("apples", "apple"),
                          ("legs", "leg"), ("doughs", "dough"), ("drumsticks", "drumstick")]
brandnames_to_remove = ["alexia", "breakstones", "kraft", "bertolli classico", "bertolli", "best foods", 
                        "betty crocker", "bisquick", "bob evans", "breyers", "curry guy", "camellia", "campbells", 
                        "country crock", "crisco", "crystal farms", "delallo", "diamond crystal", "domino", 
                        "doritos", "earth balance", "egglands best", "foster farms", "franks", "gold medal", 
                        "goya", "green giant steamers niblets", "green giant", "heinz", "hellmanns", "herdez", 
                        "hidden valley", "honeysuckle white", "jacksonville",  "jimmy dean", "johnsonville", 
                        "knorr", "krudsen", "kikkoman", "lipton", "land o lakes", "mazola", "lea and perrins", 
                        "mccormick", "meyer", "mission", "old el paso", "old bay", "pam", "pepperidge farm", 
                        "oscar mayer", "pace", "pillsbury", "progresso", "pure wesson", "pompeian", "san marzano", 
                        "sargento", "soy vay", "taco bell", "yoplait", "spice islands", "stonefire", "success", 
                        "swanson", "truvía", "uncle bens", "wish bone", "zatarains", "morton", "jameson", "tapatio", 
                        "mountain high", "philadelphia", "king arthur", "roma"]
keywords_to_remove = ["lowfat", "light", "shredded", "sliced", "all purpose", "all natural", "natural", "original", 
                      "gourmet", "traditional", "boneless", "skinless", "fresh", "nonfat", "pitted", "quick cooking", 
                      "unbleached", "part skim", "skim", "quickcooking", "oven ready", "homemade", "instant", "small", 
                      "extra large", "large", "chopped", "grated", "cooked", "stone ground", "freshly ground", 
                      "ground", "pure", "peeled", "deveined", "organic", "cracked", "granulated", "inch thick", 
                      "extra firm", "crushed", "flakes", "self rising", "diced", "crumbles", "crumbled", 
                      "whole wheat", "whole grain", "baby", "medium", "plain", "of", "thick cut", "cubed", "coarse", 
                      "free range", "seasoned", "canned", "multipurpose", "vegan", "thawed", "squeezed", 
                      "vegetarian", "fine", "zesty", "halves", "firmly packed", "drain", "drained", "washed"]
measurements_to_remove = ["in", "inch", "cm", "centimeter", "oz", "ounce", "l", "liter", "ml", "mililiter", "g", 
                          "gram", "cup", "gallon", "quart", "lb", "pound", "tbsp", "tablespoon", "tsp", "teaspoon", 
                          "pint", "fl oz", "fluid ounce"]
phrases_to_remove = measurements_to_remove + keywords_to_remove + brandnames_to_remove
phrases_to_map = [
    (("green onion", "red onion", "purple onion", "yellow onion", "yel onion"), "onion"),
    (("collard green leaves", "collards", "collard leaves"), "collard greens"),
    ("black pepper", "pepper"),
    ("yel chives", "chives"),
    ("spinach leaves", "spinach"),
    ("tea leaves", "tea"),
    ("chile", "chili"),
    (("garlic clove", "garlic bulb"), "garlic"),
    ("uncooked", "raw"),
    (("red chili pepper", "hot chili pepper", "red hot chili pepper"), "chili pepper"),
    (("baking potato", "baked potato"), "baked potato"),
    (("sea salt", "kosher salt", "table salt", "white salt"), "salt"),
    ("scotch whiskey", "scotch"),
    (("i cant believe its not butter spread", "i cant believe its not butter"), "butter"),
    (("extra virgin olive oil", "virgin olive oil", "mild olive oil"), "olive oil"),
    (("white bread", "wheat bread", "grain bread"), "bread"),
    (("white sugar", "yel sugar"), "sugar"),
    ("confectioners sugar", "powdered sugar"),
    (("extra virgin coconut oil", "virgn coconut oil"), "coconut oil")
]

# When executing multiple regex parses, it's most efficient to compile the expression ahead of time.
punctuation_to_replace_with_space = re.compile(r"[-,]")
percentage_less_to_remove = re.compile(r"[\d+]% less [A-z]*")
percentage_reduced_to_remove = re.compile(r"[\d+]% reduced [A-z]*")
symbols_to_remove = re.compile(r"[!\\/%.'®™]")
digits_to_remove = re.compile(r"\d+")
symbols_to_replace_with_and = re.compile(r"[&+]")
parentheses_content_to_remove = re.compile(r"\([^)]*\)")
no_blank_added_to_remove = re.compile(r"no [A-z]* added")
reduced_and_following_word_to_remove = re.compile(r"reduced [A-z]*")
low_and_following_word_to_remove = re.compile(r"low [A-z]*")
less_and_following_word_to_remove = re.compile(r"less [A-z]*")
non_and_following_word_to_remove = re.compile(r"non [A-z]*")
nonfat_removal = re.compile(r"nonfat*")
nonhydrogenated_removal = re.compile(r"nonhydrogenated*")
nondairy_removal = re.compile(r"nondairy")
free_and_previous_word_to_remove = re.compile(r"[A-z]* free")
multiple_spaces_to_trim = re.compile(r" +")



In [12]:
# noise remove functions
def remove_noise_ingredient(ingredient):
     # Convert to lowercase.
    ingredient = ingredient.lower()
    
    # Replace hyphens and commas with spaces.
    ingredient = punctuation_to_replace_with_space.sub(" ", ingredient)
    
    # Map certain exceptions that we don't want whiped out by later cleaning processes.
    for character_exception in character_exceptions_to_remove:
        ingredient = re.sub(character_exception, "", ingredient)
    for phrase_exception, replacement in phrase_exceptions:
        ingredient = re.sub(r"\b{}\b".format(phrase_exception), replacement, ingredient)
        
    # Remove "percentage less" instances (e.g., "40% less sodium") - do this before removing % and digits.
    ingredient = percentage_less_to_remove.sub("", ingredient)
    
    # Remove "percentage reduced" instances (e.g., "50% reduced fat") - do this before removing % and digits.
    ingredient = percentage_reduced_to_remove.sub("", ingredient)
    
    # Remove various unwanted symbols.
    ingredient = symbols_to_remove.sub("", ingredient)
    
    # Remove digits.
    ingredient = digits_to_remove.sub("", ingredient)
    
    # Replace common symbols with their word equivalent (so "Ben&Jerry's" == "Ben and Jerry's", etc.).
    ingredient = symbols_to_replace_with_and.sub(" and ", ingredient)
    
    # Remove anything between parentheses (this mainly includes volume measurements or 
    # unnecessary cooking instructions).
    ingredient = parentheses_content_to_remove.sub(" ", ingredient)
    
    # Remove all instances of "no [sugar, sodium, fat, whatever] added".
    ingredient = no_blank_added_to_remove.sub("", ingredient)
    
    # Remove any instance of "reduced" and the word after it (e.g., 'reduced sodium').
    ingredient = reduced_and_following_word_to_remove.sub("", ingredient)
    
    # Remove any instance of "low" and the word after it (e.g., 'low fat', 'low sodium').
    ingredient = low_and_following_word_to_remove.sub("", ingredient)
    
    # Remove any instance of "less" and the word after it (e.g., 'less sodium').
    ingredient = less_and_following_word_to_remove.sub("", ingredient)
    
    # Remove any instance of "free" and the word before it (e.g., 'sugar free', 'sodium free')
    ingredient = free_and_previous_word_to_remove.sub("", ingredient)
    
    # Remove any instance of "non" and the word after or "nonfat"/"nonhydrogenated"/"nondairy"
    ingredient = non_and_following_word_to_remove.sub("", ingredient)
    ingredient = nonfat_removal.sub("", ingredient)
    ingredient = nonhydrogenated_removal.sub("", ingredient)
    ingredient = nondairy_removal.sub("", ingredient)
    
    # Remove excess spacing in between words after first cleaning pass.
    ingredient = multiple_spaces_to_trim.sub(" ", ingredient)
    
    # Map common plural ingredients to singular ingredients
    for plural, singular in map_plural_to_singular:
        ingredient = re.sub(r"\b{}\b".format(plural), singular, ingredient)
        
    # Remove unuseful words
    for phrase in phrases_to_remove:
        ingredient = re.sub(r"\b{}\b".format(phrase), "", ingredient)
    
    # Map several similar phrases to the other equivalents to maintain consistency.
    for pattern, replacement in phrases_to_map:
        if ingredient in pattern:
            ingredient = replacement
    
    # Remove excess spacing in between words after second cleaning pass and leading/trailing whitespace.
    ingredient = multiple_spaces_to_trim.sub(" ", ingredient)
    ingredient = ingredient.strip()
    
    return ingredient


def remove_noise_recipe(recipe):
    recipe = list(map(remove_noise_ingredient, recipe))
    
    recipe = list(filter(lambda x: len(x) > 0, recipe))
    
    return recipe

def remove_noise_data(raw_data):
    clean_data = list(map(remove_noise_recipe, raw_data))
    
    return clean_data

In [13]:
start_time = time.time()
df_train.ingredients = remove_noise_data(df_train["ingredients"])
time_taken = time.time() - start_time
print("Cleaning took %f", time_taken)

Cleaning took %f 150.67625093460083


In [14]:
# Saving this, it took forever.
df_train.to_csv("df_train.csv", index = False)

In [15]:
cleaned_ingredients = []
for li in df_train.ingredients:
    cleaned_ingredients.extend(li)

cleaned_ingredients = np.array(cleaned_ingredients)
cleaned_ingredients = np.unique(cleaned_ingredients)

In [16]:
cleaned_ingredients

array(['a taste thai rice noodles', 'abalone', 'abbamele', ..., 'ziti',
       'zucchini', 'zucchini blossoms'], dtype='<U63')

In [17]:
t=Tokenizer()
t.fit_on_texts(df_train['ingredients'])
train_encoded=t.texts_to_matrix(df_train['ingredients'],mode='tfidf')

In [18]:
#Clean test ingredients
start_time = time.time()
df_test.ingredients = remove_noise_data(df_test.ingredients)
time_taken = time.time() - start_time
print("Cleaning took %f", time_taken)

Cleaning took %f 50.260130405426025


In [23]:
df_test.head()

Unnamed: 0,id,ingredients
0,29774,"[egg, beef stock, rice cakes]"
1,29775,"[pasta, orange, thyme, peas, celery, tomato pa..."
2,29776,"[olive oil, onion, red wine, top sirloin steak..."
3,29777,"[black pepper, patis, chicken stock, garlic, o..."
4,29778,"[pepper, garlic, tomato paste, salt, olive oil..."


In [19]:
test_encoded=t.texts_to_matrix(df_test['ingredients'],mode='tfidf')
test_encoded.shape

(10000, 5228)

In [20]:
train_encoded.shape

(29774, 5228)

In [48]:
label2index={cuisine:i for i,cuisine in enumerate(cuisines.index)}
label2index

{'brazilian': 0,
 'british': 1,
 'cajun_creole': 2,
 'chinese': 3,
 'filipino': 4,
 'french': 5,
 'greek': 6,
 'indian': 7,
 'irish': 8,
 'italian': 9,
 'jamaican': 10,
 'japanese': 11,
 'korean': 12,
 'mexican': 13,
 'moroccan': 14,
 'russian': 15,
 'southern_us': 16,
 'spanish': 17,
 'thai': 18,
 'vietnamese': 19}

In [71]:
y = []

for item in df_train.cuisine:
    if item in label2index.keys():
        y.append(label2index[item])
y_encoded=to_categorical(y,20)

In [72]:
y_encoded.shape

(29774, 20)

In [182]:
X_train,X_val,y_train,y_val=train_test_split(train_encoded,y_encoded,test_size=0.25,random_state=22)

In [183]:
#partial_x_train, X_test, partial_y_train, y_test = train_test_split(X_train, y_train, test_size = 0.25)

In [184]:
model = models.Sequential()
model.add(layers.Dense(1000, activation='relu', input_shape=(5228,)))
model.add(Dropout(0.75, name='dropout_1'))
model.add(layers.Dense(100, activation='relu', input_shape=(5228,)))
model.add(Dropout(0.8, name='dropout_2'))
#model.add(layers.Dense(64, activation='relu'))
#model.add(layers.Dense(64, activation='relu'))
model.add(layers.Dense(20, activation='softmax'))

In [185]:
train_encoded.shape

(29774, 5228)

In [186]:
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [187]:
monitor=[
    EarlyStopping(monitor='val_loss',patience=5,verbose=1),
    ModelCheckpoint('best-model-0.h5',monitor='val_loss',save_best_only=True,save_weights_only=True)
]

model.fit(X_train,y_train,
         validation_data=(X_val,y_val),
         epochs=100,
         callbacks=monitor,
         batch_size=256)

Train on 22330 samples, validate on 7444 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 00018: early stopping


<keras.callbacks.History at 0x262dedcf860>

In [188]:
y_pred = model.predict(test_encoded)
y_pred.shape

(10000, 20)

In [189]:
result_encoded = []
for item in y_pred:
    result_encoded.append(item.argmax())
    
result_encoded[:10]

[12, 9, 9, 4, 9, 2, 9, 18, 3, 16]

In [190]:
results=[]

for i in result_encoded:
    for k,v in label2index.items():
        if v==i:
            results.append(k)
        

In [191]:
results[:10]

['korean',
 'italian',
 'italian',
 'filipino',
 'italian',
 'cajun_creole',
 'italian',
 'thai',
 'chinese',
 'southern_us']

In [192]:
submission=pd.DataFrame(list(zip(df_test['id'],results)),columns=['id','cuisine'])
submission.to_csv('submission.csv',header=True,index=False)

In [193]:
submission=pd.read_csv('submission.csv')
submission.head()

Unnamed: 0,id,cuisine
0,29774,korean
1,29775,italian
2,29776,italian
3,29777,filipino
4,29778,italian


In [175]:
from sklearn.metrics import accuracy_score

In [176]:
y_test_encoded = []
for item in y_test:
    y_test_encoded.append(item.argmax())
    
y_test_encoded[:10]

[13, 2, 3, 18, 9, 9, 18, 16, 9, 13]

In [177]:
y_test_check = []

for i in y_test_encoded:
    for k,v in label2index.items():
        if v==i:
            y_test_check.append(k)

In [178]:
y_pred_test = model.predict(X_test)

In [179]:
y_pred_test_encoded = []
for item in y_pred_test:
    y_pred_test_encoded.append(item.argmax())
    
y_pred_test_encoded[:10]

[13, 2, 3, 19, 5, 5, 18, 16, 9, 9]

In [180]:
y_pred_test_check = []

for i in y_pred_test_encoded:
    for k,v in label2index.items():
        if v==i:
            y_pred_test_check.append(k)

y_pred_test_check[:10]

['mexican',
 'cajun_creole',
 'chinese',
 'vietnamese',
 'french',
 'french',
 'thai',
 'southern_us',
 'italian',
 'italian']

In [181]:
accuracy_score(y_pred_test_check, y_test_check)

0.7753895754970446