## Part 1: Reading in and exploring the data

In [3]:
import pandas as pd

# read the training recipes from JSON into a DataFrame and preview the first rows
train = pd.read_json('data/train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


## Part 2: Feature engineering

In [4]:
# define a function that accepts a DataFrame and adds new features
import numpy as np  # imported here so this cell does not depend on a later cell


def make_features(df):
    """Return a copy of ``df`` with per-recipe ingredient summary columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ingredients`` column whose values are sized
        sequences of strings (``len`` is called on each value and each item).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with two extra columns:
        ``num_ingredients`` (number of items in the recipe) and
        ``ingredient_length`` (mean character length of the items, NaN for a
        recipe with no ingredients).
    """
    def mean_item_length(items):
        # np.mean([]) emits a RuntimeWarning before returning NaN;
        # return NaN directly for an empty recipe instead.
        if len(items) == 0:
            return np.nan
        return np.mean([len(item) for item in items])

    # Work on a copy so re-running the cell never alters the caller's frame.
    out = df.copy()
    out['num_ingredients'] = out.ingredients.apply(len)
    out['ingredient_length'] = out.ingredients.apply(mean_item_length)
    return out

In [5]:
# check that the function works
# (numpy is imported here because make_features calls np.mean; the training
# data is re-read from disk so the features are built from the raw file)
import numpy as np

train = make_features(pd.read_json('data/train.json'))
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients,ingredient_length
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,12.0
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,6.75
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1


In [6]:
# define a dictionary that maps each cuisine to its group number
cuisine_to_group = {
    # group 1
    'chinese': 1,
    'filipino': 1,
    'japanese': 1,
    'korean': 1,
    'thai': 1,
    'vietnamese': 1,
    # group 2
    'british': 2,
    'french': 2,
    'irish': 2,
    'russian': 2,
    'southern_us': 2,
    # group 3
    'greek': 3,
    'italian': 3,
    'moroccan': 3,
    'spanish': 3,
    # group 4
    'brazilian': 4,
    'cajun_creole': 4,
    'indian': 4,
    'jamaican': 4,
    'mexican': 4,
}

In [7]:
# look at the raw ingredient list of a single recipe (row label 10)
train.loc[10, 'ingredients']

[u'pimentos',
 u'sweet pepper',
 u'dried oregano',
 u'olive oil',
 u'garlic',
 u'sharp cheddar cheese',
 u'pepper',
 u'swiss cheese',
 u'provolone cheese',
 u'canola oil',
 u'mushrooms',
 u'black olives',
 u'sausages']

In [8]:
# update make_features to create a new column 'ingredients_str'
def make_features(df):
    """Return a copy of ``df`` with ingredient summary and text columns added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ingredients`` column whose values are sized
        sequences of strings (``len`` is called on each value and each item).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left untouched) with three extra columns:
        ``num_ingredients`` (number of items in the recipe),
        ``ingredient_length`` (mean character length of the items, NaN for a
        recipe with no ingredients) and ``ingredients_str`` (the ingredient
        list converted to a single string with ``astype(str)``).
    """
    def mean_item_length(items):
        # np.mean([]) emits a RuntimeWarning before returning NaN;
        # return NaN directly for an empty recipe instead.
        if len(items) == 0:
            return np.nan
        return np.mean([len(item) for item in items])

    # Work on a copy so re-running the cell never alters the caller's frame.
    out = df.copy()
    out['num_ingredients'] = out.ingredients.apply(len)
    out['ingredient_length'] = out.ingredients.apply(mean_item_length)
    # Keep the list's own string form (brackets and quotes included): this is
    # the text the vectorizers tokenize.
    out['ingredients_str'] = out.ingredients.astype(str)
    return out

In [9]:
# run make_features and check that it worked
# (the redefined function is applied to the existing train frame, then the
# string form of the first recipe's ingredient list is displayed)
train = make_features(train)
train.loc[0, 'ingredients_str']

"[u'romaine lettuce', u'black olives', u'grape tomatoes', u'garlic', u'pepper', u'purple onion', u'seasoning', u'garbanzo beans', u'feta cheese crumbles']"

In [10]:
# map the cuisines to their group numbers
# (Series.map looks each cuisine up in cuisine_to_group; a cuisine missing
# from the dictionary would come back as NaN)
train['group'] = train.cuisine.map(cuisine_to_group)
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients,ingredient_length,ingredients_str,group
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,12.0,"[u'romaine lettuce', u'black olives', u'grape ...",3
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909,"[u'plain flour', u'ground pepper', u'salt', u'...",2
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333,"[u'eggs', u'pepper', u'salt', u'mayonaise', u'...",1
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,6.75,"[u'water', u'vegetable oil', u'wheat', u'salt']",4
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1,"[u'black pepper', u'shallots', u'cornflour', u...",4


In [11]:
# check that all recipes were assigned a group
# (counts the rows whose group is missing; 0 means every cuisine was mapped)
train.group.isnull().sum()

0

In [12]:
# define a "global" X and y, then define an X and y for each group
def _rows_in_group(group_number, column):
    """Select `column` for the training rows whose group equals `group_number`."""
    return train.loc[train.group == group_number, column]


# global problem: ingredient text -> cuisine group
X, y = train.ingredients_str, train.group

# per-group problems: ingredient text -> cuisine, restricted to one group
X1, y1 = _rows_in_group(1, 'ingredients_str'), _rows_in_group(1, 'cuisine')
X2, y2 = _rows_in_group(2, 'ingredients_str'), _rows_in_group(2, 'cuisine')
X3, y3 = _rows_in_group(3, 'ingredients_str'), _rows_in_group(3, 'cuisine')
X4, y4 = _rows_in_group(4, 'ingredients_str'), _rows_in_group(4, 'cuisine')

In [16]:
# define a "global" pipeline, then define a pipeline for each group
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB


def _new_pipeline():
    """Build a fresh, unfitted bag-of-words + multinomial Naive Bayes pipeline."""
    return make_pipeline(CountVectorizer(), MultinomialNB())


# every model gets its own independent pipeline instance
pipe_main = _new_pipeline()
pipe_1 = _new_pipeline()
pipe_2 = _new_pipeline()
pipe_3 = _new_pipeline()
pipe_4 = _new_pipeline()

In [18]:
# calculate the cross-validated accuracy for each pipeline
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; prefer its replacement and fall back only on old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:  # scikit-learn < 0.18
    from sklearn.cross_validation import cross_val_score

# mean accuracy over 5 folds: group model first, then one line per within-group model
print(cross_val_score(pipe_main, X, y, cv=5, scoring='accuracy').mean())
print(cross_val_score(pipe_1, X1, y1, cv=5, scoring='accuracy').mean())
print(cross_val_score(pipe_2, X2, y2, cv=5, scoring='accuracy').mean())
print(cross_val_score(pipe_3, X3, y3, cv=5, scoring='accuracy').mean())
print(cross_val_score(pipe_4, X4, y4, cv=5, scoring='accuracy').mean())

0.827651379606
0.769054289191
0.757672844551
0.869907044834
0.904089986908


In [19]:
# fit each pipeline with the relevant X and y
training_sets = [
    (pipe_main, X, y),
    (pipe_1, X1, y1),
    (pipe_2, X2, y2),
    (pipe_3, X3, y3),
]
for pipeline, features, target in training_sets:
    pipeline.fit(features, target)
# kept as the final expression so the cell still displays the fitted pipeline
pipe_4.fit(X4, y4)

Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [21]:
# read in test.json and add the additional features
# (the same make_features used for the training data, so the test frame gets
# the same engineered columns, including ingredients_str)
new = make_features(pd.read_json('data/test.json'))
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredient_length,ingredients_str
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,"[u'baking powder', u'eggs', u'all-purpose flou..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,"[u'sugar', u'egg yolks', u'corn starch', u'cre..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,"[u'sausage links', u'fennel bulb', u'fronds', ..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,"[u'meat cuts', u'file powder', u'smoked sausag..."
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,"[u'ground black pepper', u'salt', u'sausage ca..."


In [22]:
# run make_features on the new data
# NOTE: `new` already went through make_features when it was loaded, so this
# call only recomputes the same columns from `ingredients` (redundant but harmless)
new = make_features(new)

In [23]:
# for new data, first make group predictions
# (stage 1: pipe_main was fit on the group labels, so it predicts a cuisine
# group for each new recipe from its ingredient string)
X_new = new.ingredients_str
new_pred_group = pipe_main.predict(X_new)
new_pred_group

array([2, 2, 3, ..., 3, 4, 4], dtype=int64)

In [24]:
# then within each predicted group, make class predictions
# (stage 2: each group's pipeline only sees the rows routed to that group)
group_pipelines = [pipe_1, pipe_2, pipe_3, pipe_4]
new_pred_class_1, new_pred_class_2, new_pred_class_3, new_pred_class_4 = [
    group_pipeline.predict(X_new[new_pred_group == group_number])
    for group_number, group_pipeline in enumerate(group_pipelines, start=1)
]
for class_predictions in (new_pred_class_1, new_pred_class_2,
                          new_pred_class_3, new_pred_class_4):
    print(class_predictions)

[u'chinese' u'japanese' u'vietnamese' ..., u'chinese' u'chinese'
 u'vietnamese']
[u'british' u'southern_us' u'southern_us' ..., u'southern_us' u'french'
 u'french']
[u'spanish' u'italian' u'spanish' ..., u'italian' u'italian' u'italian']
[u'cajun_creole' u'mexican' u'indian' ..., u'mexican' u'cajun_creole'
 u'mexican']


In [25]:
# preview the new data before the class predictions are attached
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredient_length,ingredients_str
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,"[u'baking powder', u'eggs', u'all-purpose flou..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,"[u'sugar', u'egg yolks', u'corn starch', u'cre..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,"[u'sausage links', u'fennel bulb', u'fronds', ..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,"[u'meat cuts', u'file powder', u'smoked sausag..."
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,"[u'ground black pepper', u'salt', u'sausage ca..."


In [27]:
# add the class predictions to the DataFrame of new data
# (each group's predictions are written back onto the rows predicted for that group)
predictions_by_group = [new_pred_class_1, new_pred_class_2,
                        new_pred_class_3, new_pred_class_4]
for group_number, class_predictions in enumerate(predictions_by_group, start=1):
    new.loc[new_pred_group == group_number, 'pred_class'] = class_predictions

In [28]:
# create a submission file (score: 0.70475)
# one row per test recipe: id as the index, predicted cuisine as the only column
submission = pd.DataFrame({'id':new.id, 'cuisine':new.pred_class})
submission = submission.set_index('id')
submission.to_csv('sub4.csv')

In [29]:
# confirm that the pred_class column was added
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredient_length,ingredients_str,pred_class
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,"[u'baking powder', u'eggs', u'all-purpose flou...",british
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,"[u'sugar', u'egg yolks', u'corn starch', u'cre...",southern_us
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,"[u'sausage links', u'fennel bulb', u'fronds', ...",spanish
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,"[u'meat cuts', u'file powder', u'smoked sausag...",cajun_creole
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,"[u'ground black pepper', u'salt', u'sausage ca...",italian


In [30]:
# sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20;
# prefer its replacement and fall back only on old installations.
try:
    from sklearn.model_selection import RandomizedSearchCV
except ImportError:  # scikit-learn < 0.18
    from sklearn.grid_search import RandomizedSearchCV

In [31]:
# for any continuous parameters, specify a distribution instead of a list of options
import scipy as sp
# `import scipy` alone does not guarantee that the `stats` submodule is loaded;
# import it explicitly so `sp.stats` never depends on another library having done so.
import scipy.stats

param_grid = {}
# tokenization: words of 2+ characters, or whole quoted ingredient names
# (the second pattern captures the text between single quotes in ingredients_str)
param_grid['countvectorizer__token_pattern'] = [r"\b\w\w+\b", r"'([a-z ]+)'"]
# minimum document frequency a token needs to be kept in the vocabulary
param_grid['countvectorizer__min_df'] = [1, 2, 3, 4]
# Naive Bayes smoothing, sampled uniformly from [0, 1]
param_grid['multinomialnb__alpha'] = sp.stats.uniform(scale=1)
param_grid

{'countvectorizer__min_df': [1, 2, 3, 4],
 'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"],
 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen at 0x13bc77f0>}

In [53]:
# set up one randomized search per pipeline (nothing is fitted in this cell):
# each search samples 150 parameter settings from param_grid and scores them
# by 10-fold cross-validated accuracy; random_state fixes the sampled settings
search_settings = dict(cv=10, scoring='accuracy', n_iter=150, random_state=1)

rand_main = RandomizedSearchCV(pipe_main, param_grid, **search_settings)
rand_1 = RandomizedSearchCV(pipe_1, param_grid, **search_settings)
rand_2 = RandomizedSearchCV(pipe_2, param_grid, **search_settings)
rand_3 = RandomizedSearchCV(pipe_3, param_grid, **search_settings)
rand_4 = RandomizedSearchCV(pipe_4, param_grid, **search_settings)

In [56]:
# fit every randomized search on its own X and y; %time reports how long each fit takes
%time rand_main.fit(X, y)
%time rand_1.fit(X1, y1)
%time rand_2.fit(X2, y2)
%time rand_3.fit(X3, y3)
%time rand_4.fit(X4, y4)

Wall time: 26min 9s
Wall time: 6min 12s
Wall time: 6min 46s
Wall time: 7min 50s
Wall time: 9min 31s


RandomizedSearchCV(cv=10, error_score='raise',
          estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=Non..., vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
          fit_params={}, iid=True, n_iter=150, n_jobs=1,
          param_distributions={'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"], 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000000013BC77F0>, 'countvectorizer__min_df': [1, 2, 3, 4]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          scoring='accuracy', verbose=0)

In [57]:
# best cross-validated accuracy found by each search: group model first, then groups 1-4
# (inspect `.best_params_` on any search to see the winning settings)
for fitted_search in (rand_main, rand_1, rand_2, rand_3, rand_4):
    print(fitted_search.best_score_)

0.845376376527
0.79577377253
0.786802599149
0.890972928024
0.926544240401


In [59]:
# for new data, first make group predictions
# (same stage-1 step as before, but using the tuned search rand_main instead of pipe_main)
X_new = new.ingredients_str
new_pred_group = rand_main.predict(X_new)
new_pred_group

array([2, 2, 3, ..., 3, 4, 4], dtype=int64)

In [60]:
# then within each predicted group, make class predictions
# (stage 2 with the tuned searches: each one only sees the rows routed to its group)
group_searches = [rand_1, rand_2, rand_3, rand_4]
new_pred_class_1, new_pred_class_2, new_pred_class_3, new_pred_class_4 = [
    group_search.predict(X_new[new_pred_group == group_number])
    for group_number, group_search in enumerate(group_searches, start=1)
]
for class_predictions in (new_pred_class_1, new_pred_class_2,
                          new_pred_class_3, new_pred_class_4):
    print(class_predictions)

[u'chinese' u'japanese' u'vietnamese' ..., u'thai' u'chinese' u'thai']
[u'british' u'southern_us' u'southern_us' ..., u'southern_us' u'french'
 u'french']
[u'italian' u'italian' u'spanish' ..., u'italian' u'italian' u'italian']
[u'cajun_creole' u'mexican' u'indian' ..., u'mexican' u'cajun_creole'
 u'mexican']


In [61]:
# add the class predictions to the DataFrame of new data
# (each group's predictions are written back onto the rows predicted for that group)
predictions_by_group = [new_pred_class_1, new_pred_class_2,
                        new_pred_class_3, new_pred_class_4]
for group_number, class_predictions in enumerate(predictions_by_group, start=1):
    new.loc[new_pred_group == group_number, 'pred_class'] = class_predictions

In [62]:
# preview the new data with the predictions from the tuned models
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredient_length,ingredients_str,pred_class
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,"[u'baking powder', u'eggs', u'all-purpose flou...",british
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,"[u'sugar', u'egg yolks', u'corn starch', u'cre...",southern_us
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,"[u'sausage links', u'fennel bulb', u'fronds', ...",italian
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,"[u'meat cuts', u'file powder', u'smoked sausag...",cajun_creole
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,"[u'ground black pepper', u'salt', u'sausage ca...",italian


In [63]:
# create a submission file
# one row per test recipe: id as the index, predicted cuisine as the only column
submission = pd.DataFrame({'id':new.id, 'cuisine':new.pred_class})
submission = submission.set_index('id')
submission.to_csv('whats_cooking_st.csv')