In [53]:
%pylab inline
from sklearn import \
        datasets, \
        linear_model, \
        cross_validation, \
        naive_bayes, \
        preprocessing, \
        svm

import pandas as pd
import matplotlib.pyplot as plt
import json
from pprint import pprint
from sklearn.cross_validation import KFold

Populating the interactive namespace from numpy and matplotlib


In [14]:
df_train = pd.read_json('train.json')
df_test = pd.read_json('test.json')
df_test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [19]:
print "number of dishes: " + str(len(df_train))

number of dishes: 39774


In [30]:
list_ingredients = list([])

for d in df_train['ingredients']:
    s = ','.join(d).strip()
    for ss in s.split(','):
        if ss not in list_ingredients:
            list_ingredients.append(ss)

print "number of unique ingredients: " + str(len(list_ingredients))

number of unique ingredients: 6725


In [24]:
list_ingredients.sort()

print list_ingredients

[u'', u' 1 inch thick', u' cook and drain', u' cooked and drained', u' crisp-cooked and crumbled', u' crush', u' cut french into loaf', u' cut into', u' cut into italian loaf', u' cut into serving pieces', u' drain', u' drain and flake', u' drained and chopped', u' fine chop', u' peel and devein', u' rins and drain', u' rins and pat dry', u' skinless chicken breast', u' slice', u' sliced', u' soften', u' split and toasted', u' thaw', u' thaw and drain', u' thawed and squeezed dry', u' thawed and undiluted', u' undrain', u' undrain and chop', u' well scrub', u' well scrubbed', u' wine', u'(    oz.) tomato sauce', u'(   oz.) tomato paste', u'(10 oz.) frozen chopped spinach', u'(14 oz.) sweetened condensed milk', u'(14.5 oz.) diced tomatoes', u'(15 oz.) refried beans', u'1% low-fat buttermilk', u'1% low-fat chocolate milk', u'1% low-fat cottage cheese', u'1% low-fat milk', u'2 1/2 to 3 lb. chicken', u'2% low fat cheddar chees', u'2% low-fat cottage cheese', u'2% lowfat greek yogurt', u'2%

In [35]:
list_cuisines = list([])

for c in df_train['cuisine']:
    if c not in list_cuisines:
        list_cuisines.append(c)

print "number of cuisines: " + str(len(list_cuisines))

number of cuisines: 20


In [36]:
print list_cuisines

[u'greek', u'southern_us', u'filipino', u'indian', u'jamaican', u'spanish', u'italian', u'mexican', u'chinese', u'british', u'thai', u'vietnamese', u'cajun_creole', u'brazilian', u'french', u'japanese', u'irish', u'korean', u'moroccan', u'russian']


In [47]:
# Feature Matrix - Training Data

X1 = np.zeros((len(df_train), len(list_ingredients)))
Y1 = np.zeros((len(df_train)))

for i, d in enumerate(df_train['ingredients']):
    s = ','.join(d).strip()
    for ingredient in s.split(','):
        X1[i][list_ingredients.index(ingredient)] = 1
    Y1[i] = list_cuisines.index(df_train['cuisine'][i])
    
print "shape of X1: " + str(X1.shape)

shape of X1: (39774, 6725)


In [46]:
# Feature Matrix - Test Data

X2 = np.zeros((len(df_test['ingredients']), len(list_ingredients)))

for i, d in enumerate(df_test['ingredients']):
    s = ','.join(d).strip()
    for ingredient in s.split(','):
        if ingredient in list_ingredients:
            X2[i][list_ingredients.index(ingredient)] = 1
            
print "shape of X2: " + str(X2.shape)

shape of X2: (9944, 6725)


In [82]:
# Cross-validation - Naive Bayes Gaussian prior

cv = KFold(len(df_train))
for trains, tests in cv:
    X1_cv = []
    Y1_cv = []
    X2_cv = []
    Y2_cv = []
    for i in trains:
        X1_cv.append(X1[i])
        Y1_cv.append(list_cuisines.index(df_train['cuisine'][i]))
    for i in tests:
        X2_cv.append(X1[i])
        Y2_cv.append(list_cuisines.index(df_train['cuisine'][i]))
    classifier_ga = naive_bayes.GaussianNB()
    Y2_pred = classifier_ga.fit(X1_cv, Y1_cv).predict(X2_cv)
    n_correct = 0;
    for i in range(0, len(Y2_cv)):
        if Y2_cv[i] == Y2_pred[i]:
            n_correct += 1
    print 'Gaussian ' + str(tests) + ': ' + str(n_correct) + " - " + str(100*float(n_correct)/len(Y2_cv)) + "%"

Gaussian [    0     1     2 ..., 13255 13256 13257]: 5025 - 37.9016442902%
Gaussian [13258 13259 13260 ..., 26513 26514 26515]: 5071 - 38.2486046161%
Gaussian [26516 26517 26518 ..., 39771 39772 39773]: 5007 - 37.7658772062%


In [83]:
# Cross-validation: Naive Bayes Bernoulli prior

cv = KFold(len(df_train))
for trains, tests in cv:
    X1_cv = []
    Y1_cv = []
    X2_cv = []
    Y2_cv = []
    for i in trains:
        X1_cv.append(X1[i])
        Y1_cv.append(list_cuisines.index(df_train['cuisine'][i]))
    for i in tests:
        X2_cv.append(X1[i])
        Y2_cv.append(list_cuisines.index(df_train['cuisine'][i]))
    classifier_be = naive_bayes.BernoulliNB()
    Y2_pred = classifier_be.fit(X1_cv, Y1_cv).predict(X2_cv)
    n_correct = 0;
    for i in range(0, len(Y2_cv)):
        if Y2_cv[i] == Y2_pred[i]:
            n_correct += 1
    print 'Bernoulli ' + str(tests) + ': ' + str(n_correct) + " - " + str(100*float(n_correct)/len(Y2_cv)) + "%"

Bernoulli [    0     1     2 ..., 13255 13256 13257]: 9072 - 68.4266103485%
Bernoulli [13258 13259 13260 ..., 26513 26514 26515]: 9013 - 67.9815960175%
Bernoulli [26516 26517 26518 ..., 39771 39772 39773]: 9104 - 68.6679740534%


In [84]:
# Cross-validation: Logistic Regression

cv = KFold(len(df_train))
for trains, tests in cv:
    X1_cv = []
    Y1_cv = []
    X2_cv = []
    Y2_cv = []
    for i in trains:
        X1_cv.append(X1[i])
        Y1_cv.append(list_cuisines.index(df_train['cuisine'][i]))
    for i in tests:
        X2_cv.append(X1[i])
        Y2_cv.append(list_cuisines.index(df_train['cuisine'][i]))
    classifier_lr = linear_model.LogisticRegression()
    Y2_pred = classifier_lr.fit(X1_cv, Y1_cv).predict(X2_cv)
    n_correct = 0;
    for i in range(0, len(Y2_cv)):
        if Y2_cv[i] == Y2_pred[i]:
            n_correct += 1
    print 'Logistic Regression ' + str(tests) + ': ' + str(n_correct) + " - " + str(100*float(n_correct)/len(Y2_cv)) + "%"

Logistic Regression [    0     1     2 ..., 13255 13256 13257]: 10286 - 77.5833459044%
Logistic Regression [13258 13259 13260 ..., 26513 26514 26515]: 10237 - 77.2137577312%
Logistic Regression [26516 26517 26518 ..., 39771 39772 39773]: 10324 - 77.869965304%


In [86]:
# Logistic Regression on all data

Y_pred = classifier_lr.fit(X1, Y1).predict(X2)

print classification complete''

SyntaxError: invalid syntax (<ipython-input-86-b09a5916c65c>, line 5)