In [2]:
# 1. import packages
import json
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import KFold

In [3]:
# 2. read data from json
with open('cooking/train.json') as f:
    train_data = json.load(f)

with open('cooking/test.json') as f:
    test_data = json.load(f)

In [4]:
"""
3. Get these metrics:
dishes_count:integer
cuisine:integer
ingredients:list
ingredients_count:integer
"""

dishes_count = len(train_data)
cuisines = set([])
ingredients = set([])

for i in train_data:
    cuisines.add(i['cuisine'])
    ingredients |= set(i['ingredients'])
cuisines = list(cuisines)
ingredients = list(ingredients)
cuisines_count = len(cuisines)
ingredients_count = len(ingredients)

print("unique cuisines: {}".format(cuisines_count))
print("unique ingredients: {}".format(ingredients_count))

unique cuisines: 20
unique ingredients: 6714


In [5]:
"""
4. Represent sample data in a n by d matrix, in which n == sample_numbers, d == unique_ingredients_count.
Mark the element as 1 if the sample has this ingredient.
"""
train_x = np.zeros([dishes_count, ingredients_count])
train_y = np.zeros(dishes_count)

for i, dish in enumerate(train_data):
    for j in dish['ingredients']:
        train_x[i][ingredients.index(j)] = 1
    train_y[i] = cuisines.index(dish['cuisine'])

print("train_x: {}".format(train_x.shape))
# print("train_x: {}".format(train_x))
print("train_y: {}".format(train_y.shape))
# print("train_y: {}".format(train_y))

train_x: (39774, 6714)
train_y: (39774,)


In [12]:
"""
5. 
Naive Bayes Classifier using 3-fold cross validation and:
1) Gaussian distribution prior assumptions
2) Bernuolli distribution prior assumptions

"""
gaussian_avg_accuracies = []
bernoulli_avg_accuracies = []
kf = KFold(n_splits=3)
kf.get_n_splits(train_x)
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    Y_train, Y_test = train_y[train_index], train_y[test_index]
    print("---- GaussianNB ----")
    gaussian = GaussianNB()    
    gaussian.fit(X_train, Y_train)
    g_score = gaussian.score(X_test, Y_test)
    gaussian_avg_accuracies.append(g_score)
    print(g_score)
    print("---- BernoulliNB ----")
    bernoulli = BernoulliNB()
    bernoulli.fit(X_train, Y_train)
    b_score = bernoulli.score(X_test, Y_test)
    bernoulli_avg_accuracies.append(b_score)
    print(b_score)
    
print("Average accuracy of Gassian Naive Bayes: {}".format(np.average(gaussian_avg_accuracies)))
print("Average accuracy of Bermoulli Naive Bayes: {}".format(np.average(bernoulli_avg_accuracies)))

---- GaussianNB ----
0.37901644290239855
---- BernoulliNB ----
0.684190677326897
---- GaussianNB ----
0.3829386031075577
---- BernoulliNB ----
0.6795142555438226
---- GaussianNB ----
0.37758334590435966
---- BernoulliNB ----
0.6869060190073918


In [None]:
"""
6. Discuss the performance of Gaussian prior and Bernuolli prior
"""
# TODO: figure out key metrics to discuss
"""
Analysis:
By doing 3-fold cross-validations on training set:
1. Gaussian Naive Bayes got average accuracy of 0.38
2. Bernoulli Naive Bayes got average accuracy of 0.68
Bernoulli Naive Bayes performs much better in this case, because features of ingredients have "existing" or "non-existing" status.
So Bernoulli could better describe the probability distribution of the features. 
"""

In [15]:
"""
7. Try logistic regression on the training data. Report the average accuracy.
"""
logistic_accuracies = []
# kf = KFold(n_splits=3)
# kf.get_n_splits(train_x)
for train_index, test_index in kf.split(train_x):
    X_train, X_test = train_x[train_index], train_x[test_index]
    Y_train, Y_test = train_y[train_index], train_y[test_index]
    print("---- Logistic Regression ----")
    clf = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    logistic_accuracies.append(score)
    print(score)
logistic_avg_accuracy = np.average(logistic_accuracies)
print("Average accuracy of logistic regression: {}".format(logistic_avg_accuracy))

---- Logistic Regression ----
0.7725147081007694
---- Logistic Regression ----
0.7704782018403983
---- Logistic Regression ----
0.7757580328858048
Average accuracy of logistic regression: 0.772916980942


In [6]:
"""
8. Train the best-performed classifier and submit the labels to Kaggle.
"""
# Prepare for test data
test_x = np.zeros([len(test_data), ingredients_count])

for i, dish in enumerate(test_data):
    for j in dish['ingredients']:
        if j in ingredients:
            test_x[i][ingredients.index(j)] = 1


In [7]:
# Use logistic regression
clf_logistic = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial').fit(train_x, train_y)
p_indices = clf_logistic.predict(test_x)

TypeError: list indices must be integers, not numpy.float64

In [31]:
ret = [['id', 'cuisine']]
for i in range(len(test_x)):
#     r = "{},{}".format(test_data[i]['id'], cuisines[int(p_indices[i])])
#     print(r)
    ret.append([test_data[i]['id'], cuisines[int(p_indices[i])]])

In [33]:
import pandas as pd
df = pd.DataFrame(ret)
df.to_csv('./output.csv', header=False, index=False)