In [1]:
# Data transformations
import pandas as pd
import numpy as np

# Modeling
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression

# Validation
from sklearn.model_selection import KFold

# Load the raw recipe files (training set first, then test set)
train_df, test_df = (
    pd.read_json(json_path) for json_path in ('train.json', 'test.json')
)

In [2]:
# Collects every distinct ingredient that appears in any recipe
def create_ingredient_set(data):
    """Return the distinct ingredients found in ``data['ingredients']``.

    Parameters
    ----------
    data : mapping or DataFrame
        Must support ``data['ingredients']`` and yield one iterable of
        ingredients per recipe.

    Returns
    -------
    list
        The unique ingredients in ascending sorted order. Despite the
        function name the result is a list, which gives callers a stable
        column order.
    """
    unique_ingredients = {
        item
        for recipe in data['ingredients']
        for item in recipe
    }
    return sorted(unique_ingredients)

# Gather the headline counts first, then report them
n_samples = len(train_df)
n_categories = len(train_df['cuisine'].value_counts())
n_unique_ingredients = len(create_ingredient_set(train_df))

print('There are {} samples'.format(n_samples))
print('There are {} categories'.format(n_categories))
print('There are {} unique ingredients'.format(n_unique_ingredients))

There are 39774 samples
There are 20 categories
There are 6714 unique ingredients


### Representing each dish as a binary ingredient feature vector: 

In [3]:
# Entry point: builds the ingredient indicator columns and attaches them to the input frame
def create_ingredient_df(data):
    """Return ``data`` with one 0/1 column per ingredient appended on the right.

    The ingredient vocabulary is derived from ``data`` itself via
    ``create_ingredient_set``; ``create_dishes_df`` then encodes every
    recipe against that vocabulary.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``'ingredients'`` column holding a list per row.

    Returns
    -------
    pandas.DataFrame
        The original columns followed by the indicator columns, joined
        column-wise on the row index.
    """
    vocabulary = create_ingredient_set(data)
    indicator_columns = create_dishes_df(data, vocabulary)
    frames = [data, indicator_columns]

    return pd.concat(frames, axis = 1, join = 'outer')

In [4]:
# Creates dataframe of ingredient presence in recipe
def create_dishes_df(data, ingredient_set):
    
    contain_ingredient = []
    
    for list_of_ingredients in data['ingredients']:
        current_dish = []
        for ingredient in ingredient_set:
            if ingredient in list_of_ingredients:
                current_dish.append(1)
            else:
                current_dish.append(0)
        contain_ingredient.append(current_dish) 
        
    dishes = pd.DataFrame(contain_ingredient, columns = ingredient_set)
    
    return dishes

In [5]:
# Builds the train/test feature frames (the CSV export below is left disabled)

train_data = create_ingredient_df(train_df)

# Encode the test recipes against the TRAINING vocabulary. Building the
# vocabulary from test_df itself would yield a different set and order of
# columns, so the test frame would not match the features a model is fit on.
test_data = pd.concat(
    [test_df, create_dishes_df(test_df, create_ingredient_set(train_df))],
    axis = 1,
)

# train_data.to_csv('train.csv', index = False)
# test_data.to_csv('test.csv', index = False)

Bernoulli outperformed Gaussian in terms of cross-validation accuracy. This is because the data set does not follow a normal distribution: every feature is a binary (0/1) ingredient indicator. Bernoulli Naïve Bayes reaches almost twice the accuracy of Gaussian Naïve Bayes because a Bernoulli distribution is a more accurate representation of our data set.

### Using Naïve Bayes classifiers with a Gaussian and a Bernoulli feature-likelihood assumption to perform 3-fold cross-validation on the training set:

### Using a Logistic Regression model to perform 3-fold cross-validation on the training set:

In [6]:
# 3-fold cross-validation comparing Gaussian NB, Bernoulli NB and logistic regression.
# Not populated in this cell; kept so any other cell that refers to these names still runs.
label_predict, label_actual = [], []
kf = KFold(n_splits=3, shuffle=False)

# Features: the indicator columns appended after the three original columns.
# Labels: selected by name, because the position of 'cuisine' depends on the
# column order read_json produces, which positional indexing silently relies on.
cv_features = train_data.iloc[:, 3:].values
cv_labels = train_data['cuisine'].values

count = 0
correct_gaussian = 0
correct_bernoulli = 0
correct_logistic = 0
for train_index, test_index in kf.split(cv_features):
    train_fold, test_fold = cv_features[train_index], cv_features[test_index]
    train_label, test_label = cv_labels[train_index], cv_labels[test_index]

    clf_gaussian = GaussianNB().fit(train_fold,train_label)
    clf_bernoulli = BernoulliNB().fit(train_fold,train_label)
    clf_logistic = LogisticRegression().fit(train_fold,train_label)

    # Score the whole held-out fold with one predict call per model; this
    # gives the same hit counts as predicting dish by dish, without tens of
    # thousands of single-row calls.
    count += len(test_label)
    correct_gaussian += int(np.sum(clf_gaussian.predict(test_fold) == test_label))
    correct_bernoulli += int(np.sum(clf_bernoulli.predict(test_fold) == test_label))
    correct_logistic += int(np.sum(clf_logistic.predict(test_fold) == test_label))

# Accuracy pooled over all held-out dishes across the three folds
print("Gaussian: ",correct_gaussian/count)
print("Bernoulli: ",correct_bernoulli/count)
print("Logistic:", correct_logistic/count)



Gaussian:  0.3798461306381053
Bernoulli:  0.6835369839593705
Logistic: 0.7755568964650275


In [7]:
# Create test data based on the list of ingredients from the training data,
# so the test feature columns match the columns the model is trained on
ingredient_set = create_ingredient_set(train_data)

# Reuse the shared encoder and the already-loaded test frame instead of
# re-reading test.json and repeating the indicator loop here
test_data = pd.concat([test_df, create_dishes_df(test_df, ingredient_set)], join='outer', axis=1)

# Fit logistic regression and store in csv for Kaggle competition submission
# Features are the indicator columns after the three original train columns;
# the label is selected by name so it does not depend on column order.
clf_logistic = LogisticRegression().fit(train_data.iloc[:,3:].values, train_data['cuisine'].values)

# Predict every test dish in a single call and build the result frame once;
# assigning rows one at a time with .loc re-allocates the frame on every row.
# In the test frame, column 0 is written out as the id and the indicator
# columns start at position 2, as in the original per-row code.
predict_values = pd.DataFrame(
    {
        'id': test_data.iloc[:,0].values,
        'cuisine': clf_logistic.predict(test_data.iloc[:,2:].values),
    },
    columns=['id','cuisine'],
)

predict_values.to_csv('results.csv', index = False)

