In [1]:
#Import Library
from time import time
import matplotlib.pyplot as plt

from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors.nearest_centroid import NearestCentroid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

import pandas as pd
import numpy as np
import scipy as sp

from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

In [2]:
X = pd.read_csv('dataset/dataset.csv')
Y = X['label']
X.head()


Unnamed: 0,fooditem,energy,protein,totallipid(fat),carbohydrates,fiber,sugars,calcium,iron,potassium,sodium,vitamin C,vitamin A,fattyacid_saturated,fattyacid_totaltrans,cholesterol,label
0,NATURAL CALIFORNIA RAISINS,300,2.5,0.0,77.5,5.0,72.5,50,2.7,775.0,25,0.0,0.0,0.0,0,0,2
1,ALMOND COCONUT CARAMELS,429,3.75,17.86,64.29,3.6,50.0,71,2.29,179.0,0,0.0,12.5,0.0,18,0,1
2,A BLEND OF JUICES PRESSED FROM ORGANIC PAPAYAS...,46,0.42,0.0,11.25,0.8,8.75,11,0.75,125.0,8,30.0,0.042,17.0,0,0,1
3,PINEAPPLE DESSERTS,114,0.0,28.41,0.0,28.41,0.0,0,0.0,13.9,0,0.0,0.0,0.0,0,0,1
4,FARFALLE BARILLA PASTA,357,12.5,1.79,75.0,3.6,3.57,0,8.04,0.0,0,1.0,0.0,0.0,0,0,2


In [3]:
#preprocessing
le = preprocessing.LabelEncoder()
X_enc = le.fit_transform(X['fooditem'])
X_dec = le.inverse_transform(X_enc)
print(X_dec)
X = X.drop('fooditem',1)

X['fooditem'] = X_enc

X.head()

['NATURAL CALIFORNIA RAISINS' 'ALMOND COCONUT CARAMELS'
 'A BLEND OF JUICES PRESSED FROM ORGANIC PAPAYAS & APPLES'
 'PINEAPPLE DESSERTS' 'FARFALLE BARILLA PASTA'
 'ORGANIC MEDIUM FRUITY EXTRA VIRGIN OLIVE OIL'
 'ACAI BLUEBERRY POMEGRANATE WATER' 'ACAI GREEN TEA'
 'ACCELERATE MORNING PROTEIN BAR' 'FROZEN SEASONED SPICY SCALLOP'
 'ROASTED GARLIC BREAD' 'CREAMY MILK CHOCOLATE'
 'ACIDULATED HARD CANDY LOLLIPOPS' 'SESAME HALVA' 'BALSAMIC GLAZE'
 'ACKEES IN SALT WATER' 'FARFALLE BARILLA PASTA' 'COLBY JACK CHEESE'
 'CREAM CHEESE BROWNIES' 'FRESH PEACH' 'SMOKED FISH SALAD' 'SPRING WATER']


Unnamed: 0,energy,protein,totallipid(fat),carbohydrates,fiber,sugars,calcium,iron,potassium,sodium,vitamin C,vitamin A,fattyacid_saturated,fattyacid_totaltrans,cholesterol,label,fooditem
0,300,2.5,0.0,77.5,5.0,72.5,50,2.7,775.0,25,0.0,0.0,0.0,0,0,2,14
1,429,3.75,17.86,64.29,3.6,50.0,71,2.29,179.0,0,0.0,12.5,0.0,18,0,1,6
2,46,0.42,0.0,11.25,0.8,8.75,11,0.75,125.0,8,30.0,0.042,17.0,0,0,1,0
3,114,0.0,28.41,0.0,28.41,0.0,0,0.0,13.9,0,0.0,0.0,0.0,0,0,1,16
4,357,12.5,1.79,75.0,3.6,3.57,0,8.04,0.0,0,1.0,0.0,0.0,0,0,2,11


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=99)

print("X_train total length:",len(X_train))
print("X_test total length:",len(X_test))
print("Y_train total length:",len(y_train))

X_train total length: 15
X_test total length: 7
Y_train total length: 15


In [5]:
model_nb = GaussianNB()
model_svm = svm.SVC()
model_lr = LogisticRegression()
model_knn_centroid = NearestCentroid()
model_knn = KNeighborsClassifier()
eclf = VotingClassifier(estimators=[('lr', model_lr), ('knn_centroid', model_knn_centroid), ('gnb', model_nb), ('svc', model_svm), ('knn', model_knn)],
voting='hard', weights=[1,1,1,1,1])

models = [ model_nb, model_svm, model_lr, model_knn_centroid, model_knn, eclf]
model_names = [ "Naive Bayes", "SVM", "Logistic Regression", "Nearest Neighbors using Centroid", "K-nearest Neighbors", "Ensemble"]
    
best_model = None
best_accuracy = 0
best_preds = None

In [6]:
print("Performance of models")
print("======================")
for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    rmse_nb = mean_squared_error(y_test, preds)
    print("Name:", name)
    print("Accuracy score: ", accuracy)
    print("RMSE: ", rmse_nb) 
    if accuracy >= best_accuracy:
        best_accuracy = accuracy
        best_model = model
        best_preds = preds
        
        
        
print("======================")
print("Best model:",best_model)
print("Best accuracy:",best_accuracy)
print("Best predictions:",best_preds)        

Performance of models
Name: Naive Bayes
Accuracy score:  0.714285714286
RMSE:  0.714285714286
Name: SVM
Accuracy score:  0.714285714286
RMSE:  0.714285714286
Name: Logistic Regression
Accuracy score:  0.571428571429
RMSE:  0.428571428571
Name: Nearest Neighbors using Centroid
Accuracy score:  0.285714285714
RMSE:  0.714285714286
Name: K-nearest Neighbors
Accuracy score:  0.714285714286
RMSE:  0.714285714286
Name: Ensemble
Accuracy score:  0.714285714286
RMSE:  0.714285714286
Best model: VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)), ('knn_centroid...owski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'))],
         n_jobs=1, voting='hard', weights=[1, 1, 1, 1, 1])
Best accuracy: 0.7142857142

In [8]:
results = X_train
X_test = X_test.drop('label',1)
X_test['label'] = best_preds
results.append(X_test)
results.head()

Unnamed: 0,energy,protein,totallipid(fat),carbohydrates,fiber,sugars,calcium,iron,potassium,sodium,vitamin C,vitamin A,fattyacid_saturated,fattyacid_totaltrans,cholesterol,label,fooditem
0,300,2.5,0.0,77.5,5.0,72.5,50,2.7,775.0,25,0.0,0.0,0.0,0,0,2,14
18,0,0.0,0.0,0.0,0.0,106.0,0,0.0,0.0,0,8.6,0.0,0.0,0,0,2,9
13,429,3.75,17.86,64.29,3.6,45.0,71,2.29,179.0,0,0.0,12.5,0.0,18,0,1,18
12,300,2.5,0.0,77.5,5.0,98.0,50,2.7,775.0,25,0.0,0.0,0.0,0,0,2,4
20,383,31.91,10.64,44.68,6.4,23.0,213,1.96,128.0,511,0.0,3169.0,8.5,0,11,1,19


In [9]:
X_d = le.inverse_transform(results['fooditem'])
print(X_d)
results = results.drop('fooditem',1)

results['fooditem'] = X_d

results.head()

['NATURAL CALIFORNIA RAISINS' 'CREAM CHEESE BROWNIES' 'SESAME HALVA'
 'ACIDULATED HARD CANDY LOLLIPOPS' 'SMOKED FISH SALAD' 'ACAI GREEN TEA'
 'SPRING WATER' 'FARFALLE BARILLA PASTA'
 'ORGANIC MEDIUM FRUITY EXTRA VIRGIN OLIVE OIL' 'FARFALLE BARILLA PASTA'
 'FRESH PEACH' 'FROZEN SEASONED SPICY SCALLOP'
 'ACCELERATE MORNING PROTEIN BAR' 'PINEAPPLE DESSERTS'
 'ALMOND COCONUT CARAMELS']


In [32]:
food_bucket = results.loc[(results['label'] == 1 )]
food_bucket = food_bucket.sort_values(['sugars'], ascending=[True])
print(" Low sugar food bucket:", len(food_bucket))
print(" Total food items")
print("============")
food_bucket[['fooditem','sugars']]


 Low sugar food bucket: 10
 Total food items


Unnamed: 0,fooditem,sugars
7,2,0.0
5,15,0.0
3,16,0.0
16,11,2.0
21,20,15.0
9,13,21.43
20,19,23.0
8,3,25.53
13,18,45.0
1,6,50.0


In [29]:
food_bucket = results.loc[(results['label'] == 2)]
food_bucket = food_bucket.sort_values(['sugars'], ascending=[True])
print(" Medium sugar food bucket:", len(food_bucket))
print(" Total food items")
print("============")
food_bucket[['fooditem','sugars']]

 Medium sugar food bucket: 5
 Total food items


Unnamed: 0,fooditem,sugars
4,11,3.57
0,14,72.5
19,12,77.0
12,4,98.0
18,9,106.0


In [30]:
food_bucket = results.loc[(results['label'] == 3)]
food_bucket = food_bucket.sort_values(['sugars'], ascending=[True])
print(" High sugar food bucket:", len(food_bucket))
print(" Total food items")
print("============")
food_bucket[['fooditem','sugars']]

 High sugar food bucket: 0
 Total food items


Unnamed: 0,fooditem,sugars
