In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [2]:
# Read the food nutrition table from the CSV file into a DataFrame and show it
# as the cell output so the column layout can be inspected before any slicing.
data = pd.read_csv(filepath_or_buffer='food_data.csv')
data

Unnamed: 0,Food,Calories,Calories from Fat,Total Fat,Total Fat.1,Sodium,Sodium.1,Potassium,Potassium.1,Total Carbo-hydrate,Total Carbo-hydrate.1,Protein,Vitamin A,Vitamin C,Calcium,Iron,Food Type
0,Asparagus,20,0,0.0,0,0,0,230,7,4,1,2,10,15,2,2,Vegetables
1,Bell Pepper,25,0,0.0,0,40,2,220,6,6,2,1,4,190,2,4,Vegetables
2,Broccoli,45,0,0.5,1,80,3,460,13,8,3,4,6,220,6,6,Vegetables
3,Carrot,30,0,0.0,0,60,3,250,7,7,2,1,110,10,2,2,Vegetables
4,Cauliflower,25,0,0.0,0,30,1,270,8,5,2,2,0,100,2,2,Vegetables
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,Scallops,140,10,1.0,2,310,13,430,12,5,2,27,2,0,4,14,Seafood
57,Shrimp,100,10,1.5,2,240,10,220,6,0,0,21,4,4,6,10,Seafood
58,Swordfish,120,50,6.0,9,100,4,310,9,0,0,16,2,2,0,6,Seafood
59,Tilapia,110,20,2.5,4,30,1,360,10,0,0,22,0,2,0,2,Seafood


In [12]:
# Re-label the rows with the food names held in column 0, so every slice taken
# below (and the later train/test split) is indexed by food name.
data.index = data.iloc[:, 0]

# Target labels: the column at position 16 ("Food Type" in the table above).
data_to_target = data.iloc[:, 16]

# Features: positions 3 through 14 ("Total Fat" ... "Calcium"). The slice end
# is exclusive, so position 15 ("Iron") is left out, as are positions 1-2
# ("Calories", "Calories from Fat").
# NOTE(review): confirm that leaving "Iron" out of the features is intentional.
data_to_use = data.iloc[:, slice(3, 15)]

data.index

Index(['Asparagus', 'Bell Pepper', 'Broccoli', 'Carrot', 'Cauliflower',
       'Celery', 'Cucumber', 'Green (Snap) Beans', 'Green Cabbage',
       'Green Onion', 'Iceberg Lettuce', 'Leaf Lettuce', 'Mushrooms', 'Onion',
       'Potato', 'Radishes', 'Summer Squash', 'Sweet Corn', 'Sweet Potato',
       'Tomato', 'Apple', 'Avocado', 'Banana', 'Cantaloupe', 'Grapefruit',
       'Grapes', 'Honeydew Melon', 'Kiwifruit', 'Lemon', 'Lime', 'Nectarine',
       'Orange', 'Peach', 'Pear', 'Pineapple', 'Plums', 'Strawberries',
       'Sweet Cherries', 'Tangerine', 'Watermelon', 'Blue Crab', 'Catfish',
       'Clams', 'Cod', 'Flounder/Sole', 'Haddock', 'Halibut', 'Lobster',
       'Ocean Perch', 'Orange Roughy', 'Oysters', 'Pollock', 'Rainbow Trout',
       'Rockfish', 'Salmon, Atlantic/Coho/Sockeye /Chinook', 'Salmon, Pink',
       'Scallops', 'Shrimp', 'Swordfish', 'Tilapia', 'Tuna'],
      dtype='object', name='Food')

In [13]:
# Display the feature columns that will be fed to the classifier.
data_to_use

Unnamed: 0_level_0,Total Fat,Total Fat.1,Sodium,Sodium.1,Potassium,Potassium.1,Total Carbo-hydrate,Total Carbo-hydrate.1,Protein,Vitamin A,Vitamin C,Calcium
Food,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Asparagus,0.0,0,0,0,230,7,4,1,2,10,15,2
Bell Pepper,0.0,0,40,2,220,6,6,2,1,4,190,2
Broccoli,0.5,1,80,3,460,13,8,3,4,6,220,6
Carrot,0.0,0,60,3,250,7,7,2,1,110,10,2
Cauliflower,0.0,0,30,1,270,8,5,2,2,0,100,2
...,...,...,...,...,...,...,...,...,...,...,...,...
Scallops,1.0,2,310,13,430,12,5,2,27,2,0,4
Shrimp,1.5,2,240,10,220,6,0,0,21,4,4,6
Swordfish,6.0,9,100,4,310,9,0,0,16,2,2,0
Tilapia,2.5,4,30,1,360,10,0,0,22,0,2,0


In [14]:
# Display the target labels (the "Food Type" value for each food).
data_to_target

Food
Asparagus      Vegetables
Bell Pepper    Vegetables
Broccoli       Vegetables
Carrot         Vegetables
Cauliflower    Vegetables
                  ...    
Scallops          Seafood
Shrimp            Seafood
Swordfish         Seafood
Tilapia           Seafood
Tuna              Seafood
Name: Food Type, Length: 61, dtype: object

In [6]:
# Hold out 30% of the foods for evaluation and train on the remaining 70%.
# The fixed random_state makes the shuffle reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    data_to_use,
    data_to_target,
    test_size=0.3,
    random_state=21,
)


In [7]:
# Create a Gaussian Naive Bayes classifier with default settings (not yet fitted).
gaussian_model = GaussianNB()

In [8]:
# Fit the classifier on the training split (features X_train, labels y_train).
gaussian_model.fit(X_train, y_train)

In [17]:
# Predict a food type for every row of the held-out test features, then
# display the resulting array of labels.
prediction = gaussian_model.predict(X_test)
prediction

array(['Vegetables', 'Seafood', 'Fruits', 'Fruits', 'Fruits',
       'Vegetables', 'Fruits', 'Vegetables', 'Vegetables', 'Seafood',
       'Seafood', 'Vegetables', 'Vegetables', 'Fruits', 'Vegetables',
       'Fruits', 'Seafood', 'Vegetables', 'Vegetables'], dtype='<U10')

In [15]:
# Pair each held-out food's true label with the label the model predicted.
# y_test is a Series indexed by food name and prediction is a plain array in
# the same row order, so the resulting frame is indexed by food name.
result = dict(food_type=y_test, pred=prediction)
result_df = pd.DataFrame(data=result)
result

{'food_type': Food
 Cantaloupe                Fruits
 Oysters                  Seafood
 Sweet Corn            Vegetables
 Banana                    Fruits
 Broccoli              Vegetables
 Tomato                Vegetables
 Orange                    Fruits
 Green (Snap) Beans    Vegetables
 Lemon                     Fruits
 Pollock                  Seafood
 Avocado                   Fruits
 Grapefruit                Fruits
 Honeydew Melon            Fruits
 Peach                     Fruits
 Potato                Vegetables
 Nectarine                 Fruits
 Shrimp                   Seafood
 Carrot                Vegetables
 Leaf Lettuce          Vegetables
 Name: Food Type, dtype: object,
 'pred': array(['Vegetables', 'Seafood', 'Fruits', 'Fruits', 'Fruits',
        'Vegetables', 'Fruits', 'Vegetables', 'Vegetables', 'Seafood',
        'Seafood', 'Vegetables', 'Vegetables', 'Fruits', 'Vegetables',
        'Fruits', 'Seafood', 'Vegetables', 'Vegetables'], dtype='<U10')}

In [16]:
# Display true vs. predicted food type side by side for the test foods.
result_df

Unnamed: 0_level_0,food_type,pred
Food,Unnamed: 1_level_1,Unnamed: 2_level_1
Cantaloupe,Fruits,Vegetables
Oysters,Seafood,Seafood
Sweet Corn,Vegetables,Fruits
Banana,Fruits,Fruits
Broccoli,Vegetables,Fruits
Tomato,Vegetables,Vegetables
Orange,Fruits,Fruits
Green (Snap) Beans,Vegetables,Vegetables
Lemon,Fruits,Vegetables
Pollock,Seafood,Seafood


In [11]:
# Report the share of held-out foods whose predicted type equals the true type.
print(
    "Accuracy:",
    metrics.accuracy_score(y_true=y_test, y_pred=prediction),
)

Accuracy: 0.631578947368421
