# Create Nutrition dataset
This dataset will be used to retrieve nutrition information for food category

In [5]:
#uncomment if google cloud is not installed

#import sys
#!{sys.executable} -m pip install google-cloud-storage



In [1]:
from google.cloud import storage
import pandas as pd
import json
import numpy as np

## Retrieve data from Google cloud

In [3]:
BUCKET_NAME = 'foodygs'
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)

Get the csv files from the Google Cloud storage

In [147]:
my_prefix = "Nutrition/" # the name of the subfolder

def get_nutrition(my_prefix:str):
    """
    download all the files from google cloud storage to raw_data locally
    """
blobs = bucket.list_blobs(prefix = my_prefix, delimiter = '/')

    for blob in blobs:
        if(blob.name != my_prefix): # ignoring the subfolder itself 
            file_name = '{}/{}'.format('../raw_data', blob.name.replace(my_prefix, ""))
            blob.download_to_filename(file_name)

Get the JSON file from the Google Cloud storage

In [5]:
def json_train():
    storage_filename_annot = 'foodyai_data/Training_1/annotations.json'
    local_filename_annot = "../raw_data/annotations.json"

    blob = bucket.blob(storage_filename_annot)
    blob.download_to_filename(local_filename_annot)

## Create dataframe 

### JSON file (annotations)

In [8]:
js = open(local_filename_annot)
ds_annot = json.load(js)

In [35]:
df_cat = pd.DataFrame(ds_annot['categories'])

In [36]:
df_cat.head()

Unnamed: 0,id,name,name_readable,supercategory
0,1565,bread-wholemeal,"Bread, wholemeal",food
1,2099,jam,Jam,food
2,2578,water,Water,food
3,1556,bread-sourdough,"Bread, sourdough",food
4,1154,banana,Banana,food


### CSV file (nutritions)

In [191]:
df_nut_1 = pd.read_csv('../raw_data/nutrition.csv')
df_nut_2 = pd.read_csv('../raw_data/nutrition2.csv')

In [192]:
df_nut_2.head()
#len(df_nut_3)

Unnamed: 0,index,NDB_No,Shrt_Desc,Water_(g),Energ_Kcal,Protein_(g),Lipid_Tot_(g),Ash_(g),Carbohydrt_(g),Fiber_TD_(g),...,Vit_K_(µg),FA_Sat_(g),FA_Mono_(g),FA_Poly_(g),Cholestrl_(mg),GmWt_1,GmWt_Desc1,GmWt_2,GmWt_Desc2,Refuse_Pct
0,0,1001,"BUTTER,WITH SALT",15.87,717,0.85,81.11,2.11,0.06,0.0,...,7.0,51.368,21.021,3.043,215.0,5.0,"1 pat, (1"" sq, 1/3"" high)",14.2,1 tbsp,0.0
1,1,1002,"BUTTER,WHIPPED,W/ SALT",16.72,718,0.49,78.3,1.62,2.87,0.0,...,4.6,45.39,19.874,3.331,225.0,3.8,"1 pat, (1"" sq, 1/3"" high)",9.4,1 tbsp,0.0
2,2,1003,"BUTTER OIL,ANHYDROUS",0.24,876,0.28,99.48,0.0,0.0,0.0,...,8.6,61.924,28.732,3.694,256.0,12.8,1 tbsp,205.0,1 cup,0.0
3,3,1004,"CHEESE,BLUE",42.41,353,21.4,28.74,5.11,2.34,0.0,...,2.4,18.669,7.778,0.8,75.0,28.35,1 oz,17.0,1 cubic inch,0.0
4,4,1005,"CHEESE,BRICK",41.11,371,23.24,29.68,3.18,2.79,0.0,...,2.5,18.764,8.598,0.784,94.0,132.0,"1 cup, diced",113.0,"1 cup, shredded",0.0


## Create new dataframe
Contains only category present in the JSON file

In [193]:
cat_csv_1 = list(df_nut_1['name'])
cat_csv_2 = list(df_nut_2['Shrt_Desc'])

In [152]:
#cat_json are all the categories available for our classification task
cat_json = list(df_cat['name_readable'])
print(f'There are {len(cat_json)} categories in our classification task')

There are 498 categories in our classification task


In [42]:
#for each cat, keep the name before the comma for simplicity
def split(lst:list):
    """
    give a list of food name and extract only the first part of the name
    before the comma
    lower case the name
    """
    lst_clean = []
    
    for i in lst:
        lst_clean.append(i.split(',')[0].lower())
    
    return lst_clean

In [180]:
def arr(lst:list):
    """
    from a list of words create a list of array of each word separated by a comma
    and lower each word
    """
    
    lst_arr=[]
    
    for i in lst:
        strip = str(i).replace(' ','')
        lst_arr.append(str(i).lower().split(','))
        
    return lst_arr

In [194]:
#cat_json_array = arr(cat_json)
cat_csv_array_1 = arr(cat_csv_1)
cat_csv_array_2 = arr(cat_csv_2)

In [195]:
cat = cat_csv_array_1 + cat_csv_array_2

In [43]:
cat_json_clean = split(cat_json)
cat_csv_clean = split(cat_csv)

In [203]:
df_nut = pd.DataFrame(data=cat_json, columns=['category'])
df_nut['category_clean'] = cat_json_clean
df_nut['category_array'] = cat_json_array
df_nut.head()

Unnamed: 0,category,category_clean,category_array
0,"Bread, wholemeal",bread,"[bread,wholemeal]"
1,Jam,jam,[jam]
2,Water,water,[water]
3,"Bread, sourdough",bread,"[bread,sourdough]"
4,Banana,banana,[banana]


In [45]:
#keep only the columns that we need
df_nut_csv = df_nut_1
df_nut_csv['category_clean'] = cat_csv_clean
#list of nutrition fact we want + name and serving size
nut = ['name','serving_size','calories','total_fat','saturated_fat',
       'cholesterol','sodium','protein','carbohydrate', 'fiber','category_clean']
df_nut_csv = df_nut_csv[nut]
df_nut_csv.head()

Unnamed: 0,name,serving_size,calories,total_fat,saturated_fat,cholesterol,sodium,protein,carbohydrate,fiber,category_clean
0,Cornstarch,100 g,381,0.1g,,0,9.00 mg,0.26 g,91.27 g,0.9 g,cornstarch
1,"Nuts, pecans",100 g,691,72g,6.2g,0,0.00 mg,9.17 g,13.86 g,9.6 g,nuts
2,"Eggplant, raw",100 g,25,0.2g,,0,2.00 mg,0.98 g,5.88 g,3.0 g,eggplant
3,"Teff, uncooked",100 g,367,2.4g,0.4g,0,12.00 mg,13.30 g,73.13 g,8.0 g,teff
4,"Sherbet, orange",100 g,144,2g,1.2g,1mg,46.00 mg,1.10 g,30.40 g,1.3 g,sherbet


## Get categories
Get similar categories between JSON file and CSV (nutritions)

In [214]:
#compare list array from json to list array from csv
# if all elements array json exist in array csv then append to new list
lst_available_csv = []
lst_available_json = []
for i in cat_json_array:
    for j in cat:
        if all(k in j for k in i)==True:
            lst_available_csv.append(j)
            lst_available_json.append(i)
            break

In [215]:
len(lst_available_json)

112

This process is not accurate