# Create Nutrition dataset
This dataset will be used to retrieve nutrition information for food category

In [1]:
#uncomment if google cloud is not installed

#import sys
#!{sys.executable} -m pip install google-cloud-storage

In [2]:
from google.cloud import storage
import pandas as pd
import json
import numpy as np

## Retrieve data from Google cloud

In [3]:
BUCKET_NAME = 'foodygs'
client = storage.Client()
bucket = client.bucket(BUCKET_NAME)

Get the csv files from the Google Cloud storage

In [4]:
my_prefix = "Nutrition/" # the name of the subfolder

def get_nutrition(my_prefix:str):
    """
    download all the files from google cloud storage to raw_data locally
    """
blobs = bucket.list_blobs(prefix = my_prefix, delimiter = '/')

for blob in blobs:
    if(blob.name != my_prefix): # ignoring the subfolder itself 
        file_name = '{}/{}'.format('../raw_data', blob.name.replace(my_prefix, ""))
        blob.download_to_filename(file_name)

Get the JSON file from the Google Cloud storage

In [8]:
def json_train():
    storage_filename_annot = 'foodyai_data/Training_1/annotations.json'
    local_filename_annot = "../raw_data/annotations.json"

    blob = bucket.blob(storage_filename_annot)
    blob.download_to_filename(local_filename_annot)
    
    return local_filename_annot

## Create dataframe 

### JSON file (annotations)

In [11]:
js = open(json_train())
ds_annot = json.load(js)

In [12]:
df_cat = pd.DataFrame(ds_annot['categories'])

In [13]:
df_cat.head()

Unnamed: 0,id,name,name_readable,supercategory
0,1565,bread-wholemeal,"Bread, wholemeal",food
1,2099,jam,Jam,food
2,2578,water,Water,food
3,1556,bread-sourdough,"Bread, sourdough",food
4,1154,banana,Banana,food


### CSV file (nutritions)

In [83]:
df_nut_1 = pd.read_csv('../raw_data/nutrition.csv')
df_nut_2 = pd.read_csv('../raw_data/nutrition2.csv')
df_nut_3 = pd.read_table('../raw_data/nutrition3.tsv')

  df_nut_3 = pd.read_table('../raw_data/nutrition3.tsv')


In [85]:
df_nut_3.head()

Unnamed: 0,code,url,creator,created_t,created_datetime,last_modified_t,last_modified_datetime,product_name,generic_name,quantity,...,fruits-vegetables-nuts_100g,fruits-vegetables-nuts-estimate_100g,collagen-meat-protein-ratio_100g,cocoa_100g,chlorophyl_100g,carbon-footprint_100g,nutrition-score-fr_100g,nutrition-score-uk_100g,glycemic-index_100g,water-hardness_100g
0,3087,http://world-en.openfoodfacts.org/product/0000...,openfoodfacts-contributors,1474103866,2016-09-17T09:17:46Z,1474103893,2016-09-17T09:18:13Z,Farine de blé noir,,1kg,...,,,,,,,,,,
1,4530,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489069957,2017-03-09T14:32:37Z,1489069957,2017-03-09T14:32:37Z,Banana Chips Sweetened (Whole),,,...,,,,,,,14.0,14.0,,
2,4559,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489069957,2017-03-09T14:32:37Z,1489069957,2017-03-09T14:32:37Z,Peanuts,,,...,,,,,,,0.0,0.0,,
3,16087,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489055731,2017-03-09T10:35:31Z,1489055731,2017-03-09T10:35:31Z,Organic Salted Nut Mix,,,...,,,,,,,12.0,12.0,,
4,16094,http://world-en.openfoodfacts.org/product/0000...,usda-ndb-import,1489055653,2017-03-09T10:34:13Z,1489055653,2017-03-09T10:34:13Z,Organic Polenta,,,...,,,,,,,,,,


## Create new dataframe
Contains only category present in the JSON file

In [84]:
cat_csv_1 = list(df_nut_1['name'])
cat_csv_2 = list(df_nut_2['Shrt_Desc'])
cat_csv_3 = list(df_nut_3['product_name'])

In [16]:
#cat_json are all the categories available for our classification task
cat_json = list(df_cat['name_readable'])
print(f'There are {len(cat_json)} categories in our classification task')

There are 498 categories in our classification task


In [17]:
#for each cat, keep the name before the comma for simplicity
def split(lst:list):
    """
    give a list of food name and extract only the first part of the name
    before the comma
    lower case the name
    """
    lst_clean = []
    
    for i in lst:
        lst_clean.append(i.split(',')[0].lower())
    
    return lst_clean

In [86]:
def arr(lst:list):
    """
    from a list of words create a list of array of each word separated by a comma
    and lower each word
    """
    
    lst_arr=[]
    
    for i in lst:
        #strip = str(i).replace(' ','')
        lst_arr.append(str(i).lower().split(' '))
        
    return lst_arr

In [19]:
#create the array from function above
cat_csv_array_1 = arr(cat_csv_1)
cat_csv_array_2 = arr(cat_csv_2)

In [87]:
cat_csv_array_3 = arr(cat_csv_3)

In [88]:
#concatenate the 2 lists to get more category --> more chance to find similarities
cat_csv_array = cat_csv_array_1 + cat_csv_array_2 + cat_csv_array_3

In [91]:
len(cat_csv_array)

373606

In [24]:
#do the same for the json file
cat_json_clean = split(cat_json)
cat_json_array = arr(cat_json)

In [25]:
#create a clean dataset from the json category --> to be used for nutrition values
df_nut = pd.DataFrame(data=cat_json, columns=['category'])
df_nut['category_clean'] = cat_json_clean
df_nut['category_array'] = cat_json_array
df_nut.head()

Unnamed: 0,category,category_clean,category_array
0,"Bread, wholemeal",bread,"[bread, wholemeal]"
1,Jam,jam,[jam]
2,Water,water,[water]
3,"Bread, sourdough",bread,"[bread, sourdough]"
4,Banana,banana,[banana]


## Get categories
Get similar categories between JSON file and CSV (nutritions)

In [89]:
# compare list array from json to list array from csv
# if all elements array json exist in array csv 
# and len(array json)/len(array csv) is between 0.5 and 1
# then append to new list
lst_available_csv = []
lst_available_json = []
for i in cat_json_array:
    for j in cat_csv_array:
        if all(k in j for k in i)==True:
            if len(i)==1 and 'raw' in j:
                lst_available_csv.append(j)
                lst_available_json.append(i)
            elif len(i)>1 and 0.2<=len(i)/len(j)<=1:
                lst_available_csv.append(j)
                lst_available_json.append(i)
            break

In [90]:
len(lst_available_json)

25

In [79]:
lst_available_csv

[['pepper', 'banana', 'raw'],
 ['juice',
  ' with added ascorbic acid and calcium',
  ' grape and pear blend',
  ' apple'],
 ['soup', ' condensed', ' low sodium', ' canned', ' vegetable'],
 ['ham', ' cooked', ' smoked', ' honey'],
 ['squash', 'smmr', 'zucchini', 'incl skn', 'raw'],
 ['cake', ' dry mix', ' chocolate', ' pudding-type'],
 ['applesauce', ' with added ascorbic acid', ' unsweetened', ' canned'],
 ['bread', ' enriched', ' white', ' pita'],
 ['tuna', 'fresh', 'bluefin', 'raw'],
 ['cod', 'atlantic', 'raw'],
 ['chestnuts', 'chinese', 'raw'],
 ['bread', ' whole wheat', ' french or vienna'],
 ['chicken', ' barbecue flavored', ' glazed', ' frozen', ' wing'],
 ['bread', ' enriched', ' white', ' pita'],
 ['sauce', ' refrigerated', ' ready-to-serve', ' pesto'],
 ['spinach', ' raw'],
 ['chicken', ' raw', ' meat and skin', ' breast', ' broilers or fryers'],
 ['beans', ' raw', ' mature seeds', ' red', ' kidney'],
 ['bread', ' toasted', ' rye'],
 ['soup', ' condensed', ' canned', ' tomato

In [80]:
lst_available_json

[['banana'],
 ['juice', ' apple'],
 ['soup', ' vegetable'],
 ['ham', ' cooked'],
 ['zucchini'],
 ['cake', ' chocolate'],
 ['applesauce', ' unsweetened', ' canned'],
 ['bread', ' white'],
 ['tuna'],
 ['cod'],
 ['chestnuts'],
 ['bread', ' whole wheat'],
 ['chicken', ' wing'],
 ['bread', ' pita'],
 ['sauce', ' pesto'],
 ['spinach', ' raw'],
 ['chicken', ' breast'],
 ['beans', ' kidney'],
 ['bread', ' rye'],
 ['soup', ' tomato'],
 ['chicken', ' leg'],
 ['sauce', ' cocktail'],
 ['beans', ' white'],
 ['cake', ' marble'],
 ['beef', ' roast']]

This process is not accurate
<br>
<br>
Here is an example:
<br>
JSON category = 'Avocado'
<br>
CSV category = 'raw avocado' and 'oil avocado'
<br>
<br>
'raw avocado' is obviously the good one. But how to tell the algo without using NLP?