In [None]:
from pathlib import Path
from pprint import pprint
from types import SimpleNamespace
from IPython.display import display
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# project files stored there can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# Location of the mounted Drive, relative to the current working directory.
# Change this to the drive or root folder that holds your project.
drive_path = "./drive/MyDrive"

# Project root folder inside the Drive.
root_dir = os.path.join(drive_path, "FoodCaloModel")

# Report whether the project root is present. isdir() is already False for a
# path that does not exist, so it covers the existence check as well.
if not os.path.isdir(root_dir):
    print("Không tìm thấy thư mục gốc 'FoodCaloModel'. Vui lòng kiểm tra lại đường dẫn của bạn.")
else:
    print(f"Thư mục gốc 'FoodCaloModel' nằm ở: {root_dir}")


Thư mục gốc 'FoodCaloModel' nằm ở: ./drive/MyDrive/FoodCaloModel


### Google Universal Sentence Encoder for sentence embeddings

In [None]:
# TF Hub handle of the Universal Sentence Encoder, version 4.
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)


def embed(input):
    """Run the loaded encoder on ``input`` and return its output unchanged.

    In this notebook it is called with a list of strings; the demo cell shows
    a list of two phrases producing a float32 tensor of shape (2, 512).

    Note: the parameter name shadows the ``input`` builtin inside this
    function; it is kept because it is part of the call signature.
    """
    embeddings = model(input)
    return embeddings


if __name__ == "__main__":
    # Only announce the load when the notebook runs as the main program.
    print("module %s loaded" % (module_url,))

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


### Create embeddings for all items in nutrition database

In [None]:
if __name__ == "__main__":
    # Smoke test: embed two sample phrases and show the resulting tensor.
    display(embed(["poultry and chicken", "i love milk"]))

<tf.Tensor: shape=(2, 512), dtype=float32, numpy=
array([[ 0.00842836,  0.0064201 ,  0.05519095, ...,  0.00923468,
         0.0458016 , -0.04145263],
       [ 0.04911731, -0.00836502,  0.02530874, ..., -0.03663807,
         0.02890903, -0.07283901]], dtype=float32)>

In [None]:
# root_dir is built as a plain string by an earlier cell; wrap it in Path
# (harmless if it already is one) so the "/" operator can be used below.
# Path itself is imported in the notebook's first cell.
root_dir = Path(root_dir)

# Directory holding the cleaned USDA FNDDS files.
fndds_dir = root_dir / "Food Datasets" / "USDA-FNDDS"

# resolve(strict=True) raises FileNotFoundError early if the file is missing.
fndds_category_path = (fndds_dir / "cleaned_food_category.txt").resolve(strict=True)

# One category name per line. Blank lines are filtered out instead of blindly
# popping the last element: that way the final category is kept when the file
# has no trailing newline, and stray empty lines never become categories.
# Lines are otherwise left untouched so they still compare equal to the
# "WWEIA Category description" values they are matched against later.
fndds_category = [
    line
    for line in fndds_category_path.read_text(encoding="utf-8").split("\n")
    if line.strip()
]


In [None]:
if __name__ == "__main__":
    # Show the category names that were read from cleaned_food_category.txt.
    pprint(fndds_category)

['fried rice and lo mein or chow mein',
 'citrus fruits',
 'processed soy products',
 'other vegetables and combinations',
 'dried fruits',
 'beans and peas and legumes',
 'soft drinks',
 'frankfurters',
 'apple juice',
 'pasta and noodles and cooked grains',
 'turkey and duck and other poultry',
 'nachos',
 'macaroni and cheese',
 'liver and organ meats',
 'apples',
 'cabbage',
 'pancakes and waffles and french toast',
 'pizza',
 'poultry mixed dishes',
 'lamb and goat and game',
 'frankfurter sandwiches',
 'tortilla and corn and other chips',
 'margarine',
 'soy-based condiments',
 'jams and syrups and toppings',
 'ice cream and frozen dairy desserts',
 'cottage cheese or ricotta cheese',
 'saltine crackers',
 'regular yogurt',
 'pork',
 'other fruits and fruit salads',
 'onions',
 'nuts and seeds',
 'rolls and buns',
 'broccoli',
 'chicken in whole pieces',
 'lowfat milk',
 'mustard and other condiments',
 'grapes',
 'blueberries and other berries',
 'nonfat milk',
 'rice mixed dish

In [None]:
# Load the per-food nutrient table. The file is parsed as tab-separated
# (sep="\t") even though it carries a .csv extension; resolve(strict=True)
# raises FileNotFoundError if the file is missing.
df_fndds_nutrient_values = pd.read_csv(
    (fndds_dir / "cleaned_fndds_nutrient_values.csv").resolve(strict=True), sep="\t"
)

In [None]:
if __name__ == "__main__":
    # Render the nutrient table when the notebook runs as the main program.
    display(df_fndds_nutrient_values)

Unnamed: 0,Main food description,WWEIA Category description,Energy (kcal),Protein (g),Carbohydrate (g),Total Fat (g)
0,"milk, not further specified",reduced fat milk,0.51,0.0334,0.0487,0.0199
1,"milk, whole",whole milk,0.60,0.0328,0.0467,0.0320
2,"milk, low sodium, whole",whole milk,0.61,0.0310,0.0446,0.0346
3,"milk, calcium fortified, whole",whole milk,0.60,0.0328,0.0467,0.0320
4,"milk, calcium fortified, low fat",lowfat milk,0.43,0.0338,0.0519,0.0095
...,...,...,...,...,...,...
6085,gin,liquor and cocktails,2.31,0.0000,0.0000,0.0000
6086,rum,liquor and cocktails,2.31,0.0000,0.0000,0.0000
6087,rum cooler,liquor and cocktails,0.68,0.0000,0.1007,0.0000
6088,vodka,liquor and cocktails,2.31,0.0000,0.0000,0.0000


In [None]:
# Embed every category name in a single call. The lookup code further down
# uses a row position in this result as an index into fndds_category.
fndds_category_embeddings = embed(fndds_category)

In [None]:
# Per-category lookup tables, both keyed by category name:
#   fndds_description[category]            -> list of food descriptions
#   fndds_description_embeddings[category] -> embeddings of that same list
fndds_description = {}
fndds_description_embeddings = {}
for category in fndds_category:
    # Descriptions of every food whose WWEIA category equals this one,
    # in table order.
    all_food_in_category = df_fndds_nutrient_values.loc[
        df_fndds_nutrient_values["WWEIA Category description"] == category,
        "Main food description",
    ].tolist()
    fndds_description[category] = all_food_in_category
    fndds_description_embeddings[category] = embed(all_food_in_category)

In [None]:
if __name__ == "__main__":
    # Print the category embedding tensor for a quick visual check.
    print("Below is an overview of FNDDS category embeddings\n")
    pprint(fndds_category_embeddings)

Below is an overview of FNDDS category embeddings

<tf.Tensor: shape=(134, 512), dtype=float32, numpy=
array([[-0.02513325, -0.03357036,  0.05087716, ..., -0.03891268,
        -0.04701412, -0.07346293],
       [ 0.03919313,  0.01966225, -0.03255866, ...,  0.02130345,
         0.05326367, -0.00397852],
       [ 0.00456416, -0.03524939,  0.01208728, ...,  0.06171183,
        -0.04190361, -0.06769847],
       ...,
       [ 0.01955689,  0.03285935,  0.06863839, ..., -0.01266976,
         0.07735244, -0.06838692],
       [-0.04366885,  0.04003096,  0.0333304 , ..., -0.00228094,
         0.0300426 , -0.05462102],
       [-0.03297438,  0.0342321 ,  0.04971401, ...,  0.04214758,
        -0.01720789, -0.04030016]], dtype=float32)>


In [None]:
def get_cosine_similarity(matrix_embedding, target_vector_embedding):
    """Score each row of ``matrix_embedding`` against ``target_vector_embedding``.

    Returns the matrix-vector product, i.e. one inner product per row.

    NOTE(review): no normalisation is applied here, so the result equals the
    cosine similarity only when both operands are unit-norm. Presumably this
    relies on the encoder output being (approximately) normalised -- confirm.
    """
    return tf.linalg.matvec(matrix_embedding, target_vector_embedding)


def get_most_similar_from_fndds(target_vector_embedding):
    """Find the FNDDS food that scores highest against an embedding.

    The search is two-stage: first the best-scoring category is picked from
    ``fndds_category_embeddings``, then the best-scoring food description is
    picked among that category's entries in ``fndds_description_embeddings``.

    Returns:
        A ``(value, index, category)`` tuple where ``value`` is the score of
        the winning description, ``index`` is its position within the
        category's description list, and ``category`` is the category name.
    """

    def _best_match(embeddings):
        # top_k with k=1 yields one-element tensors; flatten each to a single
        # element and pull it out as a NumPy scalar.
        scores = get_cosine_similarity(embeddings, target_vector_embedding)
        top_score, top_position = tf.math.top_k(scores, k=1)
        return (
            tf.reshape(top_score, [1])[0].numpy(),
            tf.reshape(top_position, [1])[0].numpy(),
        )

    # Stage 1: only the position of the best category is needed.
    _, category_index = _best_match(fndds_category_embeddings)
    category = get_category_from_fndds(category_index)

    # Stage 2: best description inside that category.
    value, index = _best_match(fndds_description_embeddings[category])
    return (value, index, category)



def get_category_from_fndds(index):
    """Return the category name stored at position ``index`` of ``fndds_category``."""
    category_name = fndds_category[index]
    return category_name


def get_ingredient_nutrient_from_fndds(category, index):
    """Return the nutrient row of the ``index``-th food in ``category``.

    The food name is taken from ``fndds_description[category][index]`` and
    looked up in ``df_fndds_nutrient_values``.

    The lookup matches on both the description and the category. Matching on
    the description alone would pull in rows from other categories whenever
    the same description occurs more than once, and ``squeeze()`` would then
    hand back a DataFrame instead of a single row.

    Returns:
        A pandas Series holding the matching row. If several rows of the same
        category share the description, the first one is returned. With zero
        or one match the result is whatever ``DataFrame.squeeze()`` yields,
        as before.
    """
    ingredient_name = fndds_description[category][index]
    same_name = df_fndds_nutrient_values["Main food description"] == ingredient_name
    same_category = df_fndds_nutrient_values["WWEIA Category description"] == category
    result = df_fndds_nutrient_values.loc[same_name & same_category, :]
    if len(result) > 1:
        # Duplicate rows cannot be told apart by name; pick the first so the
        # caller always receives one row.
        return result.iloc[0]
    return result.squeeze()

In [None]:
# Helpers this notebook makes available to importers, keyed by function name.
exported = dict(
    (fn.__name__, fn)
    for fn in (
        get_ingredient_nutrient_from_fndds,
        get_most_similar_from_fndds,
        embed,
    )
)

In [None]:
# Expose the helpers as attributes (exported.embed, ...) instead of dict keys.
# The isinstance guard keeps this cell re-runnable: once `exported` is already
# a SimpleNamespace, unpacking it with ** again would raise TypeError.
if not isinstance(exported, SimpleNamespace):
    exported = SimpleNamespace(**exported)

In [None]:
if __name__ != "__main__":
    # Only reached when this notebook's code is executed under a module name
    # other than "__main__", i.e. when it is imported rather than run directly.
    print("Module ingredient_embeddings_similarity.ipynb is loaded")