In [1]:
from dotenv import load_dotenv
load_dotenv('../.vscode/.env')

True

In [2]:
import json
with open('../data/generated_recipe_v3/gpt4_turbo_recipes.json') as f:
    gpt4_recipe = json.load(f)

In [5]:
from typing import  List, Optional
from pydantic import BaseModel
class Nutrition(BaseModel):
    """Macronutrient summary of a recipe; every field is optional free text."""
    # Values are kept as strings exactly as loaded; a missing entry stays None.
    fat: Optional[str] = None
    protein: Optional[str] = None
    carbohydrate: Optional[str] = None

class Recipe(BaseModel):
    """One generated recipe as loaded from the recipes JSON file."""
    # Identifier is a string; it is compared against string ids when filtering.
    id: str
    recipe_name:str
    # Raw ingredient dicts; later cells read the 'name' and 'quantity' keys.
    ingredients: List[dict]
    directions: str
    nutrition: Nutrition
    # Stored as text, not as a number.
    total_calories_estimation: str  

class Recipes(BaseModel):
    """Container model holding a list of Recipe objects (empty by default)."""
    recipes: List[Recipe] = []

In [6]:
recipes = Recipes(**gpt4_recipe)

In [7]:
required_recipe_ids = ['27','29', '31','62','63','64']

In [8]:
# Keep only the recipes whose id is listed in required_recipe_ids,
# preserving the order in which they appear in `recipes`.
required_recipes = Recipes()
for recipe in recipes.recipes:
    if recipe.id in required_recipe_ids:
        required_recipes.recipes.append(recipe)

In [9]:
required_recipes

Recipes(recipes=[Recipe(id='63', recipe_name='Low-Cholesterol Vanilla Ice Cream', ingredients=[{'name': 'unsweetened almond milk', 'quantity': '2 cups'}, {'name': 'coconut cream', 'quantity': '1 cup'}, {'name': 'pure vanilla extract', 'quantity': '2 teaspoons'}, {'name': 'maple syrup', 'quantity': '1/2 cup'}, {'name': 'cornstarch', 'quantity': '2 tablespoons'}, {'name': 'salt', 'quantity': '1/4 teaspoon'}], directions="1. Mix Ingredients: In a medium saucepan, combine the almond milk, coconut cream, maple syrup, cornstarch, and salt. Whisk them together until the cornstarch is fully dissolved.\n2. Cook: Place the saucepan over medium heat. Cook the mixture, stirring constantly, until it begins to thicken slightly and just begins to bubble. Then remove from heat and stir in vanilla extract.\n3. Chill: Transfer the mixture to a bowl and cover it with plastic wrap, pressing the wrap directly onto the surface to prevent a skin from forming. Chill in the refrigerator for at least 4 hours, u

In [80]:
import json
import logging
import os

import openai

from typing import Optional
from IPython.display import display, Markdown
from tenacity import retry, wait_random_exponential, stop_after_attempt

logging.basicConfig(level=logging.INFO, format=' %(asctime)s - %(levelname)s - %(message)s')

OPENAI_MODEL = 'gpt-4-turbo-2024-04-09'

In [72]:
# Entity type names interpolated into the system prompt and passed to
# generate_functions when calling the model.
labels = [
    "person",      # people, including fictional characters
    "fac",         # buildings, airports, highways, bridges
    "org",         # organizations, companies, agencies, institutions
    "gpe",         # geopolitical entities like countries, cities, states
    "loc",         # non-gpe locations
    "product",     # vehicles, foods, apparel, appliances, software, toys
    "event",       # named sports, scientific milestones, historical events
    "work_of_art", # titles of books, songs, movies
    "law",         # named laws, acts, or legislations
    "language",    # any named language
    "date",        # absolute or relative dates or periods
    "time",        # time units smaller than a day
    "percent",     # percentage (e.g., "twenty percent", "18%")
    "money",       # monetary values, including unit
    "quantity",    # measurements, e.g., weight or distance
]

In [73]:
def system_message(labels):
    """Build the system prompt, listing `labels` as the only allowed entity types.

    `labels` is any iterable of strings; they are joined with ", ".
    """
    allowed_types = ", ".join(labels)
    return (
        "\n"
        "You are an expert in Natural Language Processing. Your task is to identify common Named Entities (NER) in a given text.\n"
        f"The possible common Named Entities (NER) types are exclusively: ({allowed_types})."
    )


In [74]:
def assisstant_message():
    """Return the fixed one-shot example (ingredient list -> "product" entities)."""
    example_lines = [
        "",
        "EXAMPLE:",
        "    Text: '- unsweetened almond milk: 2 cups",
        "- coconut cream: 1 cup",
        "- pure vanilla extract: 2 teaspoons",
        "- maple syrup: 1/2 cup",
        "- cornstarch: 2 tablespoons",
        "- salt: 1/4 teaspoon'",
        "    {",
        '        "product": ["almond milk","coconut cream", "vanilla extract", "maple syrup", "cornstarch", "salt"],',
        "    }",
        "--",
    ]
    # Joining with newlines reproduces the original multi-line literal exactly
    # (leading newline, no trailing newline).
    return "\n".join(example_lines)

In [75]:
def user_message(text):
    """Wrap `text` in the TASK prompt template sent as the user turn."""
    template = "\nTASK:\n    Text: {}\n"
    return template.format(text)

In [76]:
def recipe_ingredients(text: str, label_entities: dict) -> list:
    """Return the entities stored under the ``'product'`` key.

    ``text`` is part of the signature but is not read here.
    Raises KeyError when ``label_entities`` has no ``'product'`` entry.
    """
    products = label_entities['product']
    return products

In [77]:
def generate_functions(labels: list) -> list:
    """Build the ``tools`` payload describing the ``recipe_ingredients`` function.

    The parameters schema is an object with one optional property per entry in
    ``labels``; each property is an array of strings. No other properties are
    allowed.

    The previous version used the literal text
    ``"r'^(?:' + '|'.join({labels}) + ')$'"`` as the single property name, so the
    labels never reached the schema.

    Args:
        labels: iterable of entity type names (e.g. ``"product"``).

    Returns:
        A one-element list suitable for the ``tools`` argument of a chat
        completion request.
    """
    # A fresh schema dict per label so the entries do not share mutable state.
    properties = {
        label: {"type": "array", "items": {"type": "string"}}
        for label in labels
    }
    return [
        {
            "type": "function",
            "function": {
                "name": "recipe_ingredients",
                "description": "Enrich Text with recipe ingredients",
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "additionalProperties": False,
                },
            },
        }
    ]

In [81]:
@retry(wait=wait_random_exponential(min=1, max=10), stop=stop_after_attempt(5))
def run_openai_task(labels, text):
    """Ask the model to tag `text` and run the tool call it returns.

    The request forces a call to the ``recipe_ingredients`` tool, decodes the
    JSON arguments of the first tool call and passes them to the matching local
    function. Any exception raised inside triggers a retry (at most 5 attempts,
    random exponential wait between 1 and 10).

    Returns a dict with the raw completion under ``"model_response"`` and the
    local function's result under ``"function_response"``.
    """
    conversation = [
        {"role": "system", "content": system_message(labels=labels)},
        {"role": "assistant", "content": assisstant_message()},
        {"role": "user", "content": user_message(text=text)},
    ]
    forced_tool = {"type": "function", "function": {"name": "recipe_ingredients"}}

    # Uses the `tools` / `tool_choice` request fields.
    response = openai.chat.completions.create(
        model=OPENAI_MODEL,
        messages=conversation,
        tools=generate_functions(labels),
        tool_choice=forced_tool,
        temperature=0,
        frequency_penalty=0,
        presence_penalty=0,
    )

    # Only the first tool call of the first choice is considered.
    tool_call = response.choices[0].message.tool_calls[0]

    # Dispatch table from tool name to the local implementation.
    handlers = {"recipe_ingredients": recipe_ingredients}
    handler = handlers[tool_call.function.name]
    logging.info(f"function_to_call: {handler}")

    call_args = json.loads(tool_call.function.arguments)
    logging.info(f"function_args: {call_args}")

    handler_result = handler(text, call_args)

    return {"model_response": response,
            "function_response": handler_result}

In [211]:
text = """
yellow bell pepper
"""
result = run_openai_task(labels, text)

 2024-04-16 00:51:13,091 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 2024-04-16 00:51:13,114 - INFO - function_to_call: <function recipe_ingredients at 0x136422520>
 2024-04-16 00:51:13,115 - INFO - function_args: {'product': ['yellow bell pepper']}


In [None]:
result

{'model_response': ChatCompletion(id='chatcmpl-9EUt1K5PE7gvmEauVpsFEiMCfKHXG', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content=None, role='assistant', function_call=None, tool_calls=[ChatCompletionMessageToolCall(id='call_jdwd2I646K6fDIS4Avhl8cuJ', function=Function(arguments='{"product":["olive oil","onion","garlic","carrots","zucchini","green beans","celery","vegetable broth","diced tomatoes","thyme","black pepper","spinach"]}', name='recipe_ingredients'), type='function')]))], created=1713242347, model='gpt-4-turbo-2024-04-09', object='chat.completion', system_fingerprint='fp_76f018034d', usage=CompletionUsage(completion_tokens=42, prompt_tokens=392, total_tokens=434)),
 'function_response': ['olive oil',
  'onion',
  'garlic',
  'carrots',
  'zucchini',
  'green beans',
  'celery',
  'vegetable broth',
  'diced tomatoes',
  'thyme',
  'black pepper',
  'spinach']}

In [40]:
from typing import  Optional

class USDAFoodIngredient(BaseModel):
    """Energy entry for one ingredient, as built from a USDA food search hit."""
    # 'fdcId' of the matched food.
    fdc_id: int
    # The query string that was searched (includes the surrounding double quotes).
    name: str
    # 'nutrientId' of the nutrient row the energy value was taken from.
    nutrition_id: int
    # 'value' of that nutrient row.
    energy: float
    # 'unitName' of that nutrient row (e.g. 'KCAL').
    unit: str 

class Ingredient(BaseModel):
    """Recipe ingredient plus the name extracted by the NER step."""
    name: str
    quantity: str
    # First entity returned by the NER call for `name`.
    ner_name: str

class EnrichRecipe(BaseModel):
    """Recipe whose ingredients are Ingredient models (i.e. carry a ner_name)."""
    # Same fields as Recipe; only the ingredient element type differs.
    id: str
    recipe_name:str
    ingredients: List[Ingredient]
    directions: str
    nutrition: Nutrition
    total_calories_estimation: str  

class EnrichRecipes(BaseModel):
    """Container model holding a list of EnrichRecipe objects (empty by default)."""
    recipes: List[EnrichRecipe] = []

class IngredientWithEngergy(BaseModel):
    """Ingredient extended with the optional USDA energy lookup result."""
    name: str
    quantity: str
    ner_name: str
    # None when no USDA entry was found for the ingredient.
    usda_food_ingredient:Optional[USDAFoodIngredient]=None

class EnrichRecipeWithEnergy(BaseModel):
    """Recipe whose ingredients are IngredientWithEngergy models."""
    # Same fields as EnrichRecipe; only the ingredient element type differs.
    id: str
    recipe_name:str
    ingredients: List[IngredientWithEngergy]
    directions: str
    nutrition: Nutrition
    total_calories_estimation: str 

class EnrichRecipesWithEnergy(BaseModel):
    """Container model holding a list of EnrichRecipeWithEnergy objects (empty by default)."""
    recipes: List[EnrichRecipeWithEnergy] = []

In [37]:
def enrich_recipes_with_ner(recipes:Recipes):
    """Run the NER call on every ingredient name and return enriched recipes.

    For each ingredient dict (keys ``'name'`` and ``'quantity'``) one
    ``run_openai_task`` call is made, and the first returned entity becomes
    ``ner_name``. When the call returns no entity, the original ingredient name
    is used instead, so a single empty answer no longer aborts the whole batch
    with an IndexError.

    Args:
        recipes: the Recipes container to process.

    Returns:
        An EnrichRecipes container with one EnrichRecipe per input recipe, in
        the same order.
    """
    enrich_recipes = EnrichRecipes()
    for recipe in recipes.recipes:
        ingredients_list = []
        for item in recipe.ingredients:
            ner_response = run_openai_task(labels, item['name'])
            ner_ingredients = ner_response['function_response']
            if ner_ingredients:
                ner_name = ner_ingredients[0]
            else:
                # Empty NER answer: keep the raw name rather than failing.
                logging.warning(f"No entity returned for ingredient: {item['name']}")
                ner_name = item['name']
            ingredient = Ingredient(name=item['name'],
                                    quantity=item['quantity'],
                                    ner_name=ner_name)
            ingredients_list.append(ingredient)
        enrich_recipe = EnrichRecipe(id=recipe.id,
                                    recipe_name=recipe.recipe_name,
                                    ingredients=ingredients_list,
                                    directions=recipe.directions,
                                    nutrition=recipe.nutrition,
                                    total_calories_estimation=recipe.total_calories_estimation)
        enrich_recipes.recipes.append(enrich_recipe)
    return enrich_recipes

In [41]:
enrich_recpies = enrich_recipes_with_ner(recipes=required_recipes)

 2024-04-15 22:18:00,610 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 2024-04-15 22:18:00,622 - INFO - function_to_call: <function recipe_ingredients at 0x10e492840>
 2024-04-15 22:18:00,623 - INFO - function_args: {'product': ['unsweetened almond milk']}
 2024-04-15 22:18:01,456 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 2024-04-15 22:18:01,462 - INFO - function_to_call: <function recipe_ingredients at 0x10e492840>
 2024-04-15 22:18:01,463 - INFO - function_args: {'product': ['coconut cream']}
 2024-04-15 22:18:02,035 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 2024-04-15 22:18:02,039 - INFO - function_to_call: <function recipe_ingredients at 0x10e492840>
 2024-04-15 22:18:02,040 - INFO - function_args: {'product': ['vanilla extract']}
 2024-04-15 22:18:02,725 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
 

In [42]:
import json
with open('../data/generated_recipe_v3/gpt4_turbo_ner_recipes.json', 'w') as f:
    json.dump(enrich_recpies.model_dump(mode='json'), f, indent=4, ensure_ascii=False)

In [267]:
import json
with open('../data/generated_recipe_v3/gpt4_turbo_ner_recipes.json') as f:
    gpt4_recipe_ner = json.load(f)

In [268]:
recipes_ner = EnrichRecipes(**gpt4_recipe_ner)

In [323]:
import re
def clean_string(input_string):
    """Normalise an ingredient name for lookup.

    Lower-cases the text, removes a fixed list of descriptor words (whole-word
    matches only), rewrites "zucchinis" to "zucchini" and collapses runs of
    whitespace to single spaces.
    """
    # Descriptors removed, in the order they are applied.
    # The colours red, green and black are not stripped.
    dropped_words = (
        'fresh', 'grated', 'extra virgin', 'extravirgin', 'virgin', 'lowfat',
        'white', 'whites', 'brown', 'yellow', 'plain', 'clove', 'cloves',
        'unsweetened', 'kalamata', 'stalks', 'leaves', 'cremini', 'slices',
        'zest', 'ground', 'golden',
    )

    cleaned = input_string.lower()
    for word in dropped_words:
        cleaned = re.sub(r'\b' + word + r'\b', '', cleaned)

    # The only rewrite that is not a deletion.
    cleaned = re.sub(r'\bzucchinis\b', 'zucchini', cleaned)

    # split()/join squeezes the gaps left behind by the removed words.
    return ' '.join(cleaned.split())

In [324]:
clean_string('garlic cloves')

'garlic'

In [89]:
import requests

In [90]:
url = "https://api.nal.usda.gov/fdc/v1/foods/search?query={0}&dataType={1}&requireAllWords=true"

In [92]:
# USDA search `dataType` values; calculate_energy tries them in this order.
data_types = ['Foundation','SR Legacy','Survey (FNDDS)', 'Branded']
# `nutrientId` values that retrieve_energy accepts as the energy row.
# 1008 shows up with unit KCAL in the stored results.
# NOTE(review): 2047 is presumably an alternative energy nutrient - confirm.
nutrient_ids = [1008,2047]

In [93]:
import os

In [94]:
headers = {
  'X-Api-Key': os.environ.get('USDA_API_KEY')
}
payload = {}

In [None]:
def exact_match_query(query:str):
    """Return `query` wrapped in double quotes."""
    quote_char = '"'
    return ''.join((quote_char, query, quote_char))

In [272]:
def calculate_energy(query: str):
    """Look up the energy entry for an ingredient name.

    The name is normalised with ``clean_string`` and quoted with
    ``exact_match_query``. Water / ice cubes and tomatoes are answered from
    hard-coded entries; everything else is searched in each entry of
    ``data_types`` in order, and the first hit is returned.

    Changes from the previous version:
      * the four copy-pasted fallbacks are a single loop over ``data_types``;
      * "water" and "tomatoes" are matched as whole words, so e.g.
        "watermelon" is no longer reported as 0 kcal;
      * the redundant ``'cherry tomatoes'`` test is gone (it was already
        covered by ``'tomatoes'``);
      * ids are passed as ints, matching the model's field types.

    Args:
        query: ingredient name (typically the NER name).

    Returns:
        A USDAFoodIngredient, or None when no data type yields a result.
    """
    clean_query = clean_string(query)
    format_query = exact_match_query(clean_query)
    words = clean_query.split()

    # Hard-coded shortcuts that bypass the API.
    if 'water' in words or 'ice cubes' in clean_query:
        return USDAFoodIngredient(name=format_query, fdc_id=174158, nutrition_id=1008,
                                  unit='KCAL', energy=0.0)
    if 'tomatoes' in words:
        return USDAFoodIngredient(name=format_query, fdc_id=2543214, nutrition_id=1008,
                                  unit='KCAL', energy=22.0)

    # Try each data type in its configured order; stop at the first hit.
    for data_type in data_types:
        formatted_url = url.format(format_query, data_type)
        ingredient = retrieve_energy(query=format_query, formatted_url=formatted_url)
        if ingredient is not None:
            return ingredient
    return None

In [95]:
response = requests.request("GET", url.format('cucumber', data_types[0]), headers=headers, data=payload)

In [96]:
resp_json = response.json()

In [97]:
resp_json

{'totalHits': 2,
 'currentPage': 1,
 'totalPages': 1,
 'pageList': [1],
 'foodSearchCriteria': {'dataType': ['Foundation'],
  'query': 'cucumber',
  'generalSearchInput': 'cucumber',
  'pageNumber': 1,
  'numberOfResultsPerPage': 50,
  'pageSize': 50,
  'requireAllWords': True,
  'foodTypes': ['Foundation']},
 'foods': [{'fdcId': 2346406,
   'description': 'Cucumber, with peel, raw',
   'commonNames': '',
   'additionalDescriptions': '',
   'dataType': 'Foundation',
   'ndbNumber': 11205,
   'publishedDate': '2022-10-28',
   'foodCategory': 'Vegetables and Vegetable Products',
   'mostRecentAcquisitionDate': '2022-05-02',
   'allHighlightFields': '',
   'score': 418.943,
   'microbes': [],
   'foodNutrients': [{'nutrientId': 1089,
     'nutrientName': 'Iron, Fe',
     'nutrientNumber': '303',
     'unitName': 'MG',
     'derivationCode': 'A',
     'derivationDescription': 'Analytical',
     'derivationId': 1,
     'value': 0.0,
     'foodNutrientSourceId': 1,
     'foodNutrientSourceCo

In [101]:
def http_get(url:str, timeout: float = 30.0):
    """Send a GET request to `url` using the module-level `headers` and `payload`.

    Args:
        url: fully formatted request URL.
        timeout: seconds to wait for the server before giving up. Without a
            timeout the call can block forever on an unresponsive server.

    Returns:
        The `requests` response object.
    """
    return requests.request("GET", url, headers=headers, data=payload, timeout=timeout)

In [102]:
from typing import Optional
def retrieve_energy(query: str, formatted_url:str)-> "Optional[USDAFoodIngredient]":
    """Fetch the first search hit for `query` and extract its energy nutrient.

    Changes from the previous version:
      * the failure message printed an undefined name (``ingredient``), which
        raised NameError or picked up a stale notebook global; it now prints
        ``query``;
      * the return annotation names the type that is actually returned;
      * a response without ``foods`` / ``foodNutrients``, or a nutrient row
        without ``value``, is treated as "not found" instead of raising
        KeyError.

    Args:
        query: the search string, stored as ``name`` on the result.
        formatted_url: complete request URL for this query and data type.

    Returns:
        A USDAFoodIngredient built from the first nutrient whose id is in
        ``nutrient_ids``, or None when the request fails, nothing is found or
        the first food has no such nutrient.
    """
    response = http_get(url=formatted_url)
    if response.status_code != 200:
        print(f'Fail to get response for ingredient:{query}')
        return None

    foods = response.json().get('foods') or []
    if not foods:
        return None

    # Only the top-ranked hit is inspected.
    food_details = foods[0]
    fdc_id = food_details['fdcId']
    for item in food_details.get('foodNutrients') or []:
        if item.get('nutrientId') in nutrient_ids and 'value' in item:
            return USDAFoodIngredient(name=query,
                                      fdc_id=fdc_id,
                                      nutrition_id=item['nutrientId'],
                                      unit=item['unitName'],
                                      energy=item['value'])
    return None

In [269]:
recipes_ner.recipes

[EnrichRecipe(id='63', recipe_name='Low-Cholesterol Vanilla Ice Cream', ingredients=[Ingredient(name='unsweetened almond milk', quantity='2 cups', ner_name='unsweetened almond milk'), Ingredient(name='coconut cream', quantity='1 cup', ner_name='coconut cream'), Ingredient(name='pure vanilla extract', quantity='2 teaspoons', ner_name='vanilla extract'), Ingredient(name='maple syrup', quantity='1/2 cup', ner_name='maple syrup'), Ingredient(name='cornstarch', quantity='2 tablespoons', ner_name='cornstarch'), Ingredient(name='salt', quantity='1/4 teaspoon', ner_name='salt')], directions="1. Mix Ingredients: In a medium saucepan, combine the almond milk, coconut cream, maple syrup, cornstarch, and salt. Whisk them together until the cornstarch is fully dissolved.\n2. Cook: Place the saucepan over medium heat. Cook the mixture, stirring constantly, until it begins to thicken slightly and just begins to bubble. Then remove from heat and stir in vanilla extract.\n3. Chill: Transfer the mixture

In [273]:
# Build a copy of every NER-enriched recipe in which each ingredient also
# carries the result of calculate_energy (None when nothing was found).
enrich_recipes_with_energy = EnrichRecipesWithEnergy()
for recipe in recipes_ner.recipes:
    ingredients_list=[]
    for item in recipe.ingredients:
        # The lookup is keyed on the NER name, not on the raw ingredient name.
        usda_food_ingredient = calculate_energy(item.ner_name)
        # Printed after every lookup, whether or not a value was found.
        print(f'Engergy calculated for:{item.ner_name}')
        ingredient = IngredientWithEngergy(name=item.name,
                                           quantity=item.quantity,
                                           ner_name=item.ner_name,
                                           usda_food_ingredient=usda_food_ingredient)
        ingredients_list.append(ingredient)
    enrich_recipe = EnrichRecipeWithEnergy(id=recipe.id,
                                recipe_name=recipe.recipe_name,
                                ingredients=ingredients_list,
                                directions=recipe.directions,
                                nutrition=recipe.nutrition,
                                total_calories_estimation=recipe.total_calories_estimation)
    enrich_recipes_with_energy.recipes.append(enrich_recipe)

Engergy calculated for:unsweetened almond milk
Engergy calculated for:coconut cream
Engergy calculated for:vanilla extract
Engergy calculated for:maple syrup
Engergy calculated for:cornstarch
Engergy calculated for:salt
Engergy calculated for:zucchini
Engergy calculated for:eggplant
Engergy calculated for:yellow bell pepper
Engergy calculated for:red onion
Engergy calculated for:cherry tomatoes
Engergy calculated for:garlic cloves
Engergy calculated for:olive oil
Engergy calculated for:balsamic vinegar
Engergy calculated for:dried oregano
Engergy calculated for:dried basil
Engergy calculated for:black pepper
Engergy calculated for:feta cheese
Engergy calculated for:carrots
Engergy calculated for:celery stalks
Engergy calculated for:apples
Engergy calculated for:lemon juice
Engergy calculated for:ginger
Engergy calculated for:carrots
Engergy calculated for:parsley
Engergy calculated for:dill
Engergy calculated for:lemon zest
Engergy calculated for:lemon juice
Engergy calculated for:oliv

In [274]:
import json
with open('../data/generated_recipe_v3/gpt4_turbo_enrich_recipes.json', 'w') as f:
    json.dump(enrich_recipes_with_energy.model_dump(mode='json'), f, indent=4, ensure_ascii=False)

In [275]:
import json
with open('../data/generated_recipe_v3/gpt4_turbo_enrich_recipes.json') as f:
    gpt4_enrich_recipe = json.load(f)

In [106]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [320]:
df = pd.read_csv('../data/final_usda_ingredients.csv')

In [260]:
df

Unnamed: 0,name,modifier,portion,weight(g)
0,cucumber,,"1 slice, 1 stick, 1 medium, 1 small, 1 cup",101020065120
1,carrots,,"1 cup, 1 regular, 1 medium",1206060
2,oil,olive,"1 cup, 1 tablespoon",22414
3,lemon,,"1 slice, 1 wedge, 1 lemon",8865
4,thyme,,"1 tablespoon, 1 teaspoon","4.3,1.4"
5,spinach,,"1 cup, 1 leaf, 1 babyleaf",25100.6
6,garlic,,"1 cup, 1 clove, 1 teaspoon",13535
7,cheese,feta,"1 wedge, 1 cup, 1 cubic inch",3815017
8,milk,almond,"1 cup, 1 fl oz",24430.5
9,milk,,"1 cup, 1 fl oz",24430.5


In [276]:
# Unicode vulgar-fraction characters mapped to their numeric value;
# consulted by _extract_fractional.
FRACTIONS = {
    "½": 0.5,
    "⅓": 1 / 3,
    "⅔": 2 / 3,
    "¼": 0.25,
    "¾": 0.75,
    "⅕": 0.2,
    "⅖": 0.4,
    "⅗": 0.6,
    "⅘": 0.8,
    "⅙": 1 / 6,
    "⅚": 5 / 6,
    "⅛": 0.125,
    "⅜": 0.375,
    "⅝": 0.625,
    "⅞": 0.875,
}

In [277]:
def _extract_fractional(input_string: str) -> float:
    """Convert a textual quantity to a float.

    Accepted forms, tried in this order:
      1. a unicode fraction from FRACTIONS, optionally preceded by a whole
         number ("½", "1⅔") - returned without rounding;
      2. anything ``float()`` accepts ("2", "0.25") - rounded to 2 places;
      3. "a/b" or "w a/b" - rounded to 2 places.

    Changes from the previous version:
      * the separate ``input_string in FRACTIONS`` check was unreachable (the
        loop above already handles a bare unicode fraction) and is removed;
      * a zero denominator or a malformed "a/b" part raises the documented
        ValueError instead of ZeroDivisionError, so callers that catch
        ValueError handle it.

    Raises:
        ValueError: when the text matches none of the accepted forms.
    """
    input_string = input_string.strip()

    # Mixed numbers written with a unicode fraction, e.g. '1⅔'.
    for unicode_fraction, fraction_part in FRACTIONS.items():
        if unicode_fraction in input_string:
            whole_number_part, _, _ = input_string.partition(unicode_fraction)
            whole_number = float(whole_number_part or 0)
            return whole_number + fraction_part

    try:
        return round(float(input_string), 2)
    except ValueError:
        pass

    if "/" in input_string:
        try:
            if " " in input_string:
                whole_part, fractional_part = input_string.split(" ", 1)
            else:
                # Plain fraction: same arithmetic with a zero whole part.
                whole_part, fractional_part = "0", input_string
            numerator, denominator = fractional_part.split("/")
            return round(float(whole_part) + float(numerator) / float(denominator), 2)
        except (ValueError, ZeroDivisionError):
            pass  # reported uniformly below

    raise ValueError(f"Unrecognized fraction format: '{input_string}'")

In [278]:
recipes=EnrichRecipesWithEnergy(**gpt4_enrich_recipe)

In [279]:
recipes.recipes

[EnrichRecipeWithEnergy(id='63', recipe_name='Low-Cholesterol Vanilla Ice Cream', ingredients=[IngredientWithEngergy(name='unsweetened almond milk', quantity='2 cups', ner_name='unsweetened almond milk', usda_food_ingredient=USDAFoodIngredient(fdc_id=2479376, name='"unsweetened almond milk"', nutrition_id=1008, energy=12.0, unit='KCAL')), IngredientWithEngergy(name='coconut cream', quantity='1 cup', ner_name='coconut cream', usda_food_ingredient=USDAFoodIngredient(fdc_id=170171, name='"coconut cream"', nutrition_id=1008, energy=357.0, unit='KCAL')), IngredientWithEngergy(name='pure vanilla extract', quantity='2 teaspoons', ner_name='vanilla extract', usda_food_ingredient=USDAFoodIngredient(fdc_id=173471, name='"vanilla extract"', nutrition_id=1008, energy=288.0, unit='KCAL')), IngredientWithEngergy(name='maple syrup', quantity='1/2 cup', ner_name='maple syrup', usda_food_ingredient=USDAFoodIngredient(fdc_id=2665764, name='"maple syrup"', nutrition_id=1008, energy=367.0, unit='KCAL')), 

In [280]:
import re

def extract_quantity_and_unit(text):
    """Find a "<number> <weight unit>" pair in `text`.

    Recognised units: ounces, ounce, grams, gram, oz, g.

    Changes from the previous version:
      * the unit must end at a word boundary, so "2 garlic cloves" no longer
        yields ('2', 'g');
      * the number must not directly follow a digit, '.' or '/', so the
        denominator of "1/2 oz" is no longer read as the quantity.

    Returns:
        ``(quantity, unit)`` as strings when a pair is found, otherwise
        ``("NA", "NA")``.
    """
    pattern = r"(?<![\d./])(\d+\.?\d*)\s*(ounces|ounce|grams|gram|oz|g)\b"
    match = re.search(pattern, text)
    if match is None:
        return "NA", 'NA'
    # group(1) is the numeric text, group(2) the unit word.
    return match.group(1), match.group(2)

# Example usage
text = "15 ounces"
quantity, unit = extract_quantity_and_unit(text)
print(f"Extracted Quantity: {quantity} {unit}")

Extracted Quantity: 15 ounces


In [281]:
def clean_hyphen(input_string):
    """Return the part of `input_string` before the first '-' (all of it if none)."""
    # split() yields the whole string as its only element when no '-' is present.
    head = input_string.split("-")[0]
    return head

In [328]:
def parse_quantity(quantity: str):
    """Split a free-text recipe quantity into ``(portion, unit)``.

    Steps, in order:
      1. drop / rewrite a few known annotations ("packed", "(optional)", ...);
      2. anything mentioning "taste" becomes ``(1.0, 'tablespoon')``;
      3. an explicit weight ("15 ounces", "200g") is returned as found by
         ``extract_quantity_and_unit`` (portion is a string in that case);
      4. one token: a number gives ``(value, 'NA')``, a word gives
         ``(1.0, word)``;
      5. two tokens: ``(number, unit)``;
      6. otherwise the first run of digits gives ``(int, 'NA')``, else
         ``('NA', 'NA')``.

    Change from the previous version: when the first of two tokens is not a
    number (e.g. "a pinch"), the ValueError from ``_extract_fractional`` used
    to escape and abort the calling loop; such input now falls through to
    step 6.

    NOTE(review): the 'pack' replacement also removes "pack" inside longer
    words such as "package" - confirm whether that is intended.
    """
    quantity = quantity.replace('packed','').strip()
    quantity = quantity.replace('pack','').strip()
    quantity = quantity.replace('(optional)','').strip()
    quantity = quantity.replace('(sliced)','small').strip()
    quantity = quantity.replace('(minced)','clove').strip()
    quantity = quantity.replace('inch piece','inch').strip()
    quantity = ' '.join(quantity.split())
    if 'taste' in quantity:
        return 1.0, 'tablespoon'
    portion, unit = extract_quantity_and_unit(quantity)
    if portion != 'NA' and unit != 'NA':
        return portion, unit 
    parts = quantity.split(" ")
    if len(parts) == 0:
        return 'NA', 'NA'
    if len(parts) == 1:
        try:
            clean_input = clean_hyphen(parts[0])
            result = _extract_fractional(clean_input)
            if isinstance(result, float):
                return result,'NA'
        except ValueError as e:
            # A lone non-numeric token is treated as one unit of that token.
            return 1.0, parts[0]
    if len(parts) == 2:
        try:
            clean_input = clean_hyphen(parts[0])
            return _extract_fractional(clean_input),parts[1]
        except (ValueError, ZeroDivisionError):
            # First token is not a usable number; fall back to the digit scan.
            pass
    matches = re.findall(r'\d+', quantity)
    if matches:
        portion = int(matches[0])
        return portion, 'NA'
    return 'NA', 'NA'

In [329]:
parse_quantity('4 (such as Granny Smith or Honeycrisp)')

(4, 'NA')

In [283]:
from pydantic import BaseModel
from typing import  Optional
class IngredientUnitPortion(BaseModel):
    """Parsed quantity and computed weight for one recipe ingredient."""
    name: str
    # Ingredient name after clean_string normalisation.
    clean_name: str
    # Original quantity text.
    quantity: str
    # Numeric amount, or a string such as 'NA' when it could not be parsed.
    portion: str|float
    unit: str
    # Result of calculate_weight, or the string 'NA' when no weight was found.
    weight: float|str

In [284]:
from typing import  List
class IngredientEnrich(BaseModel):
    """All parsed ingredient portions belonging to one recipe."""
    # Declared int here, whereas the recipe models declare `id` as str.
    recipe_id: int
    ingredient_unit_portion:List[IngredientUnitPortion]=[]

class IngredientsEnrich(BaseModel):
    """Container model holding a list of IngredientEnrich objects (empty by default)."""
    ingredients:List[IngredientEnrich]=[]

In [285]:
# Plural unit words mapped to their singular form; applied to the parsed unit
# before the weight lookup.
PLURAL_TO_SINGULAR={
    'cups': 'cup',
    'slices': 'slice',
    'pieces': 'piece',
    'teaspoons': 'teaspoon',
    'tablespoons': 'tablespoon',
    'cloves': 'clove',
    'ounces': 'ounce',
    'ozs': 'oz'
}

In [286]:
df['name'] = df['name'].str.lower()

In [287]:
def index_contaning_substring(portions:List[str], input_string:str):
    """Return the index of the first portion whose unit word occurs in `input_string`.

    Each portion is expected to look like "<count> <unit> ..." (e.g. "1 cup");
    the second whitespace-separated token is taken as the unit word and matched
    as a substring of `input_string`.

    Changes from the previous version:
      * entries with fewer than two tokens are skipped instead of raising
        IndexError;
      * tokens are split on any whitespace, so a doubled space no longer
        produces an empty unit word that matches every input;
      * the always-true ``unit in item`` check is removed.

    Returns:
        The matching index, or -1 when no portion matches.
    """
    for index, item in enumerate(portions):
        tokens = item.split()
        if len(tokens) < 2:
            continue  # no unit word to compare
        if tokens[1] in input_string:
            return index
    return -1

In [288]:
# Select the rows of `df` whose 'name' matches `query`, allowing a trailing 's'.
def filter_data_by_query(df, query):
    """Return the rows of `df` whose ``name`` cell matches `query`.

    Matching is case-insensitive and whole-word. A cell matches when it contains
    the query itself, the query followed by "s", or the query with its trailing
    "s" characters stripped.
    """
    def check_plural_match(cell):
        haystack = str(cell).lower()
        needle = query.lower()
        candidate_patterns = (
            r'\b' + re.escape(needle) + r's?\b',           # query, optionally pluralised
            r'\b' + re.escape(needle.rstrip('s')) + r'\b',  # query without trailing 's'
        )
        for candidate in candidate_patterns:
            if re.search(candidate, haystack):
                return True
        return False

    # Boolean mask over the 'name' column, then row selection.
    name_matches = df['name'].map(check_plural_match)
    return df[name_matches]


# Apply the function across the DataFrame
query = 'tomatoes'
result_df = filter_data_by_query(df, query)

In [289]:
result_df

Unnamed: 0,name,modifier,portion,weight(g)
14,tomatoes,"can, canned","1 whole, 1 cup",110240
18,tomatoes,sundried,"1 cup, 1 piece",542
29,tomatoes,cherry,1 cup,244
40,tomatoes,,"1 slice, 1 grape tomato, 1 cherry, 1 italian t...",208176060125180
41,tomatoes,"raw, diced","1 slice, 1 grape tomato, 1 cherry, 1 italian t...",208176060125180


In [290]:
result_df[result_df['modifier'].str.contains('diced',na=False)]

Unnamed: 0,name,modifier,portion,weight(g)
41,tomatoes,"raw, diced","1 slice, 1 grape tomato, 1 cherry, 1 italian t...",208176060125180


In [312]:
def find_ingredient(df:pd.DataFrame, query:str):
    """Locate the rows of `df` describing the ingredient named by `query`.

    * One word: rows whose name matches; if there is more than one (or none),
      only those with a null ``modifier`` are kept.
    * Two words: the first word is tried as the name with the second as a
      substring of ``modifier``; if that finds nothing the roles are swapped.
    * Any other word count: the string ``'NA'`` is returned instead of a frame.
    """
    words = query.split(' ')

    if len(words) == 1:
        candidates = filter_data_by_query(df, words[0].strip())
        if len(candidates) == 1:
            return candidates
        return candidates[candidates['modifier'].isnull()]

    if len(words) != 2:
        return 'NA'

    first, second = words[0].strip(), words[1].strip()

    # First attempt: "<name> <modifier>".
    named_by_first = filter_data_by_query(df, first)
    hits = named_by_first[named_by_first['modifier'].str.contains(second, na=False)]
    if len(hits) != 0:
        return hits

    # Second attempt: "<modifier> <name>".
    named_by_second = filter_data_by_query(df, second)
    return named_by_second[named_by_second['modifier'].str.contains(first, na=False)]
        

In [313]:
find_ingredient(df,'almond milk')

Unnamed: 0,name,modifier,portion,weight(g)
8,milk,almond,"1 cup, 1 fl oz",24430.5


In [297]:
def query_df(df:pd.DataFrame,query:str, portion:float, unit:str):
    """Convert ``portion`` x ``unit`` of ingredient `query` to a weight in grams.

    Weight units are converted directly (ounces at 28.0 g each, grams as-is).
    Any other unit is looked up in `df`: the first matching row whose
    ``portion`` list contains the unit supplies the per-unit weight from the
    same position of its ``weight(g)`` list.

    Changes from the previous version:
      * `portion` is converted to float once, so string portions such as '15'
        no longer raise TypeError when multiplied;
      * 'oz' / 'ozs' and 'g' / 'gram' / 'grams' are handled as weight units;
      * a non-DataFrame answer from ``find_ingredient`` (the string 'NA')
        returns 'NA' instead of failing on ``.iterrows()``;
      * rows with missing or mismatched portion / weight cells are skipped;
      * the never-triggered ``retry`` bookkeeping is removed.

    Returns:
        The weight as a float, or the string ``'NA'`` when it cannot be
        determined.
    """
    print(query)
    try:
        amount = float(portion)
    except (TypeError, ValueError):
        return 'NA'

    if 'ounce' in unit or unit in ('oz', 'ozs'):
        return amount * 28.0
    if unit in ('g', 'gram', 'grams'):
        return amount

    rows = find_ingredient(df, query)
    if isinstance(rows, str):
        # find_ingredient signals "no lookup possible" with a plain string.
        return 'NA'

    for _, row in rows.iterrows():
        portion_cell, weight_cell = row['portion'], row['weight(g)']
        if pd.isna(portion_cell) or pd.isna(weight_cell):
            continue
        portions = [item.strip() for item in str(portion_cell).split(',')]
        position = index_contaning_substring(portions, unit)
        if position == -1:
            continue
        weights = [item.strip() for item in str(weight_cell).split(',')]
        if position >= len(weights):
            continue  # portion and weight lists are out of step
        try:
            actual_weight = amount * float(weights[position])
        except ValueError:
            continue
        if actual_weight >= 0.0:
            return actual_weight
    return 'NA'

In [298]:
def calculate_weight(df: pd.DataFrame,ingredient_name: str,portion:str|float, unit:str):
    if portion == 'NA' or unit == 'NA':
        return 'NA'
    return query_df(df,ingredient_name,portion,unit)

In [299]:
calculate_weight(df, 'almond', 1.0, 'cup')

almond


141.0

In [330]:
# For every ingredient of every recipe: parse the quantity text, pick a default
# unit for unit-less countable items, normalise the unit, and look up a weight.
ingredients_enrich = IngredientsEnrich(ingredients=[])
for recipe in recipes.recipes:
    # recipe.id is declared str while recipe_id is declared int.
    ingredient_enrich = IngredientEnrich(recipe_id=recipe.id, ingredient_unit_portion=[])
    for item in recipe.ingredients:
        ner_name = item.ner_name
        quantity = item.quantity
        portion, unit = parse_quantity(quantity)
        # Default units apply only when parsing produced no unit ('NA').
        # The first matching branch wins. The lemon rule checks the NER name;
        # all the other rules check the raw ingredient name.
        if 'lemon' in ner_name and unit == 'NA':
            unit = 'lemon'
        elif ('stalk' in item.name or 'stalks' in item.name) and unit == 'NA' :
            unit = 'stalk'
        elif 'apple' in item.name and unit == 'NA':
            unit = 'medium' 
        # NOTE(review): 'egg' is a substring test, so a name containing
        # 'eggplant' also takes this branch - confirm that is intended.
        elif ('egg' in item.name or 'eggs' in item.name) and unit == 'NA':
            unit = 'egg'
        elif ('zucchini' in item.name or 'zucchinis' in item.name) and unit == 'NA':
            unit = 'medium'
        elif 'mushroom' in item.name and unit == 'NA':
            unit = 'piece'
        elif 'carrot' in item.name and unit == 'NA':
            unit = 'medium'
        # Normalise plural units before the weight lookup.
        if unit in PLURAL_TO_SINGULAR.keys():
            unit = PLURAL_TO_SINGULAR[unit]
        clean_name = clean_string(ner_name)
        weight = calculate_weight(df,clean_name,portion, unit)
        ingredient_unit_portion = IngredientUnitPortion(name=item.name,
                                                        clean_name=clean_string(ner_name),
                                                        quantity=quantity,
                                                        portion=portion,
                                                        unit=unit,
                                                        weight=weight)
        ingredient_enrich.ingredient_unit_portion.append(ingredient_unit_portion)
    ingredients_enrich.ingredients.append(ingredient_enrich)

almond milk
coconut cream
vanilla extract
maple syrup
cornstarch
salt
zucchini
eggplant
bell pepper
red onion
cherry tomatoes
garlic
olive oil
balsamic vinegar
dried oregano
dried basil
black pepper
feta cheese
carrots
celery
apples
lemon juice
ginger
carrots
parsley
dill
lemon
lemon juice
olive oil
garlic
black pepper
olive oil
onion
garlic
carrots
zucchini
green beans
celery
vegetable broth
diced tomatoes
thyme
black pepper
spinach
apples
cinnamon
nutmeg
ginger
lemon juice
water
walnuts
raisins


In [331]:
import json
# Persist the enriched ingredients as indented JSON.
# ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
# is pinned to UTF-8 instead of being left to the platform default, which may
# be unable to encode them (or may produce a file that is not valid UTF-8).
with open('../data/generated_recipe_v3/gpt4_turbo_enrich_ingredients.json', 'w', encoding='utf-8') as f:
    json.dump(ingredients_enrich.model_dump(mode='json'), f, indent=4, ensure_ascii=False)

In [332]:
# Print the object currently bound to `recipes` in full (plain-text form).
print(recipes)

recipes=[EnrichRecipeWithEnergy(id='63', recipe_name='Low-Cholesterol Vanilla Ice Cream', ingredients=[IngredientWithEngergy(name='unsweetened almond milk', quantity='2 cups', ner_name='unsweetened almond milk', usda_food_ingredient=USDAFoodIngredient(fdc_id=2479376, name='"unsweetened almond milk"', nutrition_id=1008, energy=12.0, unit='KCAL')), IngredientWithEngergy(name='coconut cream', quantity='1 cup', ner_name='coconut cream', usda_food_ingredient=USDAFoodIngredient(fdc_id=170171, name='"coconut cream"', nutrition_id=1008, energy=357.0, unit='KCAL')), IngredientWithEngergy(name='pure vanilla extract', quantity='2 teaspoons', ner_name='vanilla extract', usda_food_ingredient=USDAFoodIngredient(fdc_id=173471, name='"vanilla extract"', nutrition_id=1008, energy=288.0, unit='KCAL')), IngredientWithEngergy(name='maple syrup', quantity='1/2 cup', ner_name='maple syrup', usda_food_ingredient=USDAFoodIngredient(fdc_id=2665764, name='"maple syrup"', nutrition_id=1008, energy=367.0, unit='K

In [349]:
# Inspect the list of per-recipe enriched ingredient entries.
ingredients_enrich.ingredients

[IngredientEnrich(recipe_id=63, ingredient_unit_portion=[IngredientUnitPortion(name='unsweetened almond milk', clean_name='almond milk', quantity='2 cups', portion=2.0, unit='cup', weight=488.0), IngredientUnitPortion(name='coconut cream', clean_name='coconut cream', quantity='1 cup', portion=1.0, unit='cup', weight=296.0), IngredientUnitPortion(name='pure vanilla extract', clean_name='vanilla extract', quantity='2 teaspoons', portion=2.0, unit='teaspoon', weight=8.4), IngredientUnitPortion(name='maple syrup', clean_name='maple syrup', quantity='1/2 cup', portion=0.5, unit='cup', weight=157.5), IngredientUnitPortion(name='cornstarch', clean_name='cornstarch', quantity='2 tablespoons', portion=2.0, unit='tablespoon', weight=20.0), IngredientUnitPortion(name='salt', clean_name='salt', quantity='1/4 teaspoon', portion=0.25, unit='teaspoon', weight=1.5)]),
 IngredientEnrich(recipe_id=62, ingredient_unit_portion=[IngredientUnitPortion(name='zucchini', clean_name='zucchini', quantity='2 (sli

In [334]:
def retrieve_enrich_ingredient(ingredients_enrich:IngredientsEnrich, id:int):
    """Look up the enriched-ingredient entry that belongs to one recipe.

    Args:
        ingredients_enrich: Container whose ``ingredients`` are searched in order.
        id: Recipe id compared against each entry's ``recipe_id``.

    Returns:
        The first entry whose ``recipe_id`` equals ``id``, or ``None`` when no
        entry matches.
    """
    matching_entries = (
        entry for entry in ingredients_enrich.ingredients if id == entry.recipe_id
    )
    return next(matching_entries, None)

In [369]:
from typing import List
def retrieve_ingredient_energy(ingredient_with_energy:List[IngredientWithEngergy], ingredient_name:str):
    """Find the USDA energy value recorded for a named ingredient.

    Args:
        ingredient_with_energy: Ingredients to search, in order.
        ingredient_name: Name that must equal an ingredient's ``name`` exactly.

    Returns:
        ``usda_food_ingredient.energy`` of the first ingredient that has this
        name and a truthy ``usda_food_ingredient``; the string ``'NA'`` when
        there is no such ingredient.
    """
    known_energies = (
        candidate.usda_food_ingredient.energy
        for candidate in ingredient_with_energy
        if ingredient_name == candidate.name and candidate.usda_food_ingredient
    )
    return next(known_energies, 'NA')

In [337]:
# Spot-check the lookup using recipe id 63.
retrieve_enrich_ingredient(ingredients_enrich, 63)

IngredientEnrich(recipe_id=63, ingredient_unit_portion=[IngredientUnitPortion(name='unsweetened almond milk', clean_name='almond milk', quantity='2 cups', portion=2.0, unit='cup', weight=488.0), IngredientUnitPortion(name='coconut cream', clean_name='coconut cream', quantity='1 cup', portion=1.0, unit='cup', weight=296.0), IngredientUnitPortion(name='pure vanilla extract', clean_name='vanilla extract', quantity='2 teaspoons', portion=2.0, unit='teaspoon', weight=8.4), IngredientUnitPortion(name='maple syrup', clean_name='maple syrup', quantity='1/2 cup', portion=0.5, unit='cup', weight=157.5), IngredientUnitPortion(name='cornstarch', clean_name='cornstarch', quantity='2 tablespoons', portion=2.0, unit='tablespoon', weight=20.0), IngredientUnitPortion(name='salt', clean_name='salt', quantity='1/4 teaspoon', portion=0.25, unit='teaspoon', weight=1.5)])

In [338]:
def calculate_ingredient_actual_energy(weight, energy):
    """Scale a per-100 energy figure to an ingredient's weight.

    Args:
        weight: Ingredient weight (number or numeric string), or ``'NA'`` when
            unknown. Presumably grams -- TODO confirm against the weight source.
        energy: Energy per 100 units of weight (number or numeric string), or
            ``'NA'`` when unknown.

    Returns:
        ``energy / 100 * weight`` rounded to two decimals, or the string
        ``'NA'`` if either input is ``'NA'``.
    """
    # Either placeholder makes the result unknown.
    if 'NA' in (weight, energy):
        return 'NA'
    energy_per_unit = float(energy) / 100.0
    return round(energy_per_unit * float(weight), 2)

In [339]:
from pydantic import BaseModel
from typing import List, Optional
class RecipeWithEnergy(BaseModel):
    """Calorie total computed for a single recipe."""
    id: str  # recipe identifier
    name: str  # recipe name
    # Recipe-level calorie total. A str is accepted as well -- presumably to
    # allow an 'NA' placeholder (TODO confirm intended use).
    usda_calorie_estimation: float| str

class RecipesWithEnergy(BaseModel):
    """Container holding a list of RecipeWithEnergy results."""
    # Empty by default; results are appended to this list after construction.
    recipe_with_energy: List[RecipeWithEnergy] = []

In [363]:
# Display whichever recipe the global `recipe` is currently bound to.
recipe

EnrichRecipeWithEnergy(id='31', recipe_name='Cinnamon-Spiced Baked Apples', ingredients=[IngredientWithEngergy(name='large apples', quantity='4 (such as Granny Smith or Honeycrisp)', ner_name='apples', usda_food_ingredient=USDAFoodIngredient(fdc_id=1750340, name='"apples"', nutrition_id=2047, energy=64.7, unit='KCAL')), IngredientWithEngergy(name='ground cinnamon', quantity='2 teaspoons', ner_name='ground cinnamon', usda_food_ingredient=USDAFoodIngredient(fdc_id=171849, name='"cinnamon"', nutrition_id=1008, energy=253.0, unit='KCAL')), IngredientWithEngergy(name='nutmeg', quantity='1/2 teaspoon', ner_name='nutmeg', usda_food_ingredient=USDAFoodIngredient(fdc_id=172335, name='"nutmeg"', nutrition_id=1008, energy=884.0, unit='KCAL')), IngredientWithEngergy(name='ground ginger', quantity='1/2 teaspoon', ner_name='ground ginger', usda_food_ingredient=USDAFoodIngredient(fdc_id=169231, name='"ginger"', nutrition_id=1008, energy=80.0, unit='KCAL')), IngredientWithEngergy(name='fresh lemon jui

In [380]:
def calculate_total_energy_for_recipe(id:str,recipe:EnrichRecipeWithEnergy):
    """Sum the energy of every ingredient of one recipe.

    For each enriched ingredient portion of the recipe, the ingredient's energy
    is looked up in ``recipe.ingredients`` by name and scaled to the portion's
    weight. Each ingredient's name, computed energy and weight are printed.

    Args:
        id: Recipe id; converted with ``int`` to find the matching entry in the
            global ``ingredients_enrich``.
        recipe: Recipe whose ``ingredients`` supply the energy values and whose
            ``id`` / ``recipe_name`` are copied into the result.

    Returns:
        RecipeWithEnergy: the recipe's id, name and total rounded to two
        decimals. Ingredients whose energy is unknown (``'NA'``) are left out
        of the total.

    Raises:
        ValueError: if ``ingredients_enrich`` has no entry for ``id``.
    """
    ingredient = retrieve_enrich_ingredient(ingredients_enrich, int(id))
    if ingredient is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise ValueError(f"No enriched ingredients found for recipe id {id!r}")

    recipe_total_calories = 0
    for ingredient_unit_portion in ingredient.ingredient_unit_portion:
        weight = ingredient_unit_portion.weight
        energy = retrieve_ingredient_energy(recipe.ingredients,ingredient_unit_portion.name)
        actual_energy = calculate_ingredient_actual_energy(weight, energy)
        print(ingredient_unit_portion.name, actual_energy, weight)
        # 'NA' is a string placeholder for a missing weight/energy; adding it to
        # the numeric total would raise TypeError, so it is excluded.
        if actual_energy == 'NA':
            continue
        recipe_total_calories += actual_energy

    recipe_with_energy = RecipeWithEnergy(id=recipe.id, name=recipe.recipe_name,usda_calorie_estimation=round(recipe_total_calories,2))
    return recipe_with_energy

In [391]:
# Requires the global `recipe` to be set first (see `recipe = get_recipe('29')`,
# which appears in a cell further down this notebook).
calculate_total_energy_for_recipe('29',recipe)

carrots (grated) 97.92 240.0
fresh parsley (chopped) 5.4 15.0
fresh dill (chopped) 37.2 310.0
lemon zest 0.65 3.8
lemon juice 7.59 34.5
olive oil 101.08 28.0
garlic (minced) 4.29 3.0
black pepper 0.65 0.575


RecipeWithEnergy(id='29', name='Zesty Lemon Herb Carrot Salad', usda_calorie_estimation=254.78)

In [341]:
def get_recipe(id:str):
    """Return the recipe with the given id from the global ``recipes``.

    Args:
        id: Recipe id compared against each recipe's ``id``.

    Returns:
        The first recipe in ``recipes.recipes`` whose ``id`` equals ``id``, or
        ``None`` when there is no match.
    """
    matching_recipes = (candidate for candidate in recipes.recipes if id == candidate.id)
    return next(matching_recipes, None)

In [390]:
# Bind the global `recipe` to the recipe with id '29' (None if there is none).
recipe = get_recipe('29')

In [346]:
# Compute the USDA-based calorie total for every recipe in `recipes`.
recipes_with_energy = RecipesWithEnergy()
for recipe in recipes.recipes:
    ingredient = retrieve_enrich_ingredient(ingredients_enrich, int(recipe.id))
    if ingredient is None:
        # Fail with a clear message instead of an AttributeError on None below.
        raise ValueError(f"No enriched ingredients found for recipe id {recipe.id!r}")
    recipe_total_calories = 0
    for ingredient_unit_portion in ingredient.ingredient_unit_portion:
        weight = ingredient_unit_portion.weight
        energy = retrieve_ingredient_energy(recipe.ingredients,ingredient_unit_portion.name)
        actual_energy = calculate_ingredient_actual_energy(weight, energy)
        # 'NA' is a string placeholder for a missing weight/energy; adding it to
        # the numeric total would raise TypeError, so it is excluded.
        if actual_energy == 'NA':
            continue
        recipe_total_calories += actual_energy
    recipe_with_energy = RecipeWithEnergy(id=recipe.id, name=recipe.recipe_name,usda_calorie_estimation=round(recipe_total_calories,2))
    recipes_with_energy.recipe_with_energy.append(recipe_with_energy)

In [408]:
# Display the per-recipe calorie totals collected in `recipes_with_energy`.
recipes_with_energy

RecipesWithEnergy(recipe_with_energy=[RecipeWithEnergy(id='63', name='Low-Cholesterol Vanilla Ice Cream', usda_calorie_estimation=1804.44), RecipeWithEnergy(id='62', name='Mediterranean Veggie Delight', usda_calorie_estimation=728.44), RecipeWithEnergy(id='64', name='Refreshing Garden Juice', usda_calorie_estimation=390.77), RecipeWithEnergy(id='29', name='Zesty Lemon Herb Carrot Salad', usda_calorie_estimation=254.78), RecipeWithEnergy(id='27', name='Hearty Low-Sodium Vegetable Soup', usda_calorie_estimation=461.67), RecipeWithEnergy(id='31', name='Cinnamon-Spiced Baked Apples', usda_calorie_estimation=689.81)])

In [402]:
def get_recipe_with_energy(recpie_id:str, recipes_with_energy:RecipesWithEnergy):
    """Find the calorie result for one recipe id.

    Args:
        recpie_id: Recipe id compared against each result's ``id``.
        recipes_with_energy: Container whose ``recipe_with_energy`` list is
            searched in order.

    Returns:
        The first result whose ``id`` equals ``recpie_id``, or ``None`` when
        there is no match.
    """
    matching_results = (
        entry for entry in recipes_with_energy.recipe_with_energy if entry.id == recpie_id
    )
    return next(matching_results, None)

In [417]:
import re

def extract_ingredient_calories(text):
    """
    Pull out the per-ingredient calorie section of a calorie estimation text.

    The section starts right after the "- Calories for each ingredient:" header
    and runs until the next line that begins with "- " followed by a letter, or
    until the end of the text.

    Args:
    text (str): Multi-line string to search.

    Returns:
    str: The section with surrounding whitespace stripped, or the message
    "No calorie information found." when the header is absent.
    """
    # DOTALL lets '.' cross newlines so the lazy group can span several lines.
    section = re.search(
        r"- Calories for each ingredient:(.*?)(?=\n- [A-Za-z]|\Z)",
        text,
        flags=re.DOTALL,
    )
    if section is None:
        return "No calorie information found."
    return section.group(1).strip()

def calculate_total_calories(ingredients_info):
    """
    Calculate the total calories listed in an ingredient breakdown.

    Every line containing 'calories' contributes the first "<number> calories"
    value found outside parentheses, so optional ingredients are counted while
    parenthesised details (e.g. per-unit figures) are ignored.

    Numbers may use comma thousands separators ("1,200") and decimal fractions
    ("4.5"); both are read as the full value rather than only their trailing
    digits.

    Args:
        ingredients_info (str): String containing ingredient names followed by
            calorie values, one ingredient per line. Empty or None yields 0.

    Returns:
        int | float: Total calories. An int when every matched value is a whole
        number; otherwise a float rounded to two decimals.
    """
    # Nothing to parse for empty/None input.
    if not ingredients_info:
        return 0

    total_calories = 0

    for line in ingredients_info.split('\n'):
        if 'calories' not in line:
            continue
        # Ignore details within parentheses
        cleaned_line = re.sub(r'\(.*?\)', '', line)
        # First number followed by 'calories': either comma-grouped thousands or
        # a plain digit run, with an optional decimal fraction. The lookbehind
        # keeps the match from starting in the middle of a digit run.
        match = re.search(
            r'(?<!\d)(\d{1,3}(?:,\d{3})+|\d+)(\.\d+)?\s+calories',
            cleaned_line,
        )
        if not match:
            continue
        whole_part = match.group(1).replace(',', '')
        fraction_part = match.group(2)
        if fraction_part:
            total_calories += float(whole_part + fraction_part)
        else:
            total_calories += int(whole_part)

    # Only fractional inputs turn the total into a float; tidy its precision.
    if isinstance(total_calories, float):
        return round(total_calories, 2)
    return total_calories

# Sample calorie-estimation text used to exercise the two helpers above.
text = """
 - Total Calories for entire recipe: Approximately 560 calories for the entire recipe. 
- Calories for each ingredient:
    Apples: 380 calories (95 calories per apple)
    Cinnamon, Nutmeg, Ground Ginger: Negligible
    Lemon Juice: 4 calories per tablespoon
    Water: 0 calories
    Walnuts: 164 calories per 1/4 cup (if added)
    Golden Raisins: 108 calories per 1/4 cup (if added)
- Serving People: 4
"""

# Extract only the per-ingredient section (between the ingredient header and
# the next "- " bullet) and show it.
extracted_calories = extract_ingredient_calories(text)
print(extracted_calories)
# Sum the per-ingredient figures. Every matched "<n> calories" value is added,
# including the "(if added)" ingredients, so the result need not equal the
# headline total quoted in the sample text.
total_calories = calculate_total_calories(extracted_calories)
print(f"Total Calories: {total_calories}")


Apples: 380 calories (95 calories per apple)
    Cinnamon, Nutmeg, Ground Ginger: Negligible
    Lemon Juice: 4 calories per tablespoon
    Water: 0 calories
    Walnuts: 164 calories per 1/4 cup (if added)
    Golden Raisins: 108 calories per 1/4 cup (if added)
Total Calories: 656


In [419]:
# Derive a GPT-4 calorie total per recipe from its free-text calorie estimation.
gpt4_turbo_total_calories = []
for item in recipes_ner.recipes:
    extracted_calories = extract_ingredient_calories(item.total_calories_estimation)
    temp = {
        'id': item.id,
        'gpt4_recipe_total_calories': calculate_total_calories(extracted_calories),
    }
    gpt4_turbo_total_calories.append(temp)
    

In [420]:
# Pair each GPT-4 total with the USDA-based total of the recipe with the same id.
total_calories_estimation = []
for item in gpt4_turbo_total_calories:
    recipe_with_energy = get_recipe_with_energy(item['id'], recipes_with_energy)
    calories_estimation = {
        'id': recipe_with_energy.id,
        'name': recipe_with_energy.name,
        'usda_calorie_estimation': recipe_with_energy.usda_calorie_estimation,
        'gpt4_calorie_estimation': item['gpt4_recipe_total_calories'],
    }
    total_calories_estimation.append(calories_estimation)
    


In [421]:
# Display the USDA vs. GPT-4 calorie totals side by side.
total_calories_estimation

[{'id': '63',
  'name': 'Low-Cholesterol Vanilla Ice Cream',
  'usda_calorie_estimation': 1804.44,
  'gpt4_calorie_estimation': 1334},
 {'id': '62',
  'name': 'Mediterranean Veggie Delight',
  'usda_calorie_estimation': 728.44,
  'gpt4_calorie_estimation': 633},
 {'id': '64',
  'name': 'Refreshing Garden Juice',
  'usda_calorie_estimation': 390.77,
  'gpt4_calorie_estimation': 318},
 {'id': '29',
  'name': 'Zesty Lemon Herb Carrot Salad',
  'usda_calorie_estimation': 254.78,
  'gpt4_calorie_estimation': 236},
 {'id': '27',
  'name': 'Hearty Low-Sodium Vegetable Soup',
  'usda_calorie_estimation': 461.67,
  'gpt4_calorie_estimation': 367},
 {'id': '31',
  'name': 'Cinnamon-Spiced Baked Apples',
  'usda_calorie_estimation': 689.81,
  'gpt4_calorie_estimation': 656}]

In [424]:
# Save the calorie comparison as indented JSON.
with open("../data/generated_recipe_v3/total_calorie_estimation.json", "w") as outfile:
    outfile.write(json.dumps(total_calories_estimation, indent=4))