In [1]:
import re

# Function to extract nutritional values
def extract_nutrition_values(text):
    # Dictionary to hold the extracted values
    nutrition_values = {}

    # Regular expressions for extracting fat, carbohydrate, and protein
    patterns = {
        'fat': r"Total Fat: (?:Approx. )?(?:Approximate )?(\d+\.?\d*g)",
        'carbohydrate': r"Total Carbohydrate: (?:Approx. )?(?:Approximate )?(\d+\.?\d*g)",
        'protein': r"Protein: (?:Approx. )?(?:Approximate )?(\d+\.?\d*g)",
    }

    # Loop through the patterns and search in the text
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # If a match is found, add it to the dictionary
            nutrition_values[key] = match.group(1)
        else:
            # If no match is found, set the value to None
            nutrition_values[key] = None

    return nutrition_values

In [7]:
import glob
from bs4 import  BeautifulSoup
file_paths = glob.glob(pathname='../data/generated_recipe_v3/recipes/*.xml')

In [8]:
from pydantic import BaseModel
from typing import  List, Optional
class Nutrition(BaseModel):
    fat: Optional[str] = None
    protein: Optional[str] = None
    carbohydrate: Optional[str] = None

class Recipe(BaseModel):
    id: str
    recipe_name:str
    ingredients: List[dict]
    directions: str
    nutrition: Nutrition
    total_calories_estimation: str 

class Recipes(BaseModel):
    recipes: List[Recipe] = []

In [9]:
def read_xml_file(file_path: str):
    with open(file_path, 'r') as f:
        file = f.read() 
        return file

def clean_text(content: str):
    clean_content = content.strip()
    return clean_content

def clean_ingredient(content: str):
    ingredients = []
    lines = content.split('\n')
    for line in lines:
        if len(line)>1:
            temp = {}
            if ':' in line:
                tokens = line.split(':')
                temp['name'] = tokens[0].replace('-','').strip()
                temp['quantity'] = tokens[1].strip()
            else:
                print(len(line))
                temp['name'] = line.replace('-','').strip()
                temp['quantity'] = 'NA'
            ingredients.append(temp)
    return ingredients

In [10]:
from pathlib import Path
def parse_llm_recipes(file_paths:list):
    recipes = Recipes()
    for file_path in file_paths:
        print(f'Parsing file:{file_path}')
        id = Path(file_path).stem
        file = read_xml_file(file_path)
        soup = BeautifulSoup(file, 'xml')
        recipe_name = soup.find('recipe_name')
        recipe_name = clean_text(recipe_name.text)
        ingredients = soup.find('ingredients')
        ingredients = clean_ingredient(ingredients.text)
        directions = soup.find('directions')
        directions = clean_text(directions.text)
        nutrition = soup.find('nutrition')
        nutrition = clean_text(nutrition.text)
        parsed_nutrition = extract_nutrition_values(text=nutrition)
        nutrition = Nutrition(fat=parsed_nutrition['fat'], protein=parsed_nutrition['protein'],carbohydrate=parsed_nutrition['carbohydrate'])
        total_calories_estimation = soup.find('total_calories_estimation')
        total_calories_estimation = clean_text(total_calories_estimation.text)
        recipe = Recipe(id = id, 
                        recipe_name = recipe_name, 
                        ingredients = ingredients,
                        directions=directions, 
                        nutrition=nutrition,
                        total_calories_estimation=total_calories_estimation)
        recipes.recipes.append(recipe) 
    return recipes 

In [11]:
llm_recipes = parse_llm_recipes(file_paths)

Parsing file:../data/generated_recipe_v3/recipes/88.xml
Parsing file:../data/generated_recipe_v3/recipes/63.xml
Parsing file:../data/generated_recipe_v3/recipes/77.xml
Parsing file:../data/generated_recipe_v3/recipes/76.xml
Parsing file:../data/generated_recipe_v3/recipes/62.xml
Parsing file:../data/generated_recipe_v3/recipes/89.xml
Parsing file:../data/generated_recipe_v3/recipes/48.xml
Parsing file:../data/generated_recipe_v3/recipes/74.xml
Parsing file:../data/generated_recipe_v3/recipes/60.xml
Parsing file:../data/generated_recipe_v3/recipes/61.xml
Parsing file:../data/generated_recipe_v3/recipes/75.xml
Parsing file:../data/generated_recipe_v3/recipes/49.xml
Parsing file:../data/generated_recipe_v3/recipes/71.xml
Parsing file:../data/generated_recipe_v3/recipes/65.xml
Parsing file:../data/generated_recipe_v3/recipes/59.xml
Parsing file:../data/generated_recipe_v3/recipes/58.xml
Parsing file:../data/generated_recipe_v3/recipes/64.xml
Parsing file:../data/generated_recipe_v3/recipes

In [12]:
import json
with open('../data/generated_recipe_v3/gpt4_turbo_recipes.json', 'w') as f:
    json.dump(llm_recipes.model_dump(mode='json'), f, indent=4, ensure_ascii=False)