In [90]:
import re

# Function to extract nutritional values
def extract_nutrition_values(text):
    # Dictionary to hold the extracted values
    nutrition_values = {}

    # Regular expressions for extracting fat, carbohydrate, and protein
    patterns = {
        'fat': r"Total Fat (\d+g)",
        'carbohydrate': r"Total Carbohydrate (\d+g)",
        'protein': r"Protein (\d+g)",
    }

    # Loop through the patterns and search in the text
    for key, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # If a match is found, add it to the dictionary
            nutrition_values[key] = match.group(1)
        else:
            # If no match is found, set the value to None
            nutrition_values[key] = None

    return nutrition_values

In [91]:
# Sample input text
nutrition_info = """
total fat 8g 10%, 
Saturated Fat 1g 5%, 
Cholesterol 0mg 0%, 
Sodium 25mg 1%, 
Total Carbohydrate 18g 7%, 
Dietary Fiber 3g 11%, 
Total Sugars 9g, 
Protein 4g, 
Vitamin D 0mcg 0%, 
Calcium 20mg 2%, 
Iron 1mg 6%, 
Potassium 120mg 3%
"""
# Extracting the nutrition values
extracted_values = extract_nutrition_values(nutrition_info)

# Printing the results
print(extracted_values)

{'fat': '8g', 'carbohydrate': '18g', 'protein': '4g'}


In [92]:
import glob

In [93]:
file_paths = glob.glob(pathname='../data/*.xml')

In [94]:
from bs4 import  BeautifulSoup

In [95]:
with open(paths[0], 'r') as f:
    file = f.read() 

# 'xml' is the parser used. For html files, which BeautifulSoup is typically used for, it would be 'html.parser'.
soup = BeautifulSoup(file, 'xml')

In [96]:
print(soup.prettify())

<?xml version="1.0" encoding="utf-8"?>
<recipe>
 <title>
  Apple Crumb Pie
 </title>
 <ingredients>
  1 pie crust (store-bought or homemade), 6 cups thinly sliced apples (such as Granny Smith or a combination of tart and sweet apples), 3/4 cup granulated sugar, 2 tbsp all-purpose flour, 1 tsp ground cinnamon, 1/4 tsp ground nutmeg, 1/4 tsp salt, 1 tsp vanilla extract, 1/2 cup unsalted butter (cold and cut into small pieces), 3/4 cup all-purpose flour (for crumb topping), 1/2 cup brown sugar (packed, for crumb topping), 1/4 tsp baking powder (for crumb topping), 1/4 tsp salt (for crumb topping), Optional: Vanilla ice cream or whipped cream for serving
 </ingredients>
 <directions>
  Preheat the oven to 375 degrees F (190 degrees C). Place the pie crust into a 9-inch pie dish and crimp the edges as desired.
In a large bowl, combine the sliced apples, granulated sugar, 2 tbsp flour, cinnamon, nutmeg, 1/4 tsp salt, and vanilla extract. Toss until the apples are evenly coated.
Pour the appl

In [97]:
title = soup.find('title')

In [98]:
title.text.strip()

'Apple Crumb Pie'

In [99]:
ingredients = soup.find('ingredients')
ingredients.text

'\n  1 pie crust (store-bought or homemade), 6 cups thinly sliced apples (such as Granny Smith or a combination of tart and sweet apples), 3/4 cup granulated sugar, 2 tbsp all-purpose flour, 1 tsp ground cinnamon, 1/4 tsp ground nutmeg, 1/4 tsp salt, 1 tsp vanilla extract, 1/2 cup unsalted butter (cold and cut into small pieces), 3/4 cup all-purpose flour (for crumb topping), 1/2 cup brown sugar (packed, for crumb topping), 1/4 tsp baking powder (for crumb topping), 1/4 tsp salt (for crumb topping), Optional: Vanilla ice cream or whipped cream for serving\n '

In [100]:
# Original string with \n at the beginning and end, and extra spaces
original_string = '\n  1 pie crust (store-bought or homemade), 6 cups thinly sliced apples (such as Granny Smith or a combination of tart and sweet apples), 3/4 cup granulated sugar, 2 tbsp all-purpose flour, 1 tsp ground cinnamon, 1/4 tsp ground nutmeg, 1/4 tsp salt, 1 tsp vanilla extract, 1/2 cup unsalted butter (cold and cut into small pieces), 3/4 cup all-purpose flour (for crumb topping), 1/2 cup brown sugar (packed, for crumb topping), 1/4 tsp baking powder (for crumb topping), 1/4 tsp salt (for crumb topping), Optional: Vanilla ice cream or whipped cream for serving\n '

# Using .strip() to remove leading/trailing whitespaces and newline characters
cleaned_string = original_string.strip()

print(cleaned_string)

1 pie crust (store-bought or homemade), 6 cups thinly sliced apples (such as Granny Smith or a combination of tart and sweet apples), 3/4 cup granulated sugar, 2 tbsp all-purpose flour, 1 tsp ground cinnamon, 1/4 tsp ground nutmeg, 1/4 tsp salt, 1 tsp vanilla extract, 1/2 cup unsalted butter (cold and cut into small pieces), 3/4 cup all-purpose flour (for crumb topping), 1/2 cup brown sugar (packed, for crumb topping), 1/4 tsp baking powder (for crumb topping), 1/4 tsp salt (for crumb topping), Optional: Vanilla ice cream or whipped cream for serving


In [116]:
from pydantic import BaseModel
from typing import  List, Optional
class Nutrition(BaseModel):
    fat: Optional[str] = None
    protein: Optional[str] = None
    carbohydrate: Optional[str] = None

class Recipe(BaseModel):
    id: str
    recipe_name:str
    ingredients: str
    directions: str
    nutrition: Nutrition 

class Recipes(BaseModel):
    recipes: List[Recipe] = []


In [117]:
def read_xml_file(file_path: str):
    with open(file_path, 'r') as f:
        file = f.read() 
        return file

In [118]:
def clean_text(content: str):
    clean_content = content.strip()
    return clean_content

In [141]:
from pathlib import Path
def parse_llm_recipes(file_paths:list):
    recipes = Recipes()
    for file_path in file_paths:
        print(f'Parsing file:{file_path}')
        id = Path(file_path).stem
        file = read_xml_file(file_path)
        soup = BeautifulSoup(file, 'xml')
        title = soup.find('title')
        title = clean_text(title.text)
        ingredients = soup.find('ingredients')
        ingredients = clean_text(ingredients.text)
        directions = soup.find('directions')
        directions = clean_text(directions.text)
        nutrition = soup.find('nutrition')
        nutrition = clean_text(nutrition.text)
        parsed_nutrition = extract_nutrition_values(text=nutrition)
        nutrition = Nutrition(fat=parsed_nutrition['fat'], protein=parsed_nutrition['protein'],carbohydrate=parsed_nutrition['carbohydrate'])
        recipe = Recipe(id = id, recipe_name=title, ingredients=ingredients,directions=directions, nutrition=nutrition)
        recipes.recipes.append(recipe) 
    return recipes   

In [143]:
llm_recipes = parse_llm_recipes(file_paths)

Parsing file:../data/49.xml
Parsing file:../data/99.xml
Parsing file:../data/713.xml
Parsing file:../data/887.xml
Parsing file:../data/651.xml
Parsing file:../data/678.xml
Parsing file:../data/244.xml
Parsing file:../data/90.xml
Parsing file:../data/393.xml
Parsing file:../data/581.xml


In [121]:
import json
with open('../data/gpt4_1106_recipes.json', 'w') as f:
    json.dump(llm_recipes.model_dump(mode='json'), f, indent=4, ensure_ascii=False)

In [108]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [109]:
df_orginal_recipe = pd.read_csv('../data/recipes.csv')

In [111]:
df_orginal_recipe = df_orginal_recipe.rename(columns={'Unnamed: 0': 'id'})

In [115]:
df_orginal_recipe.dtypes

id                int64
recipe_name      object
prep_time        object
cook_time        object
total_time       object
servings          int64
yield            object
ingredients      object
directions       object
rating          float64
url              object
cuisine_path     object
nutrition        object
timing           object
img_src          object
dtype: object

In [140]:
df_orginal_recipe[df_orginal_recipe['id'] == 49].to_dict('records')[0]

{'id': 49,
 'recipe_name': 'Apple Crumb Pie',
 'prep_time': '30 mins',
 'cook_time': '50 mins',
 'total_time': '1 hrs 20 mins',
 'servings': 8,
 'yield': '1 9-inch pie',
 'ingredients': '6 cups thinly sliced apples, 1 tablespoon lemon juice (Optional), ¾ cup white sugar, 2 tablespoons all-purpose flour, ½ teaspoon ground cinnamon, ⅛ teaspoon ground nutmeg, ½ cup raisins (Optional), ½ cup chopped walnuts (Optional), 1 (9 inch) pie shell, ½ cup all-purpose flour, ½ cup packed brown sugar, 3 tablespoons butter',
 'directions': 'Preheat the oven to 375 degrees F (190 degrees C).\nPlace sliced apples in a large bowl; sprinkle with lemon juice. Mix white sugar, 2 tablespoons flour, cinnamon, and nutmeg together in a small bowl; sprinkle mixture over apples and toss until evenly coated. Stir in raisins and walnuts; transfer mixture into pie shell.\nMix 1/2 cup flour and brown sugar together in a small bowl. Blend in butter with a fork until mixture is crumbly; sprinkle over apple filling. Cov

In [155]:
def parse_original_recipes(file_paths: list):
    recipes = Recipes()
    for file_path in file_paths:
        id = Path(file_path).stem
        row = df_orginal_recipe[df_orginal_recipe['id'] == int(id)].to_dict('records')[0]
        recipe_name = row['recipe_name']
        ingredients = row['ingredients']
        directions = row['directions']
        nutrition = row['nutrition']
        parsed_nutrition = extract_nutrition_values(text=nutrition)
        nutrition = Nutrition(fat=parsed_nutrition['fat'], protein=parsed_nutrition['protein'],carbohydrate=parsed_nutrition['carbohydrate'])
        recipe = Recipe(id = id, recipe_name=recipe_name, ingredients=ingredients,directions=directions, nutrition=nutrition)
        recipes.recipes.append(recipe)
    return recipes 

In [156]:
orginal_recpies = parse_original_recipes(file_paths=file_paths)

In [159]:
import json
with open('../data/orginal_recipes.json', 'w') as f:
    json.dump(orginal_recpies.model_dump(mode='json'), f, indent=4, ensure_ascii=False)