In [2]:
import pandas as pd
import spacy
from spacy import displacy

In [13]:
# Load the spaCy pipeline stored under "output/model-best" (a path relative to
# the notebook's working directory). The resulting `nlp` object is the shared
# pipeline that process_text and the entity-inspection cell below call.
nlp = spacy.load("output/model-best")

In [None]:
# Citation: https://deepgram.com/learn/turning-recipes-into-data-with-named-entity-recognition

from fractions import Fraction
import re


def fraction_to_mixed_number(fraction: Fraction) -> str:
  """
  Render a Fraction as a human-readable mixed number.

  Args:
    fraction: The value to format.

  Returns:
    - the whole number alone when the value is integral ("2", "-2", "0"),
    - "<whole> <proper fraction>" when the magnitude exceeds one
      ("1 1/2", "-1 1/2"),
    - the plain fraction otherwise ("1/2", "-1/2").
  """
  # Split the magnitude, not the signed numerator: divmod floors towards
  # negative infinity, and the old `numerator >= denominator` test sent every
  # negative value (e.g. -3/2) down the "proper fraction" path unchanged.
  sign = "-" if fraction < 0 else ""
  whole, remainder = divmod(abs(fraction.numerator), fraction.denominator)
  if whole == 0:
    # Proper fraction (or zero): the default rendering is already right.
    return str(fraction)
  if remainder == 0:
    return f"{sign}{whole}"
  return f"{sign}{whole} {Fraction(remainder, fraction.denominator)}"


def convert_floats_to_fractions(text: str) -> str:
    """
    Replace every decimal number in `text` with its fraction / mixed-number
    spelling.

    Each match is parsed as a float, approximated by a Fraction via
    `limit_denominator()`, and formatted with `fraction_to_mixed_number`.
    Text that contains no decimal number is returned unchanged.
    """
    def _decimal_to_fraction_text(match):
        # Going through limit_denominator() turns the float's binary
        # approximation back into a small, readable fraction.
        approximated = Fraction(float(match.group())).limit_denominator()
        return fraction_to_mixed_number(approximated)

    decimal_pattern = r'\b-?\d+\.\d+\b'
    return re.sub(decimal_pattern, _decimal_to_fraction_text, text)
def remove_parentheses(text: str) -> str:
  """
  Drop every "(...)" group from `text`, together with any whitespace that
  directly precedes the opening parenthesis.

  A group ends at the first ")" after its "(", so nested parentheses are only
  partially removed and an unmatched ")" is left in place.
  """
  parenthetical = re.compile(r"\s*\([^)]*\)")
  return parenthetical.sub("", text)


def process_text(text):
  """
  Pre-process `text` and run it through the `nlp` pipeline.

  Parenthesised asides are removed first, then decimal numbers are rewritten
  as fractions; the cleaned string is what gets passed to `nlp`, whose result
  is returned.
  """
  without_asides = remove_parentheses(text)
  normalised = convert_floats_to_fractions(without_asides)
  return nlp(normalised)

In [91]:
# Run the pipeline on one sample ingredient line, print each entity with its
# label, and group the entities by label.
# The sample is a raw string so the backslash in "\/" (presumably a
# JSON-escaped slash -- confirm against the data) is kept literally without
# relying on Python passing an unrecognised escape sequence through.
doc = nlp(r"1\/2 cup chopped fresh parsley leaves, divided)")
entitity_map = {}
for ent in doc.ents:
    # setdefault creates the per-label list the first time a label is seen.
    entitity_map.setdefault(ent.label_, []).append(ent)
    print(f"{ent} -> {ent.label_}")
print(entitity_map)

1\/2 -> QUANTITY
cup -> UNIT
chopped -> PROCESS
fresh -> PHYSICAL_QUALITY
parsley -> FOOD
leaves -> UNIT
divided -> PROCESS
{'QUANTITY': [1\/2], 'UNIT': [cup, leaves], 'PROCESS': [chopped, divided], 'PHYSICAL_QUALITY': [fresh], 'FOOD': [parsley]}


In [77]:
def simplify_recipe_schema(schema):
    """
    Reduce one raw recipe record to the smaller dict used by later cells.

    Args:
        schema: Mapping for a single recipe. The keys "recipeIngredient",
            "recipeInstructions" (an iterable of mappings, each with a "text"
            key), "headline" and "description" are required. "nutrition" and
            "image" are optional, as is "image" on each instruction.
            (The layout resembles schema.org Recipe JSON-LD -- confirm.)

    Returns:
        dict with the keys "name", "description", "servings", "mainImageUrl",
        "ingredients" and "steps". Each step is a dict with "order"
        (zero-based position), "description" and "image_url". Optional values
        that are missing, None or empty come back as None.

    Raises:
        KeyError: if one of the required keys is absent.
    """
    ingredients = schema["recipeIngredient"]

    steps = []
    for order, step_dict in enumerate(schema["recipeInstructions"]):
        # A step's "image" is indexed as a list of dicts; an empty list or a
        # None value is treated like a missing key instead of raising.
        step_images = step_dict.get("image")
        steps.append({
            "order": order,
            "description": step_dict["text"],
            "image_url": step_images[0]["url"] if step_images else None
        })

    # Only look inside "nutrition" when it is a dict, so a None value does not
    # blow up the membership test.
    nutrition = schema.get("nutrition")
    servings = nutrition.get("servingSize") if isinstance(nutrition, dict) else None

    main_image = schema.get("image")

    return {
        "name": schema["headline"],
        "description": schema["description"],
        "servings": servings,
        "mainImageUrl": main_image["url"] if main_image else None,
        "ingredients": ingredients,
        "steps": steps
    }

In [78]:
# Read the raw dump into a 2-D array, then simplify every cell of every row
# into one flat list of recipe dicts.
recipes_fp = "recipes_dump.json"
recipe_df = pd.read_json(recipes_fp).to_numpy()
recipes_list = [
    simplify_recipe_schema(schema)
    for recipe_list in recipe_df
    for schema in recipe_list
]

In [79]:
import json
# Write the simplified recipes as JSON Lines: one serialised recipe per line.
with open("recipe_semi_cleaned.jsonl", "w") as file:
    for recipe in recipes_list:
        file.write(json.dumps(recipe) + "\n")
        