In [3]:
import os
import numpy as np
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import chromadb
from datasets import load_dataset
import json
import re




In [2]:
!pip install tf-keras

Collecting tf-keras
  Using cached tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Using cached tf_keras-2.19.0-py3-none-any.whl (1.7 MB)
Installing collected packages: tf-keras
Successfully installed tf-keras-2.19.0


In [4]:
# Labels that introduce a field in a serialized recipe document.
_RECIPE_FIELD_LABELS = ('Title', 'Ingredients', 'Instructions', 'NER')


def _extract_field(doc: str, label: str):
    """Return the value of the ``label: `` field in ``doc``, or None if the label is absent.

    The label must start a line, so text such as ``DINNER: ...`` is not mistaken
    for the ``NER: `` field. The value runs until the next line that starts with a
    known field label (or the end of the document), so multi-line values are kept.
    """
    known_labels = '|'.join(_RECIPE_FIELD_LABELS)
    pattern = rf'^{re.escape(label)}: (.*?)(?=\n(?:{known_labels}): |\Z)'
    match = re.search(pattern, doc, flags=re.MULTILINE | re.DOTALL)
    if match is None:
        return None
    # Drop only the line terminator(s); other trailing whitespace is preserved.
    return match.group(1).rstrip('\n')


def parse_recipe(doc: str, metadata: dict) -> dict:
    """
    Parse a recipe document into structured format.

    The document is expected to contain ``Title: ``, ``Ingredients: ``,
    ``Instructions: `` and ``NER: `` fields, each starting on its own line.
    A field value may span several lines; it ends where the next field starts.

    Args:
        doc (str): Raw recipe document
        metadata (dict): Recipe metadata; its 'title' entry is used when the
            document has no ``Title: `` field

    Returns:
        dict: Parsed recipe with keys 'title' (str), 'ingredients' (list of str,
            split on ', ' with empty entries removed), 'instructions' (str),
            'ner' (str), 'raw_text' (the unmodified ``doc``) and 'metadata'
            (the ``metadata`` argument, passed through).
    """
    title = _extract_field(doc, 'Title')
    if title is None:
        title = metadata.get('title', '')

    # An empty "Ingredients: " line must give [], not [''].
    raw_ingredients = _extract_field(doc, 'Ingredients') or ''
    ingredients = [item for item in raw_ingredients.split(', ') if item.strip()]

    instructions = _extract_field(doc, 'Instructions') or ''
    ner = _extract_field(doc, 'NER') or ''

    return {
        'title': title,
        'ingredients': ingredients,
        'instructions': instructions,
        'ner': ner,
        'raw_text': doc,
        'metadata': metadata
    }

def _recipe_to_document(recipe) -> tuple:
    """Serialize one dataset row into the (document text, metadata dict) pair stored for it."""
    recipe_text = (
        f"Title: {recipe['recipeName']}\n"
        f"Ingredients: {', '.join(recipe['ingredients'])}\n"
        f"Instructions: {recipe['steps']}\n"
        f"NER: {recipe['NER']}\n"
    )
    metadata = {
        'title': recipe['recipeName'],
        'ingredients_count': len(recipe['ingredients']),
        'instructions_length': len(recipe['steps']),
        'serves': recipe['serves']
    }
    return recipe_text, metadata


def create_vector_db(
    test_ratio: float = 0.2,
    db_path: str = "recipes_vectorstore",
    collection_name: str = "recipes",
    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',
    batch_size: int = 100,
    test_output_path: str = 'test_recipes.json',
):
    """
    Create vector database for training data and save test data as JSON.

    The first ``test_ratio`` fraction of the dataset's 'train' split is held out
    as test data; the remainder is embedded and stored in a ChromaDB collection.
    Any existing collection with the same name is deleted and rebuilt.

    Args:
        test_ratio (float): Ratio of data to use for testing (default: 0.2)
        db_path (str): Directory of the persistent ChromaDB store
        collection_name (str): Name of the collection to (re)create
        model_name (str): SentenceTransformer model used for the embeddings
        batch_size (int): Number of documents embedded and added per batch
        test_output_path (str): File the parsed test recipes are written to

    Raises:
        ValueError: If ``test_ratio`` is outside [0, 1] or ``batch_size`` < 1.
    """
    if not 0.0 <= test_ratio <= 1.0:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio}")
    if batch_size < 1:
        # A non-positive step would make the batching loop fail or silently add nothing.
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Load the dataset
    dataset = load_dataset("ashikan/diabetic-friendly-recipes")
    recipes = dataset['train']

    # Split into train and test: the leading rows are the test set
    test_size = int(len(recipes) * test_ratio)
    train_recipes = recipes.select(range(test_size, len(recipes)))
    test_recipes = recipes.select(range(test_size))

    # Initialize ChromaDB client for training data
    client = chromadb.PersistentClient(path=db_path)

    # Recreate the collection from scratch. Depending on the chromadb version,
    # list_collections() yields Collection objects or plain name strings, so
    # accept both.
    existing_collections = [
        getattr(entry, 'name', entry) for entry in client.list_collections()
    ]
    if collection_name in existing_collections:
        client.delete_collection(collection_name)
        print(f"Deleted existing collection: {collection_name}")

    collection = client.create_collection(collection_name)

    # Initialize the sentence transformer model
    model = SentenceTransformer(model_name)

    # Process training recipes for vector DB
    print("Processing training recipes for vector database...")
    documents = []
    metadatas = []
    ids = []

    for idx, recipe in enumerate(tqdm(train_recipes)):
        recipe_text, metadata = _recipe_to_document(recipe)
        documents.append(recipe_text)
        metadatas.append(metadata)
        # IDs are positions within the training split, not the full dataset.
        ids.append(str(idx))

    # Generate embeddings and add to collection batch by batch
    for start in tqdm(range(0, len(documents), batch_size)):
        stop = start + batch_size
        batch_docs = documents[start:stop]

        embeddings = model.encode(batch_docs).tolist()

        collection.add(
            documents=batch_docs,
            embeddings=embeddings,
            metadatas=metadatas[start:stop],
            ids=ids[start:stop]
        )

    # Process test recipes for JSON
    print("Processing test recipes for JSON...")
    test_parsed = []

    for recipe in tqdm(test_recipes):
        recipe_text, metadata = _recipe_to_document(recipe)
        test_parsed.append(parse_recipe(recipe_text, metadata))

    # Save test recipes to JSON
    with open(test_output_path, 'w', encoding='utf-8') as f:
        json.dump(test_parsed, f, indent=2)

    print(f"Created vector database with {len(train_recipes)} training recipes")
    print(f"Saved {len(test_recipes)} test recipes to {test_output_path}")

In [5]:
# Rebuild the vector store and write the held-out test recipes to JSON,
# using the function's defaults (test_ratio=0.2).
create_vector_db() 

Deleted existing collection: recipes
Processing training recipes for vector database...


100%|██████████| 575/575 [00:00<00:00, 1420.71it/s]
100%|██████████| 6/6 [01:01<00:00, 10.32s/it]


Processing test recipes for JSON...


100%|██████████| 143/143 [00:00<00:00, 1748.05it/s]


Created vector database with 575 training recipes
Saved 143 test recipes to test_recipes.json
