### Truncate the source CSV to its first 500 records

In [105]:
import json
import os
import sys
import json
import numpy as np
import pandas as pd
import ast

import pandas as pd

# Source CSV: semicolon-delimited, decoded as latin1 ('windows-1252' is the
# alternative encoding to try).
CSV_PATH = "../dataset/dataset.csv"
CSV_OPTIONS = {"sep": ";", "encoding": "latin1"}

df = pd.read_csv(CSV_PATH, **CSV_OPTIONS)

# Take the leading 500 rows.
df_first_500 = df.iloc[:500]

# NOTE: this writes back to the very path that was just read, so the file on
# disk is replaced by its 500-row version.
df_first_500.to_csv(CSV_PATH, index=False, **CSV_OPTIONS)




### Convert only the first 300 records to JSON Lines to reduce the loading time

In [107]:
import json
import os
import sys
import json
import numpy as np
import pandas as pd
import ast

# Read the semicolon-delimited CSV as latin1 ('windows-1252' is the fallback to try).
df = pd.read_csv("../dataset/dataset.csv", sep=";", encoding="latin1")

# Restrict the export to the first 300 rows.
df_300 = df.iloc[:300]

# Emit one JSON object per line. ensure_ascii=False writes non-ASCII characters
# verbatim into the UTF-8 file instead of as \uXXXX escapes.
with open("../dataset/dataset.jsonl", "w", encoding="utf-8") as jsonl_file:
    jsonl_file.writelines(
        json.dumps(record.to_dict(), ensure_ascii=False) + "\n"
        for _, record in df_300.iterrows()
    )


### Clean the JSONL records and save them as dataset_ok.jsonl for loading

In [109]:
import os
import sys
import json
import ast
import re
import pandas as pd

# Input JSONL file and the cleaned JSONL file this cell eventually writes
# (adjust the paths if needed).
DATASET = "../dataset/dataset.jsonl"
OUTPUT_DATASET = "../dataset/dataset_ok.jsonl"  # Output file path with .jsonl extension

# Optional row cap taken from the NOTEBOOK_TEST_ROW_LIMIT environment variable;
# when it is unset the default (sys.maxsize) effectively means "no limit".
NROWS = int(os.getenv("NOTEBOOK_TEST_ROW_LIMIT", str(sys.maxsize)))

# Load JSONL: one JSON object per line.
raw_data = []
with open(DATASET, "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. a trailing empty line left by an editor) are not
        # valid JSON documents and would make json.loads raise, so skip them.
        if not line.strip():
            continue
        raw_data.append(json.loads(line))

# Apply row limit
if NROWS < len(raw_data):
    raw_data = raw_data[:NROWS]

# Convert to DataFrame
recipes_df = pd.DataFrame(raw_data)

# Keep only selected columns; columns missing from the data are silently skipped.
columns_to_keep = [
    'Name', 'Rating Value', 'Preparation Time', 'Cooking Time',
    'Category', 'Cuisine', 'Ingredients', 'Instructions', 'Nutrition', 'URL'
]
recipes_df = recipes_df[[col for col in columns_to_keep if col in recipes_df.columns]]

# Keep the first row for each recipe name, then drop rows that have no name.
recipes_df = recipes_df.drop_duplicates(subset=["Name"])
recipes_df = recipes_df.dropna(subset=["Name"])

# Best-effort parser for cells that may hold a stringified list or dict.
def safe_literal_eval(val):
    """Return the Python literal encoded in ``val``, or ``val`` unchanged.

    Non-string inputs are handed back untouched. Strings are passed to
    ``ast.literal_eval``; if that raises, the original string is returned.
    """
    if not isinstance(val, str):
        return val
    try:
        parsed = ast.literal_eval(val)
    except Exception:
        return val
    return parsed

# Columns whose cells may be stringified lists/dicts; only those actually
# present in the frame are parsed.
fields_to_parse = ["Category", "Cuisine", "Ingredients", "Instructions", "Nutrition"]
present_fields = [name for name in fields_to_parse if name in recipes_df.columns]
for field in present_fields:
    recipes_df[field] = recipes_df[field].apply(safe_literal_eval)

# Pull the "ingredient" names out of an Ingredients cell (a list of dicts, or
# the string form of such a list).
def extract_ingredient_names(ingredients_val):
    """Return the stripped, non-empty ``"ingredient"`` values of a cell.

    ``ingredients_val`` may be a list or a string that ``ast.literal_eval``
    turns into a list. Anything else (including a string that fails to parse
    or parses to a non-list) yields ``[]``. List items that are not dicts, and
    names that are not non-blank strings, are skipped.
    """
    if isinstance(ingredients_val, list):
        entries = ingredients_val
    elif isinstance(ingredients_val, str):
        try:
            entries = ast.literal_eval(ingredients_val)
        except Exception:
            return []
        if not isinstance(entries, list):
            return []
    else:
        return []

    candidates = (
        entry.get("ingredient", "") for entry in entries if isinstance(entry, dict)
    )
    return [
        candidate.strip()
        for candidate in candidates
        if isinstance(candidate, str) and candidate.strip()
    ]

# Derive the Ingredient_Names column; when the frame has no Ingredients column
# every row gets its own empty list instead.
if "Ingredients" not in recipes_df.columns:
    recipes_df["Ingredient_Names"] = [[] for _ in range(len(recipes_df))]
else:
    recipes_df["Ingredient_Names"] = recipes_df["Ingredients"].apply(extract_ingredient_names)

# Collapse a list of ingredient names into one string: spaces inside a name
# become underscores, and names are separated by single spaces.
def ingredient_names_to_text(ingredient_names_list):
    """Return the names as space-separated tokens with inner spaces underscored.

    Non-list input yields ``""``; list items that are not strings are skipped.
    """
    if not isinstance(ingredient_names_list, list):
        return ""
    tokens = []
    for name in ingredient_names_list:
        if isinstance(name, str):
            tokens.append(name.replace(" ", "_"))
    return " ".join(tokens)

recipes_df["Ingredient_Names_Text"] = recipes_df["Ingredient_Names"].apply(ingredient_names_to_text)


def _to_text(value):
    """Flatten one Category/Cuisine/Instructions cell into a plain string.

    Lists are joined with single spaces; each item is passed through ``str``
    so a stray non-string item no longer makes the join raise. Missing values
    (None / NaN) become ``""`` rather than the literal text "None" / "nan".
    Any other value is returned as ``str(value)``.
    """
    if isinstance(value, list):
        return " ".join(str(item) for item in value)
    # is_scalar guards pd.isna, which returns an array for array-like input.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return str(value)


# Join lists into strings for fields used as text (except Ingredient_Names which stays as list)
list_fields_to_join = ["Category", "Cuisine", "Instructions"]
for field in list_fields_to_join:
    if field in recipes_df.columns:
        recipes_df[field] = recipes_df[field].apply(_to_text)

# Rename columns to match schema field names (replace spaces with underscores).
# Entries whose source column is not present in the frame are simply ignored
# by DataFrame.rename.
rename_map = {
    "Rating Value": "Rating_Value",
    "Preparation Time": "Preparation_Time",
    "Cooking Time": "Cooking_Time",
    "Saturated Fat": "Saturated_Fat",
    "Unsaturated Fat": "Unsaturated_Fat",
    # Add more if needed
}
# Rebind instead of inplace=True so the statement reads as a pure transformation.
recipes_df = recipes_df.rename(columns=rename_map)

# Coerce numeric fields: values that cannot be parsed as numbers, and missing
# values, become 0.0. Only fields that already exist as columns at this point
# are touched.
numeric_fields = [
    "Rating_Value",
    "Preparation_Time",
    "Cooking_Time",
    "Calories",
    "Carbohydrates",
    "Cholesterol",
    "Fiber",
    "Protein",
    "Saturated_Fat",
    "Sodium",
    "Sugar",
    "Fat",
    "Unsaturated_Fat",
]

for field in numeric_fields:
    if field in recipes_df.columns:
        recipes_df[field] = pd.to_numeric(recipes_df[field], errors='coerce').fillna(0.0)

# Add 'id' column as first column with values like 'rec1', 'rec2', ...
recipes_df.insert(0, 'id', [f"rec{i+1}" for i in range(len(recipes_df))])

# --- Nutrition extraction helpers ---
# A well-formed unsigned decimal number: "12", "12.5" or ".5". A lone "." or
# an ellipsis does not match, so stray punctuation ahead of the number cannot
# be picked up in place of the number itself.
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?|\.\d+")


def extract_float(nutrition_dict, key):
    """Return the first number found in ``nutrition_dict[key]`` as a float.

    The value is converted with ``str`` and searched for the first unsigned
    decimal number, so "250 kcal" gives 250.0 and "approx. 12.5 g" gives 12.5.

    Args:
        nutrition_dict: Mapping of nutrient name to a value such as "12.5g".
        key: Nutrient name to look up.

    Returns:
        The parsed number, or 0.0 when the key is absent, the value holds no
        number, or ``nutrition_dict`` cannot be queried (e.g. it is None).
    """
    try:
        val = nutrition_dict.get(key, "0")
        match = _NUMBER_RE.search(str(val))
        return float(match.group(0)) if match else 0.0
    except Exception:
        # Deliberate best-effort: any failure to read the value counts as 0.0.
        return 0.0

# Keys of the Nutrition dict that are lifted into their own numeric columns.
nutrition_keys = [
    "Calories",
    "Carbohydrates",
    "Cholesterol",
    "Fiber",
    "Protein",
    "Saturated Fat",
    "Sodium",
    "Sugar",
    "Fat",
    "Unsaturated Fat",
]


def _nutrient_value(cell, nutrient):
    """Numeric value of ``nutrient`` in a Nutrition cell; 0.0 for non-dict cells."""
    if isinstance(cell, dict):
        return extract_float(cell, nutrient)
    return 0.0


def _nutrition_as_text(cell):
    """Dict cells become a single-line JSON string; anything else goes through str()."""
    if isinstance(cell, dict):
        return json.dumps(cell, ensure_ascii=False)
    return str(cell)


if "Nutrition" in recipes_df.columns:
    # One column per nutrient, named after the key with spaces underscored.
    for key in nutrition_keys:
        col_name = key.replace(" ", "_")
        recipes_df[col_name] = recipes_df["Nutrition"].apply(_nutrient_value, args=(key,))
    # Keep Nutrition itself as a JSON string (no indentation) for display.
    recipes_df["Nutrition"] = recipes_df["Nutrition"].apply(_nutrition_as_text)

print(f"Number of recipes loaded and cleaned: {len(recipes_df)}")

# Persist the cleaned frame as JSON Lines (one record per line) for ingestion;
# force_ascii=False keeps non-ASCII characters unescaped.
recipes_df.to_json(OUTPUT_DATASET, orient="records", lines=True, force_ascii=False)
print(f"Cleaned dataset saved to {OUTPUT_DATASET}")


Number of recipes loaded and cleaned: 300
Cleaned dataset saved to ../dataset/dataset_ok.jsonl


In [110]:
# Show full cell contents instead of truncating long values.
pd.set_option("display.max_colwidth", None)

print(recipes_df.columns.tolist())

# Preview the first two recipes, restricted to the ingredient-related columns.
preview_columns = ["Name", "Category", "Ingredients", "Ingredient_Names"]
recipes_df[preview_columns].head(2)

['id', 'Name', 'Rating_Value', 'Preparation_Time', 'Cooking_Time', 'Category', 'Cuisine', 'Ingredients', 'Instructions', 'Nutrition', 'URL', 'Ingredient_Names', 'Ingredient_Names_Text', 'Calories', 'Carbohydrates', 'Cholesterol', 'Fiber', 'Protein', 'Saturated_Fat', 'Sodium', 'Sugar', 'Fat', 'Unsaturated_Fat']


Unnamed: 0,Name,Category,Ingredients,Ingredient_Names
0,Pineapple Glaze for Ham,Dinner,"[{'ingredient': 'pineapple', 'quantity': '1', 'unit': '', 'misc': 'drained with juice reserved 1525 ounce sliced'}, {'ingredient': 'maraschino cherries', 'quantity': '1', 'unit': 'jar', 'misc': 'drained 4 ounce'}, {'ingredient': 'brown sugar', 'quantity': '1', 'unit': 'cup', 'misc': ''}]","[pineapple, maraschino cherries, brown sugar]"
1,Awesome Egg Rolls,Dinner Appetizer,"[{'ingredient': 'vegetable oil', 'quantity': '1', 'unit': 'tsp', 'misc': ''}, {'ingredient': 'egg', 'quantity': '1', 'unit': '', 'misc': 'beaten'}, {'ingredient': 'cabbage', 'quantity': '6', 'unit': 'cups', 'misc': 'shredded'}, {'ingredient': 'bean sprouts', 'quantity': '0.5', 'unit': 'cup', 'misc': 'fresh'}, {'ingredient': 'carrot', 'quantity': '1', 'unit': '', 'misc': 'shredded'}, {'ingredient': 'stalk', 'quantity': '1', 'unit': 'celery', 'misc': 'diced'}, {'ingredient': 'onion', 'quantity': '2', 'unit': 'tablespoons', 'misc': 'chopped'}, {'ingredient': 'shrimp', 'quantity': '1', 'unit': '', 'misc': 'drained 4 ounce'}, {'ingredient': 'soy sauce', 'quantity': '2', 'unit': 'tablespoons', 'misc': ''}, {'ingredient': 'garlic powder', 'quantity': '0.125', 'unit': 'teaspoon', 'misc': ''}, {'ingredient': 'pepper to taste', 'quantity': '', 'unit': 'black', 'misc': ''}, {'ingredient': 'cornstarch', 'quantity': '1', 'unit': 't', 'misc': ''}, {'ingredient': 'oil for frying', 'quantity': '', 'unit': 'vegetable', 'misc': ''}, {'ingredient': 'water', 'quantity': '2', 'unit': 't', 'misc': 'cold'}, {'ingredient': 'roll wrappers', 'quantity': '20', 'unit': 'egg', 'misc': ''}]","[vegetable oil, egg, cabbage, bean sprouts, carrot, stalk, onion, shrimp, soy sauce, garlic powder, pepper to taste, cornstarch, oil for frying, water, roll wrappers]"


In [111]:
# Dump every field of the recipe at positional index 1 as plain text.
second_recipe = recipes_df.iloc[1]
print(second_recipe)

id                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

## Create categories.json

In [112]:
import json
import ast

def _parse_labels(raw):
    """Return the stripped labels held in one Category/Cuisine value.

    ``raw`` is normally the string form of a list (e.g. "['Dinner', 'Appetizer']").
    Handled without raising:
      * a string that is not a Python literal -> treated as a single label;
      * a literal that evaluates to a string  -> treated as a single label
        (rather than being iterated character by character);
      * a missing value (None / NaN) or any other non-collection -> no labels;
      * non-string items inside the collection -> skipped.
    """
    if isinstance(raw, str):
        try:
            raw = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            return [raw.strip()]
    if isinstance(raw, str):
        return [raw.strip()]
    if not isinstance(raw, (list, tuple, set)):
        return []
    return [item.strip() for item in raw if isinstance(item, str)]


categories = set()
cuisines = set()

# DATASET is the JSONL input path defined in the cleaning cell.
with open(DATASET, "r", encoding="utf-8") as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        # .get() tolerates records that lack the field altogether.
        categories.update(_parse_labels(obj.get("Category")))
        cuisines.update(_parse_labels(obj.get("Cuisine")))

# Sorted lists make the output file deterministic.
result = {
    "Category": sorted(categories),
    "Cuisine": sorted(cuisines),
}

with open("../dataset/categories.json", "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)


### Load dataset_ok.jsonl in several smaller batches

In [1]:
import json
import requests

def send_batches(data, batch_size=50,
                 url='http://localhost:8080/data-loader/recipe/run',
                 timeout=30):
    """POST ``data`` to the data-loader endpoint in consecutive slices.

    Each slice of at most ``batch_size`` records is sent as the JSON body of
    one request, and the status code and body of every response are printed.

    Args:
        data: Sequence of JSON-serialisable records.
        batch_size: Maximum number of records per request; must be >= 1.
        url: Endpoint that receives each batch.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled server would block the call forever.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    starts = range(0, len(data), batch_size)
    for batch_number, start in enumerate(starts, start=1):
        batch = data[start:start + batch_size]
        response = requests.post(
            url,
            json=batch,
            headers={'Accept': 'application/json'},
            timeout=timeout,
        )
        print(f"Batch {batch_number} response:", response.status_code, response.text)

# Load the full cleaned dataset: one JSON object per line. Blank lines are
# skipped because they are not valid JSON and would make json.loads raise.
with open('../dataset/dataset_ok.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

send_batches(data, batch_size=50)  # Adjust batch size as needed


Batch 1 response: 202 {"result":"Background task successfully started with name: recipe"}
Batch 2 response: 202 {"result":"Background task successfully started with name: recipe"}
Batch 3 response: 202 {"result":"Background task successfully started with name: recipe"}
Batch 4 response: 202 {"result":"Background task successfully started with name: recipe"}
Batch 5 response: 202 {"result":"Background task successfully started with name: recipe"}
Batch 6 response: 202 {"result":"Background task successfully started with name: recipe"}
