In [None]:
# Download the recipe dataset from Kaggle.
import kagglehub

# Kaggle dataset handle passed to kagglehub.
DATASET_HANDLE = "rumitpathare/indian-recipes"

# `path` holds whatever kagglehub returns for the download; it is printed
# below as the location of the dataset files.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

In [None]:
# Copy the Food Recipe data file into the current folder.
# The source location is built from `path` (returned by the download cell)
# instead of a hard-coded, user- and version-specific cache directory, so the
# cell works on any machine and with any dataset version.
import shutil
from pathlib import Path

shutil.copy(Path(path) / "Food_Recipe.csv", "./recipes_data.csv");

In [None]:
# Exclude rows with errors
import pandas as pd
from io import StringIO
from tqdm import tqdm


data_file = './recipes_data.csv'
output_file = './pure_recipes_data.csv'

skipped_rows = 0   # number of physical lines rejected by the parser
valid_rows = []    # raw text of every line that parsed cleanly

# Each physical line is handed to pandas on its own and kept only if it parses.
# NOTE(review): because lines are checked one at a time, a quoted field that
# spans several physical lines is never seen as a whole record, and the number
# of columns is not compared between lines - confirm this is acceptable for
# the dataset.
with open(data_file, "r", encoding="utf-8") as file:
    # First pass only counts lines so the progress bar has a total.
    total_lines = sum(1 for _ in file)
    file.seek(0)
    for line in tqdm(file, total=total_lines, desc="Processing lines"):
        try:
            pd.read_csv(StringIO(line), header=None)
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            # ParserError: the line cannot be tokenised as CSV.
            # EmptyDataError: the line is blank; previously this exception was
            # not caught and aborted the whole cell.
            skipped_rows += 1
        else:
            valid_rows.append(line)

print(f"Skipped rows: {skipped_rows}")

with open(output_file, "w", encoding="utf-8") as out_file:
    out_file.writelines(valid_rows)

print(f"Filtered dataset saved to {output_file}")

In [None]:
# Data preprocessing
import pandas as pd
import json
import regex as re


# 1. load the filtered CSV and clean up column names
df = pd.read_csv('pure_recipes_data.csv').rename(columns={
    'prep_time (in mins)': 'prep_time_in_mins',
    'cook_time (in mins)': 'cook_time_in_mins',
})

# 2. replace missing ingredient names / quantities with an empty string
for column in ('ingredients_name', 'ingredients_quantity'):
    df[column] = df[column].fillna('')

# 3. accept a value only if every character is a Latin-script letter (\p{Latin}),
#    a decimal digit (\p{Nd}), punctuation (\p{P}) or whitespace
# NOTE(review): symbol characters (Unicode category S, e.g. '+') are not part of
# the class - confirm that rejecting them is intended.
valid_pattern = re.compile(r'^[\p{Latin}\p{Nd}\p{P}\s]*$')
def is_row_valid(row):
    """Return True when every checked text column of `row` matches `valid_pattern`.

    Each value is converted with `str()` before matching. Checking stops at
    the first column that does not match.
    """
    checked_columns = (
        'name', 'cuisine', 'course', 'diet',
        'ingredients_name', 'ingredients_quantity', 'instructions',
    )
    return all(valid_pattern.match(str(row[column])) for column in checked_columns)

# 4. keep only the rows whose text columns pass the regular-expression check
# .copy() detaches the filtered result from the original frame, so the column
# assignment below does not raise pandas' SettingWithCopyWarning.
df = df[df.apply(is_row_valid, axis=1)].copy()

# 5. Create an 'instruction' column - assumes user input
df['instruction'] = df['ingredients_name'].apply(
    lambda x: f'I have the following ingredients: {x}. Please provide recipes I can make with them.'
)

# 6. create an 'output' column - parsing JSON for Google, Youtube search and organising food recipes

# Sentence terminator: '!' or '?', or a '.' that is not sitting between two
# digits (so decimals such as "1.5" stay intact), plus any whitespace after it.
_SENTENCE_END = re.compile(r'(?:[!?]|(?<!\d)\.|\.(?!\d))\s*')


def _as_text(value):
    """Return `value` as a string, mapping None and float NaN to ''."""
    if value is None or (isinstance(value, float) and value != value):
        return ''
    return str(value)


def format_output(row):
    """Build the training 'output' text for one recipe row.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide the keys 'name', 'cuisine', 'course', 'diet',
        'ingredients_name', 'ingredients_quantity' and 'instructions'.

    Returns
    -------
    str
        A JSON header line (`is_recipe_request`, `recipe_name`), followed by
        the recipe name, cuisine, course and diet, a numbered ingredient list
        and numbered instruction sentences. Every line ends with a newline.

    Notes
    -----
    * Ingredient names are split on ',' and quantities on two spaces, then
      paired positionally; pairs with an empty name or quantity are dropped
      and the remaining ones are numbered consecutively from 1.
    * Instructions are split into sentences on '.', '!' and '?', except for a
      '.' between two digits. Each sentence is written back with a trailing '.'.
    * Missing (None / NaN) ingredient or instruction values are treated as
      empty text instead of raising.
    """
    json_header = json.dumps({"is_recipe_request": True, "recipe_name": row['name']})
    header_lines = [
        json_header,
        f"Recipe Name: {row['name']}",
        f"Cuisine: {row['cuisine']}",
        f"Course: {row['course']}",
        f"Diet: {row['diet']}",
        "Ingredients:",
    ]
    output = "\n".join(header_lines) + "\n"

    names = _as_text(row['ingredients_name']).split(',')
    quantities = _as_text(row['ingredients_quantity']).split('  ')

    # Strip first, drop incomplete pairs, and only then number the survivors
    # so the list never shows gaps such as "1., 3.".
    stripped_pairs = ((name.strip(), qty.strip()) for name, qty in zip(names, quantities))
    complete_pairs = [(name, qty) for name, qty in stripped_pairs if name and qty]
    ingredient_lines = [
        f"{idx}. {name}: {qty}"
        for idx, (name, qty) in enumerate(complete_pairs, start=1)
    ]
    output += "\n".join(ingredient_lines) + "\n"

    output += "Instructions:\n"
    sentences = _SENTENCE_END.split(_as_text(row['instructions']))
    steps = [sentence.strip() for sentence in sentences if sentence.strip()]
    output += "".join(f"{idx}. {step}.\n" for idx, step in enumerate(steps, start=1))

    return output

# 7. create the 'output' column from each row and an empty 'input' column
#    (the input column is intentionally left blank)
# 8. keep only the 3 columns used for training the model
#    (per the original note, this layout is regulated by the UNSLOTH rule)
# assign() builds a new frame instead of writing columns onto the filtered
# slice, which avoids pandas' SettingWithCopyWarning and makes the cell
# produce its result in one step.
df = df.assign(
    output=df.apply(format_output, axis=1),
    input='',
)[['instruction', 'input', 'output']]

# 9. save as CSV
df.to_csv('processed_dataset.csv', index=False, quotechar='"', escapechar="\\")