### Check for parsing errors

In [1]:
import pandas as pd
from io import StringIO
from tqdm import tqdm

def save_dataset_with_skip(data_file, output_file):
    """Copy the lines of ``data_file`` that pandas can parse into ``output_file``.

    Every physical line is parsed on its own with ``pd.read_csv``. Lines that
    raise a parsing error, and blank lines, are dropped; all other lines are
    written unchanged and in their original order. The number of dropped lines
    is printed as ``Skipped rows: N``.

    Because validation is done per physical line, a quoted field that spans
    several lines is not treated as a single record.

    Args:
        data_file: Path of the UTF-8 file to read.
        output_file: Path of the UTF-8 file that receives the kept lines
            (opened in write mode, so existing content is replaced).

    Returns:
        None.
    """
    skipped_rows = 0
    valid_rows = []

    with open(data_file, "r", encoding="utf-8") as file:
        # First pass only counts lines so the progress bar can show a total.
        total_lines = sum(1 for _ in file)
        file.seek(0)
        for line in tqdm(file, total=total_lines, desc="Processing lines"):
            try:
                pd.read_csv(StringIO(line), header=None)
            except (pd.errors.ParserError, pd.errors.EmptyDataError):
                # EmptyDataError is what pandas raises for a blank line. It is
                # not a ParserError, so it has to be listed explicitly or a
                # single empty line would abort the whole run.
                skipped_rows += 1
            else:
                valid_rows.append(line)

    print(f"Skipped rows: {skipped_rows}")

    with open(output_file, "w", encoding="utf-8") as out_file:
        out_file.writelines(valid_rows)

# Paths of the raw recipe CSV and of the filtered copy to produce.
data_file = './recipes_data.csv'
output_file = './pure_recipes_data.csv'
# Run the line-by-line filter, then report where the result was written.
save_dataset_with_skip(data_file, output_file)
print(f"Filtered dataset saved to {output_file}")

Processing lines: 100%|██████████| 6705/6705 [00:03<00:00, 2033.19it/s]

Skipped rows: 0
Filtered dataset saved to ./pure_recipes_data.csv





## Make dataset

In [3]:

import pandas as pd

# Load the filtered CSV, give the two time columns identifier-friendly names,
# and replace missing ingredient fields with empty strings.
df = (
    pd.read_csv('pure_recipes_data.csv')
    .rename(columns={
        'prep_time (in mins)': 'prep_time_in_mins',
        'cook_time (in mins)': 'cook_time_in_mins',
    })
    .assign(
        ingredients_name=lambda frame: frame['ingredients_name'].fillna(''),
        ingredients_quantity=lambda frame: frame['ingredients_quantity'].fillna(''),
    )
)

# Build the prompt text for each row from its ingredient names.
df['instruction'] = df['ingredients_name'].map(
    lambda names: f'I have the following ingredients: {names}. Please provide recipes I can make with them.'
)


def format_output(row):
    """Render one recipe record as the multi-line answer text.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys ``name``, ``cuisine``, ``course``, ``diet``,
            ``ingredients_name``, ``ingredients_quantity``,
            ``prep_time_in_mins``, ``cook_time_in_mins`` and ``instructions``.
            The two ingredient fields must be comma-separated strings
            (``''`` when there are none).

    Returns:
        str: The formatted recipe, ending with a newline. Each ingredient is
        listed as ``- name: quantity``; an entry that has only a name or only
        a quantity is listed as ``- <that value>``; entries where both are
        empty are omitted.
    """
    # Local import keeps this block self-contained.
    from itertools import zip_longest

    lines = [
        f"Recipe Name: {row['name']}",
        f"Cuisine: {row['cuisine']}",
        f"Course: {row['course']}",
        f"Diet: {row['diet']}",
        "Ingredients:",
    ]

    ingredient_names = [part.strip() for part in row['ingredients_name'].split(',')]
    ingredient_quantities = [part.strip() for part in row['ingredients_quantity'].split(',')]

    # zip_longest rather than zip: plain zip stops at the shorter list and
    # would silently drop the remaining ingredients when the counts differ.
    for name, qty in zip_longest(ingredient_names, ingredient_quantities, fillvalue=''):
        if name and qty:
            lines.append(f"- {name}: {qty}")
        elif name or qty:
            lines.append(f"- {name or qty}")
        # Both empty (e.g. ''.split(',') == ['']): emit no bullet at all.

    lines.append(f"Preparation Time: {row['prep_time_in_mins']} mins")
    lines.append(f"Cooking Time: {row['cook_time_in_mins']} mins")
    lines.append("Instructions:")
    lines.append(f"{row['instructions']}")
    return "\n".join(lines) + "\n"

# Render the answer text for every row, add an empty 'input' column, and keep
# only the instruction / input / output columns in that order.
df = df.assign(
    output=df.apply(format_output, axis=1),
    input='',
)[['instruction', 'input', 'output']]

# Save the result without writing the index as a column.
df.to_csv('processed_dataset.csv', index=False)


In [4]:
import pandas as pd
# Load the data
df = pd.read_csv('processed_dataset.csv')

# Check the number of records: displaying the 'output' column also shows its length
df['output']

0       Recipe Name: Poosanikai Haddu Sweet Curry Reci...
1       Recipe Name: Ayam Goreng Kremes Recipe - Indon...
2       Recipe Name: One Pot Spinach Macaroni Pasta Re...
3       Recipe Name: Dondakkai / Kovakkai Poriyal Reci...
4       Recipe Name: Himachal Pahari Style Teliah Mah ...
                              ...                        
6699    Recipe Name: Kela Anar Raita Recipe  (Banana P...
6700    Recipe Name: Strawberry Raita Recipe\nCuisine:...
6701    Recipe Name: Papad Raita Recipe\nCuisine: Indi...
6702    Recipe Name: Carrot Orange Walnut Raita (Fruit...
6703    Recipe Name: Vegan Coconut Yogurt Recipe\nCuis...
Name: output, Length: 6704, dtype: object