## Check for unparseable CSV rows

In [1]:
import pandas as pd
from io import StringIO
from tqdm import tqdm

def save_dataset_with_skip(data_file, output_file):
    """Copy ``data_file`` to ``output_file``, dropping lines pandas cannot parse.

    Every physical line is parsed on its own with ``pd.read_csv``. Lines that
    parse are written to ``output_file`` unchanged and in their original
    order; the number of dropped lines is printed.

    Args:
        data_file: Path of the UTF-8 text/CSV file to read.
        output_file: Path of the UTF-8 file that receives the kept lines.

    Note:
        Lines are checked in isolation, so a line whose field count differs
        from the other lines is not detected here.
    """
    skipped_rows = 0
    valid_rows = []

    with open(data_file, "r", encoding="utf-8") as file:
        # First pass only counts lines so the progress bar can show a total.
        total_lines = sum(1 for _ in file)
        file.seek(0)
        for line in tqdm(file, total=total_lines, desc="Processing lines"):
            try:
                # Only success/failure matters; the parsed frame is discarded.
                pd.read_csv(StringIO(line), header=None)
            except (pd.errors.ParserError, pd.errors.EmptyDataError):
                # EmptyDataError is what pandas raises for a blank line; it is
                # not a ParserError, so catching only ParserError would abort
                # the whole scan on the first empty line.
                skipped_rows += 1
            else:
                valid_rows.append(line)

    print(f"Skipped rows: {skipped_rows}")

    with open(output_file, "w", encoding="utf-8") as out_file:
        out_file.writelines(valid_rows)

# Source CSV and the destination for the rows that parsed cleanly.
data_file, output_file = './recipes_data.csv', './pure_recipes_data.csv'

save_dataset_with_skip(data_file=data_file, output_file=output_file)
print(f"Filtered dataset saved to {output_file}")

Processing lines: 100%|██████████| 6705/6705 [00:03<00:00, 2033.19it/s]

Skipped rows: 0
Filtered dataset saved to ./pure_recipes_data.csv





## Make dataset

In [1]:

import pandas as pd

# Load the filtered CSV, keep the first 20 rows, normalise the two time
# column names, blank out missing ingredient fields and build the prompt
# text for each row from its ingredient names.
df = (
    pd.read_csv('pure_recipes_data.csv')
    .head(20)
    .rename(columns={
        'prep_time (in mins)': 'prep_time_in_mins',
        'cook_time (in mins)': 'cook_time_in_mins',
    })
    .assign(
        ingredients_name=lambda frame: frame['ingredients_name'].fillna(''),
        ingredients_quantity=lambda frame: frame['ingredients_quantity'].fillna(''),
        # Evaluated after the fillna above, so missing names render as ''.
        instruction=lambda frame: frame['ingredients_name'].map(
            lambda names: f'I have the following ingredients: {names}. Please provide recipes I can make with them.'
        ),
    )
)

def format_output(row):
    """Render one recipe row as the multi-line ``output`` text.

    Args:
        row: Mapping-like row (a DataFrame row or a dict) with the keys
            ``name``, ``cuisine``, ``course``, ``diet``,
            ``ingredients_name``, ``ingredients_quantity`` and
            ``instructions``. The two ingredient fields are comma-separated
            strings.

    Returns:
        str: The header lines, one bullet per ingredient and the
        instructions, each terminated by a newline. An ingredient is written
        as ``- name: quantity``, or as ``- name`` when it has no quantity.
    """
    names = [part.strip() for part in row['ingredients_name'].split(',')]
    quantities = [part.strip() for part in row['ingredients_quantity'].split(',')]
    # Pad the quantities so that ingredients beyond the end of a shorter
    # quantity list are still listed instead of being silently dropped by zip.
    quantities += [''] * (len(names) - len(quantities))

    lines = [
        f"Recipe Name: {row['name']}",
        f"Cuisine: {row['cuisine']}",
        f"Course: {row['course']}",
        f"Diet: {row['diet']}",
        "Ingredients:",
    ]
    for name, qty in zip(names, quantities):
        if not name:
            # An empty ingredient field splits into [''], which would
            # otherwise produce a meaningless "- : " bullet.
            continue
        lines.append(f"- {name}: {qty}" if qty else f"- {name}")

    lines.append(f"Instructions:\n{row['instructions']}")
    return "\n".join(lines) + "\n"

# Add the rendered recipe text and an empty `input` column, keep only the
# three columns of the instruction dataset, then write them out.
df = df.assign(
    output=df.apply(format_output, axis=1),
    input='',
)[['instruction', 'input', 'output']]

df.to_csv('test_processed_dataset.csv', index=False)


In [1]:
import pandas as pd
# Load the dataset and display its row count.
# NOTE(review): the cell that follows reads the columns NER, directions,
# ingredients and title from this frame — confirm this file provides them.
df = pd.read_csv('processed_dataset.csv')
len(df)

6704

In [31]:
import re

# Characters removed from the stringified list columns: square brackets, the
# straight double quote, and the curly double quotes U+201C / U+201D.
_LIST_MARKUP = re.compile(r'[\[\]"\u201c\u201d]')


def _strip_list_markup(column):
    """Return ``column`` as strings with brackets and double quotes removed.

    Missing values become empty strings so that they do not end up as the
    literal text "nan" in the generated prompts and outputs.
    """
    # An explicit regex=True with a compiled pattern avoids relying on the
    # default of `Series.str.replace`, which differs between pandas versions.
    return column.fillna('').astype(str).str.replace(_LIST_MARKUP, '', regex=True)


# Strip brackets and every kind of double quote from NER, directions and
# ingredients.
for list_column in ('NER', 'directions', 'ingredients'):
    df[list_column] = _strip_list_markup(df[list_column])

df_new = pd.DataFrame()
df_new['instruction'] = df['NER'].apply(lambda x: f'I have the following ingredients: {x}. Please provide recipes I can make with them.')
# The input column is left blank.
df_new['input'] = ''

# Build the text for the `output` column.
def format_output(row):
    """Render one recipe row as the multi-line ``output`` text.

    Args:
        row: Mapping-like row (a DataFrame row or a dict) with the keys
            ``title``, ``ingredients`` and ``directions``. ``ingredients``
            is a single comma-separated string.

    Returns:
        str: ``Recipe Name``, an ``Ingredients:`` list with one ``- item``
        bullet per non-empty ingredient, and the ``Instructions:`` text,
        each terminated by a newline.

    Note:
        Ingredients are split on every comma, so an ingredient whose own
        text contains a comma is rendered as several bullets.
    """
    lines = [f"Recipe Name: {row['title']}", "Ingredients:"]

    # Add the elements of the ingredients column one bullet at a time.
    for item in row['ingredients'].split(','):
        item = item.strip()
        if item:
            # Blank pieces (empty field, doubled or trailing comma) would
            # otherwise be written as an empty "- " bullet.
            lines.append(f"- {item}")

    lines.append(f"Instructions:\n{row['directions']}")
    return "\n".join(lines) + "\n"

# Render every recipe row and attach the text as the `output` column.
df_new = df_new.assign(output=df.apply(format_output, axis=1))

# Optional: keep only a 20-row sample before saving.
# df_new = df_new.head(20)
df_new.to_csv('2m_processed_dataset.csv', index=False)


In [11]:
# Preview the first generated example.
# NOTE(review): the stored output of this cell still shows bracketed lists and
# no "Recipe Name:" prefix, so it appears to predate the cleaning cell —
# re-run to refresh it.
df_new.head(1)

Unnamed: 0,instruction,input,output
0,"I have the following ingredients: [""bite size ...",,"[""In a heavy 2-quart saucepan, mix brown sugar..."
