In [12]:
import pandas as pd
from openai import OpenAI

client = OpenAI()

In [13]:
df = pd.read_csv("../data/data.csv")

In [14]:
documents = df.to_dict(orient="records")
documents[0]

{'id': 0,
 'recipe_name': 'Spaghetti Bolognese',
 'type_of_dish': 'Main Course',
 'main_ingredient': 'Beef',
 'cuisine': 'Italian',
 'cooking_method': 'Simmering',
 'prep_time': '10 minutes',
 'cook_time': '40 minutes',
 'instructions': 'Heat olive oil in a large pan over medium heat. Add finely chopped onions, carrots, and celery. Cook for 5-7 minutes until softened. Add minced beef and cook until browned. Pour in canned tomatoes, beef broth, and Italian herbs. Simmer uncovered for 30-40 minutes. Stir occasionally and season with salt and pepper. Serve over cooked spaghetti and garnish with fresh basil.'}

In [31]:
prompt_template = """
You emulate a user of our Cooking Assistant application.
Formulate 5 questions this user might ask based on a provided recipe.
Make the questions specific to this recipe.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

recipe_name: {recipe_name}
type_of_dish: {type_of_dish}
main_ingredient: {main_ingredient}
cuisine: {cuisine}
cooking_method: {cooking_method}
prep_time: {prep_time}
cook_time: {cook_time}
instructions: {instructions}

Provide the output in parsable JSON without using code blocks:

dict("questions": ["question1", "question2", ..., "question5"])
""".strip()

In [33]:
prompt = prompt_template.format(**documents[0])
print(prompt)

You emulate a user of our Cooking Assistant application.
Formulate 5 questions this user might ask based on a provided recipe.
Make the questions specific to this recipe.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as few words as possible from the record.

The record:

recipe_name: Spaghetti Bolognese
type_of_dish: Main Course
main_ingredient: Beef
cuisine: Italian
cooking_method: Simmering
prep_time: 10 minutes
cook_time: 40 minutes
instructions: Heat olive oil in a large pan over medium heat. Add finely chopped onions, carrots, and celery. Cook for 5-7 minutes until softened. Add minced beef and cook until browned. Pour in canned tomatoes, beef broth, and Italian herbs. Simmer uncovered for 30-40 minutes. Stir occasionally and season with salt and pepper. Serve over cooked spaghetti and garnish with fresh basil.

Provide the output in parsable JSON without using code blocks:

dict("questions": ["question1", "ques

In [34]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [39]:
questions = llm(prompt)

In [40]:
print(questions)

{
  "questions": [
    "How long does it take to prepare the Spaghetti Bolognese?",
    "What type of meat is used in this recipe for Bolognese?",
    "What cooking method is used for the Spaghetti Bolognese?",
    "Which vegetables should I chop for the sauce?",
    "What should I garnish the dish with before serving?"
  ]
}


In [41]:
import json

In [42]:
json.loads(questions)

{'questions': ['How long does it take to prepare the Spaghetti Bolognese?',
  'What type of meat is used in this recipe for Bolognese?',
  'What cooking method is used for the Spaghetti Bolognese?',
  'Which vegetables should I chop for the sauce?',
  'What should I garnish the dish with before serving?']}

In [43]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [44]:
from tqdm.auto import tqdm

In [45]:
results = {}
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

  0%|          | 0/182 [00:00<?, ?it/s]

In [46]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [47]:
df_results = pd.DataFrame(final_results, columns=['id', 'question'])

In [48]:
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)


In [49]:
!head ../data/ground-truth-retrieval.csv

id,question
0,"What should I do after cooking the onions, carrots, and celery?"
0,How long should I simmer the mixture after adding the canned tomatoes?
0,What ingredients do I need to season the Bolognese sauce?
0,How long does it take to prepare the ingredients for this recipe?
0,What type of pasta is recommended to serve with the Bolognese sauce?
1,How long does it take to prepare the ingredients for the chicken curry?
1,What should I do after sautéing the onions until golden brown?
1,How long do I need to simmer the chicken curry for optimal flavor?
1,What dishes are recommended to serve alongside this chicken curry?
