In [4]:
from openai import OpenAI
import os

In [5]:
openai_client = OpenAI()

In [6]:
groq_client = OpenAI( base_url = "https://api.groq.com/openai/v1", api_key = os.environ.get("GROQ_API_KEY"))

In [7]:
response = groq_client.responses.create(
  model= "openai/gpt-oss-20b",
  input= "Write a short bedtime story about a unicorn.",
);


In [8]:
response

Response(id='resp_01kaxjgk76evx9wr66f0e3qbfz', created_at=1764076571.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='openai/gpt-oss-20b', object='response', output=[ResponseReasoningItem(id='resp_01kaxjgk76evxr50mrnf2rpa00', summary=[], type='reasoning', content=[Content(text='User wants: "Write a short bedtime story about a unicorn." We need to produce a short bedtime story about a unicorn. Probably a gentle, soothing narrative, with a unicorn, suitable for bedtime. The user didn\'t specify any other constraints. So produce a short bedtime story, likely 4-8 paragraphs, gentle tone. No disallowed content. We comply.', type='reasoning_text')], encrypted_content=None, status='completed'), ResponseOutputMessage(id='msg_01kaxjgk76evy9a99z1q06xvaj', content=[ResponseOutputText(annotations=[], text='**The Moonlit Meadow and the Gentle Unicorn**\n\nWhen the stars began to twinkle and the moon climbed high above the hills, Lily the little fox curled up in a nest 

In [5]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [6]:
documents[10]

{'text': 'It depends on your background and previous experience with modules. It is expected to require about 5 - 15 hours per week. [source1] [source2]\nYou can also calculate it yourself using this data and then update this answer.',
 'section': 'General course-related questions',
 'question': 'Course - \u200b\u200bHow many hours per week am I expected to spend on this  course?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
from minsearch import AppendableIndex

In [8]:
from minsearch import AppendableIndex

index = AppendableIndex(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.append.AppendableIndex at 0x7d6c1e1952e0>

In [9]:
question = 'I just discovered the course. Can i join it now' 

In [10]:
index.search(query=question,filter_dict={'course': 'data-engineering-zoomcamp'})

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can 

In [41]:

def search(query: str):
    """
    Search the FAQ database for entries matching the given query.

    Args:
        query (str): Search query text to look up in the course FAQ.

    Returns:
        List[Dict[str, Any]]: A list of search result entries, each containing relevant metadata.
    """
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5,
        output_ids=True
    )

    return results

In [12]:
search(query='I just discovered the course. Can i join it now')

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
  'section': 'General course-related questions',
  'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace afte

In [13]:
search_tool = {
    "type": "function",
    "name": "search",
    "description": "Search the FAQ database",
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search query text to look up in the course FAQ."
            }
        },
        "required": ["query"],
        "additionalProperties": False
    }
}

In [15]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

If you want to look up the answer, explain why before making the call. Use as many 
keywords from the user question as possible when making first requests.

Make multiple searches. Try to expand your search by using new keywords based on the results you
get from the search.

At the end, make a clarifying question based on what you presented and ask if there are 
other areas that the user wants to explore.
""".strip()
chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]
response = openai_client.responses.create(
  model= 'gpt-4o-mini',
  input= chat_messages,
  tools=[search_tool]
);

In [17]:
response

Response(id='resp_0b7a3c517100453600692581e39b7c819399046df4584113fd', created_at=1764065763.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"Can I join the course now?"}', call_id='call_J00U3J6GqKKQZuZ4RYfxPj30', name='search', type='function_call', id='fc_0b7a3c517100453600692581e4c13081939a12b4f987ea3c13', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[FunctionTool(name='search', parameters={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search query text to look up in the course FAQ.'}}, 'required': ['query'], 'additionalProperties': False}, strict=True, type='function', description='Search the FAQ database')], top_p=1.0, background=False, conversation=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=None, prompt_cache_reten

In [55]:
import json
call = response.output[0]

results = search("join course")
result_json = json.dumps(results, indent=2)

chat_messages.append({
    "type": "function_call_output",
    "call_id": call.id,
    "output": result_json,
})

In [59]:
chat_messages

[{'role': 'user',
  'content': 'I just discovered the course. Can i join it now'},
 {'type': 'function_call_output',
  'call_id': 'resp_01kax4qcvbfr1v4hmmebkp6cfq',
  'output': '[\n  {\n    "text": "Yes, even if you don\'t register, you\'re still eligible to submit the homeworks.\\nBe aware, however, that there will be deadlines for turning in the final projects. So don\'t leave everything for the last minute.",\n    "section": "General course-related questions",\n    "question": "Course - Can I still join the course after the start date?",\n    "course": "data-engineering-zoomcamp"\n  },\n  {\n    "text": "The purpose of this document is to capture frequently asked technical questions\\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \\u201cOffice Hours\'\' live.1\\nSubscribe to course public Google Calendar (it works from Desktop only).\\nRegister before the course starts using this link.\\nJoin the course Telegram channel wi

In [20]:
import json
def make_call(call):
    args = json.loads(call.arguments)
    f_name = call.name
    f = globals()[f_name]
    result = f(**args)
    result_json = json.dumps(result, indent=2)
    return {
        "type": "function_call_output",
        "call_id": call.call_id,
        "output": result_json,
    }

In [21]:
for entry in response.output:
    chat_messages.append(entry)

    if entry.type == 'message':
        print('assistant')
        print(entry.content[0].text)

    if entry.type == 'function_call':
        print('function call:', entry.name, entry.arguments)
        result = make_call(entry)
        print('results:', result['output'][:50], '...')
        chat_messages.append(result)

    print()

function call: search {"query":"Can I join the course now?"}
results: [
  {
    "text": "Yes, even if you don't register ...



In [24]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

If you want to look up the answer, explain why before making the call. Use as many 
keywords from the user question as possible when making first requests.

Make multiple searches. Try to expand your search by using new keywords based on the results you
get from the search.

At the end, make a clarifying question based on what you presented and ask if there are 
other areas that the user wants to explore.
""".strip()
chat_messages = [
    {"role": "developer", "content": developer_prompt},
    {"role": "user", "content": question}
]

response = openai_client.responses.create(
    model='gpt-4o-mini',
    input=chat_messages,
    tools=[search_tool]
)

In [25]:
response

Response(id='resp_02c2a5e270f8fc4e0069258289ee348198970bcf105a4ece3a', created_at=1764065929.0, error=None, incomplete_details=None, instructions=None, metadata={}, model='gpt-4o-mini-2024-07-18', object='response', output=[ResponseFunctionToolCall(arguments='{"query":"Can I join the course now"}', call_id='call_361e6pD3r3ZHQIubAs4I1eLK', name='search', type='function_call', id='fc_02c2a5e270f8fc4e006925828ba3488198a92963f59619841a', status='completed')], parallel_tool_calls=True, temperature=1.0, tool_choice='auto', tools=[FunctionTool(name='search', parameters={'type': 'object', 'properties': {'query': {'type': 'string', 'description': 'Search query text to look up in the course FAQ.'}}, 'required': ['query'], 'additionalProperties': False}, strict=True, type='function', description='Search the FAQ database')], top_p=1.0, background=False, conversation=None, max_output_tokens=None, max_tool_calls=None, previous_response_id=None, prompt=None, prompt_cache_key=None, prompt_cache_retent

In [30]:
developer_prompt = """
You're a course teaching assistant. 
You're given a question from a course student and your task is to answer it.

If you want to look up the answer, explain why before making the call. Use as many 
keywords from the user question as possible when making first requests.

Make multiple searches. Try to expand your search by using new keywords based on the results you
get from the search.

At the end, make a clarifying question based on what you presented and ask if there are 
other areas that the user wants to explore.
""".strip()
chat_messages = [
    {"role": "developer", "content": developer_prompt}
]

client = openai_client
tools = [

while True: # main Q&A loop
    question = input() # How do I do my best for module 1?
    if question == 'stop':
        break

    message = {"role": "user", "content": question}
    chat_messages.append(message)

    while True: # request-response loop - query API till get a message
        response = client.responses.create(
            model='gpt-4o-mini',
            input=chat_messages,
            tools=tools
        )

        has_tool_calls = False
        
        for entry in response.output:
            chat_messages.append(entry)
        
            if entry.type == 'function_call':      
                print('function_call:', entry)
                print()
                result = make_call(entry)
                chat_messages.append(result)
                has_tool_calls = True

            elif entry.type == 'message':
                print(entry.content[0].text)
                print()

        if not has_tool_calls:
            break

 I just discovered the course. Can i join it now


NameError: name 'tools' is not defined

In [31]:
from toyaikit.llm import OpenAIClient
from toyaikit.tools import Tools
from toyaikit.chat import IPythonChatInterface
from toyaikit.chat.runners import OpenAIResponsesRunner
from toyaikit.chat.runners import DisplayingRunnerCallback

In [32]:
agent_tools = Tools()
agent_tools.add_tool(search)

chat_interface = IPythonChatInterface()

runner = OpenAIResponsesRunner(
    tools=agent_tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    llm_client=OpenAIClient()
)


In [34]:
result = runner.run()

You: HOw dO i install kafka?


-> Response received


-> Response received


-> Response received


You: STOP


Chat ended.


In [35]:
result.cost

CostInfo(input_cost=Decimal('0.00048795'), output_cost=Decimal('0.000297'), total_cost=Decimal('0.00078495'))

In [36]:
pip install pydantic-ai

Collecting pydantic-ai
  Downloading pydantic_ai-1.22.0-py3-none-any.whl.metadata (14 kB)
Collecting pydantic-ai-slim==1.22.0 (from pydantic-ai-slim[ag-ui,anthropic,bedrock,cli,cohere,evals,fastmcp,google,groq,huggingface,logfire,mcp,mistral,openai,retries,temporal,ui,vertexai]==1.22.0->pydantic-ai)
  Downloading pydantic_ai_slim-1.22.0-py3-none-any.whl.metadata (6.2 kB)
Collecting griffe>=1.3.2 (from pydantic-ai-slim==1.22.0->pydantic-ai-slim[ag-ui,anthropic,bedrock,cli,cohere,evals,fastmcp,google,groq,huggingface,logfire,mcp,mistral,openai,retries,temporal,ui,vertexai]==1.22.0->pydantic-ai)
  Downloading griffe-1.15.0-py3-none-any.whl.metadata (5.2 kB)
Collecting opentelemetry-api>=1.28.0 (from pydantic-ai-slim==1.22.0->pydantic-ai-slim[ag-ui,anthropic,bedrock,cli,cohere,evals,fastmcp,google,groq,huggingface,logfire,mcp,mistral,openai,retries,temporal,ui,vertexai]==1.22.0->pydantic-ai)
  Downloading opentelemetry_api-1.38.0-py3-none-any.whl.metadata (1.5 kB)
Collecting pydantic-graph

In [37]:
search_tool

{'type': 'function',
 'name': 'search',
 'description': 'Search the FAQ database',
 'parameters': {'type': 'object',
  'properties': {'query': {'type': 'string',
    'description': 'Search query text to look up in the course FAQ.'}},
  'required': ['query'],
  'additionalProperties': False}}

In [47]:
from pydantic_ai import Agent
search_agent = Agent(
    model='openai:gpt-4o-mini',
    instructions=developer_prompt,
    tools=[search],
)

In [48]:
result = await search_agent.run(user_prompt=question)

In [49]:
print(result.output)

Yes, you can still join the Data Engineering Zoomcamp course, even though it may have already started. You are able to start learning and submitting homework without needing to officially register, as the submissions aren't checked against a registered list. However, it's important to keep in mind that there are deadlines for turning in final projects, so you should try to stay on top of those.

To get the most out of the course, I recommend registering through the provided links as soon as possible. This will allow you to access materials and stay updated with any announcements related to the course.

If you have any specific concerns or areas related to joining the course you'd like to delve deeper into, please let me know! Is there anything specific regarding materials, deadlines, or getting started that you want to discuss further?


In [50]:
result.usage()

RunUsage(input_tokens=2850, output_tokens=199, details={'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, requests=3, tool_calls=2)

In [52]:
calc_price(result.udate(),'gpt-4o-mini','openai')

NameError: name 'calc_price' is not defined

In [53]:
from toyaikit.chat.runners import PydanticAIRunner

In [55]:
runner = PydanticAIRunner(
        chat_interface=chat_interface,
        agent=search_agent)

In [56]:
await runner.run()

You: hOw tO install kafka?


AttributeError: 'list' object has no attribute 'replace'

In [57]:
def add_entry(question: str, answer: str) -> None:
    """
    Add a new entry to the FAQ database.

    Args:
        question (str): The question to be added to the FAQ database.
        answer (str): The corresponding answer to the question.
    """
    doc = {
        'question': question,
        'text': answer,
        'section': 'user added',
        'course': 'data-engineering-zoomcamp'
    }
    index.append(doc)

In [58]:
agent_tools.add_tool(add_entry)

In [59]:
chat_interface = IPythonChatInterface()

runner = OpenAIResponsesRunner(
    tools=agent_tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    llm_client=OpenAIClient()
)

In [60]:
result = runner.run();

You: HOW DO I DO WELL IN MODULE 1?


-> Response received


-> Response received


-> Response received


You: DOCKER


-> Response received


-> Response received


You: add this back tO FAQ


-> Response received


-> Response received


You: STOP


Chat ended.


In [61]:
index.docs[-1]

{'question': 'How do I do well with Docker in Module 1?',
 'text': "1. **Set Up Your Environment**: Ensure Docker is properly installed and configured on your machine. Follow the course instructions precisely to set up the Docker environment.\n\n2. **Familiarize with Docker Commands**: Learn about essential commands like `docker build`, `docker run`, and `docker-compose`. Understanding how to use these commands is crucial for managing your Docker containers effectively.\n\n3. **Resolve Common Errors**: Be prepared to troubleshoot common issues. For example, if you see an error like `ModuleNotFoundError: No module named 'psycopg2'`, ensure that you install the required package using:\n   ```bash\n   pip install psycopg2-binary\n   ```\n\n4. **Use Dockerfile and docker-compose.yml**: Understand how to write and modify your Dockerfile and docker-compose.yaml files. These files define the environment and dependencies necessary for your application.\n\n5. **Check Logs for Debugging**: If yo

In [62]:
index.search("How do i do in Module 1")

[{'text': 'Problem description\nProject structure:\n/sources/production/model_service.py\n/sources/tests/unit_tests/test_model_service.py (“from production.model_service import ModelService)\ngit commit -t ‘test’ raises ‘No module named ‘production’’ when calling pytest hook\n- repo: local\nhooks:\n- id: pytest-check\nname: pytest-check\nentry: pytest\nlanguage: system\npass_filenames: false\nalways_run: true\nargs: [\n"tests/"\n]\nSolution description\nUse this hook instead:\n- repo: local\nhooks:\n- id: pytest-check\nname: pytest-check\nentry: "./sources/tests/unit_tests/run.sh"\nlanguage: system\ntypes: [python]\npass_filenames: false\nalways_run: true\nAnd make sure that run.sh sets the right directory and run pytest:\ncd "$(dirname "$0")"\ncd ../..\nexport PYTHONPATH=.\npipenv run pytest ./tests/unit_tests\nAdded by MarcosMJD',
  'section': 'Module 6: Best practices',
  'question': 'Pytest error ‘module not found’ when using pre-commit hooks if using custom packages in the source 

In [65]:
from typing import List, Dict, Any
class SearchTools:

    def __init__(self, index):
        self.index = index

    def search(self, query: str) -> List[Dict[str, Any]]:
        """
        Search the FAQ database for entries matching the given query.
    
        Args:
            query (str): Search query text to look up in the course FAQ.
    
        Returns:
            List[Dict[str, Any]]: A list of search result entries, each containing relevant metadata.
        """
        boost = {'question': 3.0, 'section': 0.5}
    
        results = self.index.search(
            query=query,
            filter_dict={'course': 'data-engineering-zoomcamp'},
            boost_dict=boost,
            num_results=5,
            output_ids=True
        )
    
        return results

    def add_entry(self, question: str, answer: str) -> None:
        """
        Add a new entry to the FAQ database.
    
        Args:
            question (str): The question to be added to the FAQ database.
            answer (str): The corresponding answer to the question.
        """
        doc = {
            'question': question,
            'text': answer,
            'section': 'user added',
            'course': 'data-engineering-zoomcamp'
        }
        self.index.append(doc)

In [67]:
search_tools = SearchTools(index)

agent_tools = Tools()
agent_tools.add_tools(search_tools)

In [68]:
from toyaikit.tools import get_instance_methods

In [70]:
get_instance_methods(search_tools)

[<bound method SearchTools.add_entry of <__main__.SearchTools object at 0x7d6c19ae9a30>>,
 <bound method SearchTools.search of <__main__.SearchTools object at 0x7d6c19ae9a30>>]

In [2]:
chat_interface = IPythonChatInterface()

runner = OpenAIResponsesRunner(
    tools=agent_tools,
    developer_prompt=developer_prompt,
    chat_interface=chat_interface,
    llm_client=OpenAIClient()
)

NameError: name 'IPythonChatInterface' is not defined

In [3]:
runner.run()

NameError: name 'runner' is not defined