# Variables

In [1]:
OPENAI_API_KEY='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'
FIRECRAWL_API_KEY='fc-e36282d62d1c4ee8971ec0649f3862c7'
BASE_URL='https://api.groq.com/openai/v1'
MODEL="meta-llama/llama-4-scout-17b-16e-instruct"

# Import Modules

In [2]:
from typing import TypedDict, Annotated
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langgraph.graph import StateGraph, START, END
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage, AIMessage
from langchain.agents import create_tool_calling_agent, AgentExecutor
from pydantic import BaseModel, Field
from langchain.tools import Tool
from firecrawl import FirecrawlApp
import pandas as pd

# Testing

In [3]:
df_task = pd.read_csv("/mnt/g/Ai agents course/LangGraph/Challenge2/data/ISE - AutoCode Challenge 2 Public - Public task.csv")
df_model = pd.read_csv("/mnt/g/Ai agents course/LangGraph/Challenge2/data/ISE - AutoCode Challenge 2 Public - Model zoo.csv")

In [4]:
task = df_task["Task"][2]

In [5]:
def format_model_list(df):
    model_strings = []
    for i, row in df.iterrows():
        desc = str(row[1]).replace("{", "{{").replace("}", "}}")
        url = str(row[2])
        model_strings.append(f"{i + 1} - {desc} [More info]({url})")
    return "\n".join(model_strings)


## Tools

In [6]:
def web_scraper(url: str) -> str:
    """Get more details about the model using the model url"""
    scraper = FirecrawlApp(api_key=FIRECRAWL_API_KEY)
    scrape_status = scraper.scrape_url(
        url,
        formats=['markdown']
    )
    return scrape_status.markdown

In [7]:
class WebScraperParameters(BaseModel):
    url: str

In [8]:
res = web_scraper('https://huggingface.co/thanhtlx/image_classification_01')

In [9]:
tools = [
    Tool(
        name="web_scrapper",
        func=web_scraper,
        description="Use to get more detail about the model",
        args_schema=WebScraperParameters
    )
]

## Model

In [None]:
llm = ChatOpenAI( 
    base_url=BASE_URL,
    model=MODEL
)

In [11]:
class Tasks(BaseModel):
    subtask_one: str = Field(
        ...,
        description="Detailed description of task 1"
    )
    subtask_two: str = Field(
        ...,
        description="Detailed description of task 2"
    )
    subtask_three: str = Field(
        ...,
        description="Detailed description of task 3"
    )

In [12]:
class AgentState(TypedDict):
    task: HumanMessage | None
    data: str | None
    messages: Annotated[BaseMessage, add_messages]
    subtasks: Tasks

In [13]:
def classify_agent(state: AgentState) -> AgentState:
    structured_llm = llm.with_structured_output(Tasks)
    system_prompt = """
                    You are an expert in task formulation. Given a machine learning task, you will
                    help divide the task into these subtasks:
                    - Subtask1: for classifying the problem type and task description
                    - Subtask2: for identifying the input of the problem
                    - Subtask3: for identifying the output of the problem
                    describe the each subtask as specifically as possible
                    """
    messages = [SystemMessage(content=system_prompt)]
    messages.append(state['task'])
    result = structured_llm.invoke(messages)
    state['subtasks'] = result
    return state

In [14]:
state = {
    "task": HumanMessage(content=task),
}

In [15]:
state = classify_agent(state)

In [None]:
def modeling_agent(state: AgentState) -> AgentState:
    system_prompt = f"""
    You are an expert in machine learning. Given a task, choose the best model
    from this list:

    {format_model_list(df_model)}

    Then, use tools to get more information about the selected model
    (description, input format, output format, sample code, ...).
    When done, summarize everything in your final answer.
    """

    prompt = ChatPromptTemplate.from_messages([
        ("system", system_prompt),
        ("ai", "{agent_scratchpad}"),
        ("human", "{input}")
    ])

    agent = create_tool_calling_agent(llm=llm, tools=tools, prompt=prompt)
    executor = AgentExecutor(agent=agent, tools=tools, verbose=True)

    task_input = state["task"].content if isinstance(state["task"], SystemMessage) else state["task"]

    result = executor.invoke({"input": task_input})

    return {"output": result["output"]}

In [None]:
res = modeling_agent(state)

  desc = str(row[1]).replace("{", "{{").replace("}", "}}")
  url = str(row[2])




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `web_scrapper` with `{'url': 'https://huggingface.co/zhaospei/Model_3'}`
responded: To solve this problem, we need to select a suitable model from the given list and then gather more information about the selected model.

The task involves regression to predict the median house value of areas in California based on features like median income, average house age, average number of rooms, etc. The goal is to categorize the house prices into three groups: "low", "medium", and "high".

Given the nature of the task, which involves regression and classification based on multiple features, we should look for models that can handle regression tasks and potentially be adapted for classification or have a straightforward way to output categorical values.

Let's consider the models:

1. Nhận diện ảnh chữ viết tay - Not suitable for this task.
2. Phân loại văn bản thành các topic - Not suitable for this task.
3. Dự đoán giá tr

In [None]:
import pprint
pprint.pprint(res)

{'output': 'Based on the task description, I will select Model3 since it is '
           'directly relevant to predicting the median house value in '
           'California districts.\n'
           '\n'
           "Let's summarize the information about Model3:\n"
           '\n'
           '**Model3: Dự đoán giá nhà ở các quận Cali (MLP)**\n'
           '\n'
           '*   **Task**: Dự đoán giá nhà dựa trên các đặc trưng đầu vào dạng '
           'bảng (tabular regression).\n'
           '*   **Input**:\n'
           '    *   Số chiều: `[batch_size, 8]`\n'
           '    *   Kiểu dữ liệu: `torch.FloatTensor`\n'
           '    *   Các đặc trưng:\n'
           '        *   `MedInc` – Thu nhập trung vị\n'
           '        *   `HouseAge` – Tuổi trung bình của căn nhà\n'
           '        *   `AveRooms` – Số phòng trung bình\n'
           '        *   `AveBedrms` – Số phòng ngủ trung bình\n'
           '        *   `Population` – Dân số\n'
           '        *   `AveOccup` – Số ngư

In [None]:
print(res["output"])


Based on the task description, I will select Model3 since it is directly relevant to predicting the median house value in California districts.

Let's summarize the information about Model3:

**Model3: Dự đoán giá nhà ở các quận Cali (MLP)**

*   **Task**: Dự đoán giá nhà dựa trên các đặc trưng đầu vào dạng bảng (tabular regression).
*   **Input**:
    *   Số chiều: `[batch_size, 8]`
    *   Kiểu dữ liệu: `torch.FloatTensor`
    *   Các đặc trưng:
        *   `MedInc` – Thu nhập trung vị
        *   `HouseAge` – Tuổi trung bình của căn nhà
        *   `AveRooms` – Số phòng trung bình
        *   `AveBedrms` – Số phòng ngủ trung bình
        *   `Population` – Dân số
        *   `AveOccup` – Số người trung bình trên mỗi hộ
        *   `Latitude` – Vĩ độ
        *   `Longitude` – Kinh độ
*   **Output**:
    *   Kiểu: `torch.FloatTensor` có shape `[batch_size, 1]`
    *   Ý nghĩa: Một giá trị thực duy nhất dự đoán giá nhà trung bình cho mỗi mẫu.

To adapt Model3 for the specific requireme