In [2]:
!pip install pdfplumber fitz frontend openpyxl mistralai python-dotenv


Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1


In [18]:
from dataclasses import dataclass
from __future__ import annotations
from typing import Optional, List

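We define the `Node` class. Each node stores:
- the code and name of the node
- the description of the node
- the children of the node
- optional metadata: level, parent code, implementation rule and include/exclude notes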

In [19]:
@dataclass
class Node:
    """One entry of the classification tree.

    Attributes:
        code: Identifier of the entry (used for lookups and navigation).
        name: Short label of the entry.
        desc: Longer description of the entry.
        level: Depth of the entry in the hierarchy.
        parent_code: Code of the parent entry, or ``None`` when there is none.
        implementation_rule: Optional implementation rule text.
        includes: Optional "includes" explanatory note.
        includes_also: Optional "includes also" explanatory note.
        excludes: Optional "excludes" explanatory note.
        children: Child nodes, or ``None`` until the first child is added.
    """
    code: str
    name: str
    desc: str
    level: int
    parent_code: Optional[str] = None
    implementation_rule: Optional[str] = None
    includes: Optional[str] = None
    includes_also: Optional[str] = None
    excludes: Optional[str] = None
    # Quoted forward reference (like the method annotations below) so the class
    # definition does not depend on postponed evaluation of annotations.
    children: Optional[List['Node']] = None

    def get_name(self) -> str:
        """Return the node's name."""
        return self.name

    def get_desc(self) -> str:
        """Return the node's description."""
        return self.desc

    def has_children(self) -> bool:
        """Return True when the node has at least one child."""
        return self.children is not None and len(self.children) > 0

    def get_children(self) -> Optional[List['Node']]:
        """Return the list of children, or ``None`` if none was ever added."""
        return self.children

    def add_child(self, child: 'Node'):
        """Append ``child`` to this node, creating the children list on first use."""
        if self.children is None:
            self.children = []
        self.children.append(child)

    def count_descendants(self) -> int:
        """Return the number of nodes strictly below this one (children, grandchildren, ...)."""
        return sum(1 + child.count_descendants() for child in self.children or [])

    def find_by_code(self, target_code: str) -> Optional['Node']:
        """Depth-first search for the node whose code equals ``target_code``.

        Returns:
            The first matching node in this subtree (this node included),
            or ``None`` when no node carries that code.
        """
        if self.code == target_code:
            return self
        for child in self.children or []:
            result = child.find_by_code(target_code)
            if result is not None:
                return result
        return None

    def pretty(self, indent: int = 0) -> str:
        """Render the subtree as text, one ``code: name`` line per node.

        Args:
            indent: Nesting depth of this node; each level adds two spaces.
        """
        pad = "  " * indent
        lines = [f"{pad}{self.code}: {self.name}"]
        if self.children is not None:
            lines.extend(child.pretty(indent + 1) for child in self.children)
        return "\n".join(lines)


We build the graph of the NACE classification. Reference document: https://ec.europa.eu/eurostat/documents/3859598/5902521/KS-RA-07-015-EN.PDF.pdf/dd5443f5-b886-40e4-920d-9df03590ff91?t=1414781457000

In [20]:
import requests

def download_pdf(url, filename, timeout=30):
    """Download the file at ``url`` and save it as ``filename``.

    Args:
        url: Address of the document to fetch.
        filename: Destination path; missing parent directories are created.
        timeout: Seconds ``requests.get`` waits for the server, so a stalled
            connection cannot hang the notebook indefinitely.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    from pathlib import Path  # local import keeps this cell self-contained

    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Check if download was successful

    destination = Path(filename)
    # Writing would fail with FileNotFoundError if the target folder is missing.
    destination.parent.mkdir(parents=True, exist_ok=True)
    destination.write_bytes(response.content)

    print(f"PDF downloaded as: {filename}")
    

In [21]:
# Address of the reference PDF to download.
url = "https://ec.europa.eu/eurostat/documents/3859598/5902521/KS-RA-07-015-EN.PDF.pdf/dd5443f5-b886-40e4-920d-9df03590ff91?t=1414781457000"
# Destination path, relative to the notebook's working directory.
file_path = "code_naf_grpo/NACE.pdf"
# Fetch the PDF and write it to file_path.
download_pdf(url, file_path)


PDF downloaded as: code_naf_grpo/NACE.pdf


We download the data from ShowVoc: https://showvoc.op.europa.eu/#/datasets/ESTAT_Statistical_Classification_of_Economic_Activities_in_the_European_Community_Rev._2.1._%28NACE_2.1%29/data

The file we use is: NACE_Rev2.1_Structure_Explanatory_Notes_EN.xlsx

In [28]:
import openpyxl
import pandas as pd

In [29]:
class NACEBuilder():
    """Builds a tree of ``Node`` objects from the NACE structure Excel file.

    Attributes:
        excel_path: Default workbook path used by ``build_from_excel``.
        nodes: Mapping ``code -> Node`` filled by the last build.
        root: Root of the tree produced by the last build (``None`` before).
    """

    def __init__(self, excel_path: str):
        self.excel_path = excel_path
        self.nodes = {}
        self.root = None

    @staticmethod
    def _optional(value):
        """Return ``value``, or ``None`` when pandas flags the cell as missing."""
        return value if pd.notna(value) else None

    def build_from_excel(self, filename: Optional[str] = None):
        """Build NACE graph from your Excel file.

        Args:
            filename: Workbook to read; defaults to ``self.excel_path``.
                The sheet must provide the columns CODE, NAME, LEVEL,
                PARENT_CODE, Includes, IncludesAlso, Excludes and
                Implementation_rule.

        Returns:
            The root node: the single node without parent when there is
            exactly one, otherwise an artificial ``NACE`` node holding all
            parentless nodes as children.
        """
        file_to_use = filename if filename is not None else self.excel_path
        # Load Excel
        df = pd.read_excel(file_to_use)
        print(f"✅ Loaded {len(df)} entries")

        # Start from a clean slate: without this, nodes from a previous build
        # would survive in self.nodes and leak into the new tree.
        self.nodes = {}
        self.root = None

        # Create all nodes
        for _, row in df.iterrows():
            parent_code = self._optional(row['PARENT_CODE'])
            node = Node(
                code=str(row['CODE']),
                name=row['NAME'],
                desc=row['NAME'],  # Using name as desc
                level=int(row['LEVEL']),
                # Normalised to str like `code`, since it is used as a key into self.nodes.
                parent_code=str(parent_code) if parent_code is not None else None,
                includes=self._optional(row['Includes']),
                includes_also=self._optional(row['IncludesAlso']),
                excludes=self._optional(row['Excludes']),
                implementation_rule=self._optional(row['Implementation_rule']),
            )
            self.nodes[node.code] = node

        # Build relationships
        orphans = []
        for node in self.nodes.values():
            if not node.parent_code:
                continue
            parent = self.nodes.get(node.parent_code)
            if parent is None:
                # The declared parent is not in the sheet: the node cannot be attached.
                orphans.append(node.code)
            else:
                parent.add_child(node)
        if orphans:
            print(f"⚠️ {len(orphans)} nodes reference an unknown parent and were not attached: {orphans[:10]}")

        # Find root
        roots = [n for n in self.nodes.values() if not n.parent_code]

        if len(roots) == 1:
            self.root = roots[0]
        else:
            # Create artificial root
            self.root = Node(
                code='NACE',
                name='NACE Rev. 2.1',
                desc='Statistical Classification of Economic Activities',
                level=0
            )
            for root in roots:
                self.root.add_child(root)

        print(f"✅ Built graph with {len(self.nodes)} nodes")
        return self.root

In [30]:
# Path of the structure workbook, relative to the notebook's working directory.
excel_path = "code_naf_grpo/NACE_Rev2.1_Structure_Explanatory_Notes_EN.xlsx"
builder = NACEBuilder(excel_path)
# The path is passed again explicitly; build_from_excel would otherwise fall back to builder.excel_path.
nace_root = builder.build_from_excel(excel_path)

✅ Loaded 1047 entries
✅ Built graph with 1047 nodes


In [31]:
# Print the whole tree as indented "code: name" lines (one line per node).
print(nace_root.pretty())

NACE: NACE Rev. 2.1
  A: AGRICULTURE, FORESTRY AND FISHING
    01: Crop and animal production, hunting and related service activities
      01.1: Growing of non-perennial crops
        01.11: Growing of cereals, other than rice, leguminous crops and oil seeds
        01.12: Growing of rice
        01.13: Growing of vegetables and melons, roots and tubers
        01.14: Growing of sugar cane
        01.15: Growing of tobacco
        01.16: Growing of fibre crops
        01.19: Growing of other non-perennial crops
      01.2: Growing of perennial crops
        01.21: Growing of grapes
        01.22: Growing of tropical and subtropical fruits
        01.23: Growing of citrus fruits
        01.24: Growing of pome fruits and stone fruits
        01.25: Growing of other tree and bush fruits and nuts
        01.26: Growing of oleaginous fruits
        01.27: Growing of beverage crops
        01.28: Growing of spices, aromatic, drug and pharmaceutical crops
        01.29: Growing of other pere

Now, we create a navigator to move through this tree.

In [32]:
class TreeNavigator:
    """Cursor over a tree: keeps the root and the node currently being visited."""

    def __init__(self, root: Node):
        # The walk starts at the root, so both references point to the same node.
        self.root = self.current = root

# Notebook-level navigator: the tool functions read and update `navigator.current`.
navigator = TreeNavigator(nace_root)

In [134]:
def get_node_info() -> str:
    """
    Describe the node the navigator is currently positioned on.

    Returns:
        The ``str()`` rendering of a dict with the node's code, name,
        description, level, parent code, includes / includes_also / excludes
        notes and the codes of its children (``None`` when the node has no
        children list).
    """
    node = navigator.current
    # Only the child codes are exposed, not the child nodes themselves.
    child_codes = None if node.children is None else [c.code for c in node.children]
    result = {
        "code": node.code,
        "name": node.name,
        "desc": node.desc,
        "level": node.level,
        "parent_code": node.parent_code,
        "includes": node.includes,
        "includes_also": node.includes_also,
        "excludes": node.excludes,
        "children": child_codes,
    }

    return str(result)
    
def go_down(child_code: str) -> str:
    """Move the navigator to the child of the current node whose code is ``child_code``.

    Returns:
        A confirmation message naming the node moved to.

    Raises:
        ValueError: If the current node has no child with that code
            (including when it has no children at all).
    """
    # A leaf keeps `children` as None; treat it as an empty list so the lookup
    # ends with the ValueError below instead of a TypeError.
    for c in navigator.current.children or []:
        if c.code == child_code:
            navigator.current = c
            return f"Moved to {c.code}: {c.name}"
    raise ValueError(f"{child_code} is not a child of {navigator.current.code}")

def go_up() -> str:
    """Move the navigator to the parent of the current node.

    Returns:
        A confirmation message naming the parent, or ``"Already at root."``
        when the navigator is already on the root node.
    """
    current = navigator.current
    if current is navigator.root:
        return "Already at root."

    parent = None
    if current.parent_code is not None:
        # find_by_code is a Node method, so it is called on the root node.
        parent = navigator.root.find_by_code(current.parent_code)
    if parent is None:
        # Nodes without a resolvable parent_code (e.g. top-level nodes attached
        # to an artificial root) sit directly under the navigator's root.
        parent = navigator.root

    navigator.current = parent
    return f"Moved up to {parent.code}: {parent.name}"


We declare these 3 tools as function-calling schemas (the `tools` list passed to the chat API).

In [137]:
# Function-calling schemas, one per tool. Every "parameters" object carries both
# "properties" and "required" so the three declarations have the same shape.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_node_info",
            "description": "Get details of the current node.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "go_down",
            "description": "Move to a child node by code.",
            "parameters": {
                "type": "object",
                "properties": {
                    "child_code": {
                        "type": "string",
                        "description": "Code of the child to move to, as listed in the 'children' field returned by get_node_info.",
                    }
                },
                "required": ["child_code"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "go_up",
            "description": "Move to the parent node.",
            "parameters": {
                "type": "object",
                "properties": {},
                "required": [],
            },
        },
    },
]


In [138]:
# Dispatch table: tool name requested by the model -> Python function implementing it.
names_to_functions = dict(
    get_node_info=get_node_info,
    go_down=go_down,
    go_up=go_up,
)

Let's connect to Mistral.

In [146]:
import os
from mistralai import Mistral
from dotenv import load_dotenv 
import json


# Load variables from a local .env file into the process environment, if present.
load_dotenv()

# Raises KeyError when the key is not configured, which is preferable to a
# late authentication failure.
api_key = os.environ["MISTRAL_API_KEY"]
model = "mistral-large-latest"

company_description = "[My company sells potatoes]"
# Upper bound on the number of model turns in the navigation loop.
tries_number = 500
messages = [{
        "role": "system",
        "content": (
            "You are an autonomous assistant that finds the most relevant NACE classification "
            "leaf for a given company description.\n"
            "You have three tools:\n"
            # get_node_info takes no argument: it always describes the current node.
            " • get_node_info():       see details and children of the current node.\n"
            " • go_down(child_code):   move to a child node by code.\n"
            " • go_up():               move back to the parent node.\n\n"
            "Goal: navigate the tree from the root until you reach the single best leaf node.\n"
            "If you do not ask for a tool, I will consider that your current node corresponds to the answer.\n"
            f"You have the right to use {tries_number} times tool calls"
        ),
    },
    {
        "role": "user",
        "content": (
            f"Company description: {company_description}\n"
            "Start from the root of the NACE tree. "
            "Use the tools step-by-step to inspect nodes and descend "
            "until you are confident you have reached the most specific leaf."
        ),
    },
]

client = Mistral(api_key=api_key)
# Display the initial conversation as the cell output.
messages

[{'role': 'system',
  'content': 'You are an autonomous assistant that finds the most relevant NACE classification leaf for a given company description.\nYou have three tools:\n • get_node_info(code?):  see details and children of a node.\n • go_down(child_code):   move to a child node by code.\n • go_up():               move back to the parent node.\n\nGoal: navigate the tree from the root until you reach the single best leaf node.\nIf you do not ask for a tool, I will consider that your actual node correspond to the answer.`\nYou have the right to use 500 times tool calls'},
 {'role': 'user',
  'content': 'Company description: [My company sells potatoes]\nStart from the root of the NACE tree. Use the tools step-by-step to inspect nodes and descend until you are confident you have reached the most specific leaf.'}]

In [None]:
# Navigation loop: at most `tries_number` model turns. Each turn either runs the
# requested tool(s) and feeds the result back, or ends the loop when the model
# answers without calling a tool.
for _ in range(tries_number):
    # Ask the model. "auto" lets it choose between calling a tool and answering
    # in plain text; with "any" every turn is forced to be a tool call, so the
    # final-answer branch below could never be reached.
    response = client.chat.complete(
        model=model,
        messages=messages,
        tools=tools,
        tool_choice="auto",
        parallel_tool_calls=False,
    )

    message = response.choices[0].message
    messages.append(message)

    if not message.tool_calls:
        # Model gave a final answer
        break

    # The model requested one or more tools
    for call in message.tool_calls:
        name = call.function.name
        raw_args = call.function.arguments or "{}"
        try:
            # Arguments normally arrive as a JSON string; accept an already-decoded mapping too.
            args = json.loads(raw_args) if isinstance(raw_args, str) else raw_args

            if name == "get_node_info":
                result = get_node_info()
            elif name == "go_down":
                result = go_down(**args)
            elif name == "go_up":
                result = go_up()
            else:
                result = {"error": "unknown tool"}
        except (ValueError, TypeError) as exc:
            # Invalid JSON, an unknown child code or wrong argument names:
            # report the problem to the model instead of aborting the loop.
            result = {"error": str(exc)}

        # Give the tool result back to the model. Tool functions already return
        # strings; only structured results (errors) need JSON encoding.
        messages.append({
            "role": "tool",
            "tool_call_id": call.id,
            "content": result if isinstance(result, str) else json.dumps(result),
        })


In [None]:
# Dump the conversation, one message at a time, each followed by a visual separator.
for message in messages:
    print(message, "\n \n", sep="\n")

In [None]:
# Number of messages exchanged so far (prompts, model turns and tool results).
len(messages)

The cells below are only for manual testing.

In [88]:

# Inspect the first tool call of the `response` object currently held by the kernel.
tool_call = response.choices[0].message.tool_calls[0]
function_name = tool_call.function.name
# The arguments are decoded from JSON so they can be passed on as keyword arguments.
function_params = json.loads(tool_call.function.arguments)
print("\nfunction_name: ", function_name, "\nfunction_params: ", function_params)


function_name:  get_node_info 
function_params:  {}


In [89]:
# Look up the Python function registered under the requested tool name and call it
# with the decoded arguments.
function_result = names_to_functions[function_name](**function_params)
function_result

"{'code': 'NACE', 'name': 'NACE Rev. 2.1', 'desc': 'Statistical Classification of Economic Activities', 'level': 0, 'parent_code': None, 'includes': None, 'includes_also': None, 'excludes': None, 'children': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V']}"

In [90]:
# Append the tool result to the conversation; tool_call_id ties it to the call it answers.
messages.append({
    "role":"tool", 
    "name":function_name, 
    "content":function_result, 
    "tool_call_id":tool_call.id
})
messages

[{'role': 'user',
  'content': "You are at the root_node which represent the NACE classification. You want to identify in whi leaf the description 'My company sells potatoes' is. You can explore the tree of the Nace classification to determine which leaf is the best."},
 AssistantMessage(content='', tool_calls=[ToolCall(function=FunctionCall(name='get_node_info', arguments='{}'), id='P5Ne8LtoQ', type=None, index=0)], prefix=False, role='assistant'),
 {'role': 'tool',
  'name': 'get_node_info',
  'content': "{'code': 'NACE', 'name': 'NACE Rev. 2.1', 'desc': 'Statistical Classification of Economic Activities', 'level': 0, 'parent_code': None, 'includes': None, 'includes_also': None, 'excludes': None, 'children': ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V']}",
  'tool_call_id': 'P5Ne8LtoQ'}]

In [91]:

# Follow-up completion over the updated conversation. No `tools` argument is
# passed in this call, unlike in the navigation loop.
response = client.chat.complete(
    model = model, 
    messages = messages
)
response.choices[0].message.content

'The **NACE Rev. 2.1** classification is a hierarchical system for categorizing economic activities. Since your company **"sells potatoes"**, we need to explore the relevant sections of the NACE tree.\n\n### Likely Path:\nPotatoes are an **agricultural product**, so the most relevant section is likely:\n- **Section A: Agriculture, Forestry, and Fishing**\n  - This section includes crop production, livestock, forestry, and fishing.\n\nHowever, since your company **sells** potatoes (rather than growing them), we may also need to check:\n- **Section G: Wholesale and Retail Trade**\n  - This includes the sale of goods, including agricultural products.\n\n### Next Steps:\n1. **Explore Section A** (if your company is involved in potato farming).\n2. **Explore Section G** (if your company is primarily a trader/reseller of potatoes).\n\nWould you like me to explore **Section A** or **Section G** first? Or should I check both?'