# One Knowledge Base to Rule Them All Workflow

## 0. Setup

In [1]:
import requests


def get_auth_headers(email: str, password: str) -> dict[str, str]:
    """Get auth headers for the selected user."""
    supabase_auth_url = "https://sb.stack-ai.com"
    anon_key = "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJpc3MiOiJzdXBhYmFzZSIsInJlZiI6ImZic3VhZGZxaGtseG9rbWxodHNkIiwicm9sZSI6ImFub24iLCJpYXQiOjE2NzM0NTg5ODAsImV4cCI6MTk4OTAzNDk4MH0.Xjry9m7oc42_MsLRc1bZhTTzip3srDjJ6fJMkwhXQ9s"

    request_url = f"{supabase_auth_url}/auth/v1/token?grant_type=password"
    response = requests.post(
        request_url,
        json={
            "email": email,
            "password": password,
            "gotrue_meta_security": {},
        },
        headers={
            "Content-Type": "application/json",
            "Apikey": anon_key,
        },
        timeout=10,
    )
    response.raise_for_status()
    access_token = response.json()["access_token"]

    headers = {"Authorization": f"Bearer {access_token}"}

    return headers

### Login to your account to get your auth headers

In [2]:
email = "stackaitest@gmail.com"
password = input(f"Introduce the password for {email}: ")

auth_headers = get_auth_headers(email, password)

### Create a request session

In [3]:
session = requests.Session()

session.headers.update(auth_headers)

### Set the correct url for the backend you want to use

In [4]:
backend_url = "https://api.stack-ai.com"

In [5]:
org_id = session.get(f"{backend_url}/organizations/me/current").json()["org_id"]

## 1. Connections

### 1.1 Create a Google Drive connection in the Stack AI Workflow builder

1. Go to the Stack AI Workflow builder
2. On the left sidebar, click on Knowledge Bases
3. Drop the Google Drive node on the canvas
4. Click on connect to Google Drive on the node and follow the authorization steps.

### 1.2 List all the connections for the selected user


Your newly created connection will be listed here

In [6]:
connection_list_url = f"{backend_url}/connections?connection_provider=gdrive&limit=1"
resource = session.get(connection_list_url)

resource.raise_for_status()

connection = resource.json()[0]

In [7]:
print("Connection information:")
print("----------------------")
print(f"Connection ID: {connection['connection_id']}")
print(f"Connection name: {connection['name']}")
print(f"Created at: {connection['created_at']}")
print(f"Updated at: {connection['updated_at']}")

# Commented to avoid leaking sensitive information
# print(f"Connection provider: {connection['connection_provider_data']}")

Connection information:
----------------------
Connection ID: 96891794-4313-42f1-9d98-237e526165b8
Connection name: Google Drive
Created at: 2025-06-19T02:28:05.881189+00:00
Updated at: 2025-08-26T22:28:26.399868+00:00


### 1.3 List available resources under the connection


:warning: The responses from this endpoint are paginated! :warning:

The response has the following fields:
- `data`: `list[T]` A list of resources.
- `next_cursor`: `str | None` The cursor to use to fetch the next page of resources if there is one.
- `current_cursor`: `str | None` The cursor to use to re-fetch the current page of resources.

In [8]:
connection_id = connection["connection_id"]
connection_resources_url = f"{backend_url}/connections/{connection_id}/resources"
children_resources_url = f"{backend_url}/connections/{connection_id}/resources/children"

**Root resources** 

Lets start with the root resources, to do it, we should not specify a path, so we will get all the resources in the connection.

In [9]:
print("Pinging: ", children_resources_url)
root_resources_response = session.get(children_resources_url)

root_resources_response.raise_for_status()

root_resources = root_resources_response.json()

for resource in root_resources['data']:
    emoji = "📁" if resource["inode_type"] == "directory" else "📄"

    print(f"{emoji} {resource['inode_path']['path']:30} (resource_id: {resource['resource_id']})")

Pinging:  https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources/children
📁 acme                           (resource_id: 1UdRAmc-fBWRfYY4Z7XHhi_-tVDJsmGkf)
📁 books                          (resource_id: 14fYmcLp_T4jV5bSL2CZiezOY6a6vGW0d)
📁 clients                        (resource_id: 1xPb4DwqeCcsppG7qAULiiXugOvovMX70)
📁 projects                       (resource_id: 10aWbQss04gCh9kJCuxYCYteyXnsd6Nzu)
📁 references                     (resource_id: 1dKlaJIxEb9ENFEufLutmsT7QqsfOwYpm)
📄 Copy of ACME_Earnings_Report_Q2_2024.pdf (resource_id: 1TJ3bD80IYFixw4Col514XxquZrDzFiPr)
📄 Copy of ACME_Inc_Customer_Data.csv (resource_id: 11WIQPhVbGTH3oCynCKt_7wG0_2-06KpG)
📄 Copy of ACME_Information_Security_Policies.pdf (resource_id: 1Rlbkh6yA1VG97Gv1aiBhVeB5Mv6s6Ncm)
📄 Copy of ACME_Investment_Committee_Memo_Q3_2024.pdf (resource_id: 1izCa2twODi8mLZt73-SUQxxC0lA-a6PV)
📄 Copy of ACME_Knowledge_Base_RFP_Responses.pdf (resource_id: 1hv9rdtMa-Uu0xMbigver3mE8hH1hrhUy)
📄 rootfile1.

**Lets take a look at the raw response from the API**

In [10]:
for resource in root_resources['data']:
    print(resource)

{'knowledge_base_id': '00000000-0000-0000-0000-000000000000', 'created_at': '2025-08-26T22:17:57.691000Z', 'modified_at': '2025-08-26T22:17:57.691000Z', 'indexed_at': None, 'inode_type': 'directory', 'resource_id': '1UdRAmc-fBWRfYY4Z7XHhi_-tVDJsmGkf', 'inode_path': {'path': 'acme'}, 'dataloader_metadata': {}, 'user_metadata': {}, 'inode_id': None}
{'knowledge_base_id': '00000000-0000-0000-0000-000000000000', 'created_at': '2025-08-26T22:17:57.688000Z', 'modified_at': '2025-08-26T22:17:57.688000Z', 'indexed_at': None, 'inode_type': 'directory', 'resource_id': '14fYmcLp_T4jV5bSL2CZiezOY6a6vGW0d', 'inode_path': {'path': 'books'}, 'dataloader_metadata': {}, 'user_metadata': {}, 'inode_id': None}
{'knowledge_base_id': '00000000-0000-0000-0000-000000000000', 'created_at': '2025-08-26T22:17:57.707000Z', 'modified_at': '2025-08-26T22:17:57.707000Z', 'indexed_at': None, 'inode_type': 'directory', 'resource_id': '1xPb4DwqeCcsppG7qAULiiXugOvovMX70', 'inode_path': {'path': 'clients'}, 'dataloader_

In [11]:

from urllib.parse import urlencode


def get_specific_file(resource_id: str, resources_url: str) -> None:
    data = {"resource_id": resource_id}

    # Encode the query parameters
    encoded_query_params = urlencode(data, doseq=True)
    url = f"{resources_url}?{encoded_query_params}"

    print("Pinging: ", url)
    response = session.get(url)

    response.raise_for_status()

    resources = response.json().get("data", [])

    if len(resources) == 0:
        print("No resources found")
        return

    if isinstance(resources, dict):
        resources = [resources]

    for response in resources:
        emoji = "📁" if response["inode_type"] == "directory" else "📄"
        print(f"{emoji} {response['inode_path']['path']:30} (resource_id: {response['resource_id']})")

    print("\n\nRaw response:")
    print(response)

**Get the resources in a directory, like Papers**


In [12]:
# This is the resource_id of the folder 'papers'.
# Replace it with the resource_id of the file you want to get the information from.
resource_id = "19Bvjgw4w6LdltkjZ7yL685xtmjPXWX7d"
get_specific_file(resource_id=resource_id, resources_url=children_resources_url)

Pinging:  https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources/children?resource_id=19Bvjgw4w6LdltkjZ7yL685xtmjPXWX7d
No resources found


**Get the resources in a directory, like papers/another folder (nested)**



In [13]:
resource_id = "1UdRAmc-fBWRfYY4Z7XHhi_-tVDJsmGkf"
get_specific_file(resource_id=resource_id, resources_url=children_resources_url)

Pinging:  https://api.stack-ai.com/connections/96891794-4313-42f1-9d98-237e526165b8/resources/children?resource_id=1UdRAmc-fBWRfYY4Z7XHhi_-tVDJsmGkf
📄 acme/ACME_Earnings_Report_Q2_2024.pdf (resource_id: 1Bt__g7WyvLLJBRy4zNjtGV-d2Xse4u4T)
📄 acme/ACME_Inc_Customer_Data.csv (resource_id: 1jJyxf4EM-f6gOTNVglnqUYTf-jhqE44X)
📄 acme/ACME_Information_Security_Policies.pdf (resource_id: 1iGlwCE-f_gzeMWegMxQrSwBoNzfJI7t5)
📄 acme/ACME_Investment_Committee_Memo_Q3_2024.pdf (resource_id: 11CFxi9sFJaevWB3dNvo6K7Om-HgrJN01)
📄 acme/ACME_Knowledge_Base_RFP_Responses.pdf (resource_id: 1Q9iEkA-BjnwTaZ3JFkQn10y_QJgbtqIl)


Raw response:
{'knowledge_base_id': '00000000-0000-0000-0000-000000000000', 'created_at': '2025-08-26T22:18:05.270000Z', 'modified_at': '2024-08-07T23:53:55Z', 'indexed_at': None, 'inode_type': 'file', 'resource_id': '1Q9iEkA-BjnwTaZ3JFkQn10y_QJgbtqIl', 'inode_path': {'path': 'acme/ACME_Knowledge_Base_RFP_Responses.pdf'}, 'dataloader_metadata': {'last_modified_at': '2024-08-07T23:53:55+

# 2. Knowledge Bases

Once the user has decided which resources they want to index, they can create a knowledge base. A knowledge base is a collection of resources that are indexed in our vector database. 


In this example, we will suppose that the user has decided to index the following resources:
- 📁 papers                         (resource_id: 17nmGKUBjR_djw4SHiEMqmqb67vAH1uST)
- 📄 Very Important notes.txt       (resource_id: 1wWBg9mJkWFJUbEdRjjjkX4jf7TYmE__GRRfAjSh6fzs)


This means that `papers` and all of its subfolders will be indexed as well as the `manu_document_awesome.txt` file will be indexed.

It is important that the frontend contains logic to avoid passing both a resource and its children in the list of resources to be indexed. For example, if the frontend passses both
- 📁 test_folder                    (resource_id: 1cGeHFazvfHDSOfDJ_SRZEzkm5q1-Zn41)
- 📄 test_folder/test_file.pdf (resource_id: 18nr8ZUE0QQZgNITw1JeEV1ZaobMDxUNC)

While the backend will work fine and index everything under `test_folder`, there will be duplicate work to get the metadata of the `test_file.pdf` file both as a child of `test_folder` and as an independent resource.

## 2.1 Creating a knowledge base
Lets create a knowledge base that will be synced to the selected resources.

In [14]:
import json

create_kb_url = f"{backend_url}/knowledge_bases"


connection_source_ids = [
    "1UdRAmc-fBWRfYY4Z7XHhi_-tVDJsmGkf",  # The Papers folder
    "19Bvjgw4w6LdltkjZ7yL685xtmjPXWX7d",  # Very Important notes.txt file
]


data = {
    "connection_id": connection_id,
    "connection_source_ids": connection_source_ids,
    "indexing_params": {
        "ocr": False,
        "unstructured": True,
        "embedding_params": {"embedding_model": "text-embedding-ada-002", "api_key": None},
        "chunker_params": {"chunk_size": 1500, "chunk_overlap": 500, "chunker": "sentence"},
    },
    "org_level_role": None,
    "cron_job_id": None,
}

print("Pinging: ", create_kb_url)
kb_create_response = session.post(create_kb_url, data=json.dumps(data))

new_kb_json = kb_create_response.json()
print(new_kb_json)

knowledge_base_id = new_kb_json["knowledge_base_id"]

Pinging:  https://api.stack-ai.com/knowledge_bases
{'knowledge_base_id': 'bebf67df-6c4f-4ba6-b93c-db60e930f981', 'connection_id': '96891794-4313-42f1-9d98-237e526165b8', 'created_at': '2025-08-29T08:21:36.502237Z', 'updated_at': '2025-08-29T08:21:36.502243Z', 'connection_source_ids': ['1UdRAmc-fBWRfYY4Z7XHhi_-tVDJsmGkf', '19Bvjgw4w6LdltkjZ7yL685xtmjPXWX7d'], 'website_sources': [], 'connection_provider_type': 'gdrive', 'is_empty': True, 'total_size': 0, 'name': 'Unnamed Knowledge Base', 'description': '', 'indexing_params': {'ocr': False, 'unstructured': True, 'embedding_params': {'api': None, 'base_url': None, 'embedding_model': 'text-embedding-ada-002', 'provider': None, 'batch_size': 300, 'track_usage': True, 'timeout': 5}, 'chunker_params': {'chunk_size': 1500, 'chunk_overlap': 500, 'chunker_type': 'sentence'}}, 'cron_job_id': None, 'org_id': '0d582f36-52dd-403f-a38a-ccf4dfa06180', 'org_level_role': None, 'user_metadata_schema': None, 'dataloader_metadata_schema': None}


In [15]:
kb_sync_url = f"{backend_url}/knowledge_bases/sync/trigger/{knowledge_base_id}/{org_id}"

print("Pinging: ", kb_sync_url)
sync_response = session.get(kb_sync_url)

print(sync_response.status_code)
print(sync_response.text)

Pinging:  https://api.stack-ai.com/knowledge_bases/sync/trigger/bebf67df-6c4f-4ba6-b93c-db60e930f981/0d582f36-52dd-403f-a38a-ccf4dfa06180
200
null


## 2.3 Get the list of files in the knowledge base

At first, the files will be in the pending state as their indexing is not yet complete. If you wait for about a minute, you should see the files in the indexed state.


:warning: The responses from this endpoint are paginated! :warning:

The response has the following fields:
- `data`: `list[T]` A list of resources.
- `next_cursor`: `str | None` The cursor to use to fetch the next page of resources if there is one.
- `current_cursor`: `str | None` The cursor to use to re-fetch the current page of resources.

In [31]:
def print_kb_resources(data: dict, knowledge_base_id: str) -> None:
    kb_children_resources_url = f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources/children"

    encoded_query_params = urlencode(data)
    url = f"{kb_children_resources_url}?{encoded_query_params}"
    print("Pinging: ", url)
    response = session.get(url, data=json.dumps(data))

    response.raise_for_status()

    resources = response.json().get("data", [])

    if len(resources) == 0:
        print("No resources found")

    if isinstance(resources, dict):
        resources = [resources]

    for resource in resources:
        # print(resource)
        emoji = "📁" if resource["inode_type"] == "directory" else "📄"
        print(
            f"{emoji} {resource['inode_path']['path']:30} (resource_id: {resource['resource_id']}) status: {resource.get('status')}"
        )

In [39]:
data = { "resource_path": "/", }
print_kb_resources(data, knowledge_base_id)

Pinging:  https://api.stack-ai.com/knowledge_bases/bebf67df-6c4f-4ba6-b93c-db60e930f981/resources/children?resource_path=%2F
📁 acme                           (resource_id: STACK_VFS_VIRTUAL_DIRECTORY) status: None
📁 papers                         (resource_id: STACK_VFS_VIRTUAL_DIRECTORY) status: None


In [33]:
data = {
    "resource_path": "/acme",
}
print_kb_resources(data, knowledge_base_id)

Pinging:  https://api.stack-ai.com/knowledge_bases/bebf67df-6c4f-4ba6-b93c-db60e930f981/resources/children?resource_path=%2Facme
📄 acme/ACME_Earnings_Report_Q2_2024.pdf (resource_id: 1Bt__g7WyvLLJBRy4zNjtGV-d2Xse4u4T) status: indexed
📄 acme/ACME_Inc_Customer_Data.csv (resource_id: 1jJyxf4EM-f6gOTNVglnqUYTf-jhqE44X) status: indexed
📄 acme/ACME_Information_Security_Policies.pdf (resource_id: 1iGlwCE-f_gzeMWegMxQrSwBoNzfJI7t5) status: indexed
📄 acme/ACME_Investment_Committee_Memo_Q3_2024.pdf (resource_id: 11CFxi9sFJaevWB3dNvo6K7Om-HgrJN01) status: indexed
📄 acme/ACME_Knowledge_Base_RFP_Responses.pdf (resource_id: 1Q9iEkA-BjnwTaZ3JFkQn10y_QJgbtqIl) status: indexed


In [None]:
data = {
    "resource_path": "acme/another folder",
}
print_kb_resources(data, knowledge_base_id)

Pinging:  https://api.stack-ai.com/knowledge_bases/bebf67df-6c4f-4ba6-b93c-db60e930f981/resources/children?resource_path=papers%2Fanother+folder


HTTPError: 400 Client Error: Bad Request for url: https://api.stack-ai.com/knowledge_bases/bebf67df-6c4f-4ba6-b93c-db60e930f981/resources/children?resource_path=papers%2Fanother+folder

## 2.4 Manually manipulate the knowledge base

### Delete a file
For now, only files can be deleted.

In [27]:
kb_children_resources_url = f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources"

data = {
    "resource_path": "rootfiple1.txt",
}
encoded_query_params = urlencode(data)
resource = session.delete(
    f"{kb_children_resources_url}?{encoded_query_params}",
    data=json.dumps(data),
)


print(resource.status_code)

204


In [None]:
import time

# Wait for the deletion to finish
time.sleep(5)

data = {
    "resource_path": "papers/",
}
print_kb_resources(data, knowledge_base_id)

### Create a file
For now, only files can be created.

In [40]:
# Define the metadata and file content
payload = {
    "resource": {
        "provider": "gdrive",
        "inode_path": {
            "path": "rootfile6.txt"
        },
        "type": "file"
    }
}
# Make the POST request
resource = session.post(
    f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources",
    json=payload
)
print(resource)

<Response [422]>


In [38]:
# Define the metadata and file content
create_request_metadata = {
        "resource_type": "file",
        "resource_path": "papers/papers/demo_file.txt",
}
file_content = b"test file content"

# Prepare the files dictionary
files = {
    "file": ("file.txt", file_content, "text/plain"),
}

# Make the POST request
resource = session.post(
    f"{backend_url}/knowledge_bases/{knowledge_base_id}/resources",
    files=files,
    data=create_request_metadata,  # Use data instead of json for multipart form-data
)
# 14fYmcLp_T4jV5bSL2CZiezOY6a6vGW0d
print(resource.status_code)

202


In [None]:
import time

time.sleep(5)

data = {
    "resource_path": "papers/papers/",
}
print_kb_resources(data, knowledge_base_id)
