In [7]:
import os
from notion_client import Client
from helpers import get_driver

driver = get_driver()

# Fail fast when the token is missing: os.getenv() would hand auth=None to the
# client and the problem would only surface later as an authorization error on
# the first API call.
notion_api_key = os.getenv("NOTION_API_KEY")
if not notion_api_key:
    raise RuntimeError(
        "NOTION_API_KEY is not set; export it before starting the kernel."
    )
notion = Client(auth=notion_api_key)

In [8]:
from pprint import pprint

# Smoke-test the client: fetch only two search results and pretty-print the
# raw response so its structure is visible.
pprint(notion.search(page_size=2, query=""))

{'has_more': True,
 'next_cursor': '07c54a9c-8302-47e9-a323-1738c6f18836',
 'object': 'list',
 'page_or_database': {},
 'request_id': '9b87a972-6047-4fed-be7f-54284c8acddb',
 'results': [{'archived': False,
              'cover': None,
              'created_by': {'id': '9530c9e2-8f8b-47e4-809f-c3731d5a6035',
                             'object': 'user'},
              'created_time': '2022-06-18T09:50:00.000Z',
              'icon': {'emoji': '🧑🏻\u200d🎤', 'type': 'emoji'},
              'id': '2cf4a74a-c06e-43ae-a8b7-78cb4697d59d',
              'in_trash': False,
              'last_edited_by': {'id': '9530c9e2-8f8b-47e4-809f-c3731d5a6035',
                                 'object': 'user'},
              'last_edited_time': '2024-05-08T07:06:00.000Z',
              'object': 'page',
              'parent': {'type': 'workspace', 'workspace': True},
              'properties': {'title': {'id': 'title',
                                       'title': [{'annotations': {'bold': False,
 

In [9]:
def process_paginated(endpoint_method, **kwargs):
    """Yield every result of a cursor-paginated endpoint, one item at a time.

    Args:
        endpoint_method: Callable taking keyword arguments and returning a
            mapping with a ``"results"`` list plus, when more data exists,
            ``"has_more"`` and ``"next_cursor"``.
        **kwargs: Forwarded unchanged on every request. An optional
            ``start_cursor`` entry is used as the cursor of the first request,
            which allows resuming an interrupted listing.

    Yields:
        Each element of ``response["results"]``, in the order received.
    """
    cursor = kwargs.pop("start_cursor", None)
    while True:
        # Omit start_cursor until there is a real cursor instead of sending an
        # explicit None to the endpoint.
        if cursor is None:
            request = dict(kwargs)
        else:
            request = {**kwargs, "start_cursor": cursor}
        response = endpoint_method(**request)

        yield from response["results"]

        cursor = response.get("next_cursor")
        # Stop on the last page. Also stop if the endpoint claims more data but
        # gives no cursor, otherwise the first page would be requested forever.
        if not response.get("has_more", False) or cursor is None:
            return

# Drain the paginated search endpoint into a list so it can be indexed and
# iterated more than once, then show how many results came back.
all_pages = [*process_paginated(notion.search, query="")]
len(all_pages)

920

In [10]:
# Inspect the last search result to see the full shape of a returned object.
all_pages[-1]

{'object': 'page',
 'id': 'f828f0a4-d462-445f-ad0d-0bbc883517bc',
 'created_time': '2020-08-25T08:15:00.000Z',
 'last_edited_time': '2020-08-25T08:15:00.000Z',
 'created_by': {'object': 'user',
  'id': '7c74a801-7cb5-4914-bbbe-2b18cd1ced76'},
 'last_edited_by': {'object': 'user',
  'id': '7c74a801-7cb5-4914-bbbe-2b18cd1ced76'},
 'cover': None,
 'icon': None,
 'parent': {'type': 'database_id',
  'database_id': '64adf49c-9bde-4afd-8440-0ca58f82ecbd'},
 'archived': False,
 'in_trash': False,
 'properties': {'Tags': {'id': "!'(w",
   'type': 'multi_select',
   'multi_select': []},
  'URL': {'id': 'BGD%7B', 'type': 'url', 'url': None},
  'Music': {'id': '_%3BEo', 'type': 'rich_text', 'rich_text': []},
  'Created': {'id': '%7D%25j%7B',
   'type': 'created_time',
   'created_time': '2020-08-25T08:15:00.000Z'},
  'Name': {'id': 'title',
   'type': 'title',
   'title': [{'type': 'text',
     'text': {'content': 'Daily Entry', 'link': None},
     'annotations': {'bold': False,
      'italic': Fa

In [16]:
import sys

# Make the parent directory importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry each time.
if "../" not in sys.path:
    sys.path.append("../")

In [20]:
# Object types: page, database, block, user
# General properties: id, created_time, last_edited_time, text, full_text
# Links: parent, created_by, last_edited_by

import json

from knowledge_bridge.models import NodeEntity

# Collect all data
# nodes: NodeEntity objects keyed by their ``id``; filled by the process_*
# functions defined below.
nodes = {}
# edges: (parent id, child node id, relation label) tuples, where the label is
# "child_block", "child_page" or "child_database". A set, so repeated inserts
# of the same edge collapse into one.
edges = set()

def process_block(block):
    """Record a block as a graph node and recurse into its children.

    ``child_page`` and ``child_database`` blocks are not stored as blocks:
    the page/database with the same id is retrieved and handed to
    ``process_page`` / ``process_database``. Every other block becomes a
    ``NodeEntity`` of type ``"block"`` in ``nodes``, gets an edge from its
    parent (unless the parent is the workspace), and has its children listed
    and processed recursively when ``has_children`` is set.

    Blocks whose id is already present in ``nodes`` are skipped. The same
    subtree is reached repeatedly (once from every ancestor page that gets
    crawled), and without the check each visit would repeat all API calls.
    Clear ``nodes`` and ``edges`` to force a fresh crawl, e.g. after a run
    that failed part-way through.

    Args:
        block: Block object (dict) as returned by the Notion API.
    """
    # NOTE(review): assumes NodeEntity keeps the raw Notion id as ``node.id``.
    # If it rewrites the id, this check never matches and every block is
    # processed as before.
    if block["id"] in nodes:
        return

    print("Processing block", block["id"])
    if block["type"] == "child_page":
        page = notion.pages.retrieve(block["id"])
        process_page(page)
        return

    if block["type"] == "child_database":
        database = notion.databases.retrieve(block["id"])
        process_database(database)
        return

    node = NodeEntity(
        id=block["id"],
        type="block",
        created=block["created_time"],
        edited=block["last_edited_time"],
        # Dump all content as json: the payload lives under the key named
        # after the block's own type.
        text=json.dumps(block[block["type"]]),
        link=None
    )
    nodes[node.id] = node

    parent = block.get("parent")
    if parent and parent["type"] != "workspace":
        # The parent's id is stored under the key named by parent["type"].
        edge = (parent[parent["type"]], node.id, "child_block")
        edges.add(edge)

    if block.get("has_children"):
        children = process_paginated(notion.blocks.children.list, block_id=block["id"])
        for child in children:
            process_block(child)


def process_page(page):
    """Record a page as a graph node, link it to its parent and walk its blocks.

    Pages whose id is already present in ``nodes`` are skipped. A page can be
    reached several times (directly, and again while recursing from each of
    its ancestors), and without the check every visit would re-list and
    re-process the whole subtree. Clear ``nodes`` and ``edges`` to force a
    fresh crawl, e.g. after a run that failed part-way through.

    Args:
        page: Page object (dict) as returned by the Notion API; ``id``,
            ``created_time``, ``last_edited_time``, ``properties`` and ``url``
            are read.
    """
    # NOTE(review): assumes NodeEntity keeps the raw Notion id as ``node.id``.
    # If it rewrites the id, this check never matches and every page is
    # processed as before.
    if page["id"] in nodes:
        return

    print("Processing page", page["id"])
    node = NodeEntity(
        id=page["id"],
        type="page",
        created=page["created_time"],
        edited=page["last_edited_time"],
        # Dump all content as json
        text=json.dumps(page["properties"]),
        link=page["url"],
    )
    nodes[node.id] = node

    parent = page.get("parent")
    if parent and parent["type"] != "workspace":
        # The parent's id is stored under the key named by parent["type"].
        edge = (parent[parent["type"]], node.id, "child_page")
        edges.add(edge)

    blocks = process_paginated(notion.blocks.children.list, block_id=page["id"])

    for block in blocks:
        process_block(block)


def process_database(database):
    """Record a database as a graph node, link it to its parent and walk its pages.

    Databases whose id is already present in ``nodes`` are skipped, so a
    database reached more than once is only queried the first time. Clear
    ``nodes`` and ``edges`` to force a fresh crawl, e.g. after a run that
    failed part-way through.

    Args:
        database: Database object (dict) as returned by the Notion API;
            ``id``, ``created_time``, ``last_edited_time`` and ``properties``
            are read.
    """
    # NOTE(review): assumes NodeEntity keeps the raw Notion id as ``node.id``.
    # If it rewrites the id, this check never matches and every database is
    # processed as before.
    if database["id"] in nodes:
        return

    print("Processing database", database["id"])
    node = NodeEntity(
        id=database["id"],
        type="database",
        created=database["created_time"],
        edited=database["last_edited_time"],
        # Dump all content as json
        text=json.dumps(database["properties"]),
        link=None
    )
    nodes[node.id] = node

    parent = database.get("parent")
    if parent and parent["type"] != "workspace":
        # The parent's id is stored under the key named by parent["type"].
        edge = (parent[parent["type"]], node.id, "child_database")
        edges.add(edge)

    pages = process_paginated(notion.databases.query, database_id=database["id"])

    for page in pages:
        process_page(page)

In [21]:
# An unfiltered search returns database objects as well as pages. Dispatch on
# the object kind so databases are recorded with type "database" and have
# their rows queried, instead of being stored as pages.
for result in all_pages:
    if result.get("object") == "database":
        process_database(result)
    else:
        process_page(result)

Processing page 2cf4a74a-c06e-43ae-a8b7-78cb4697d59d
Processing block 4f53078a-0ad7-407b-9417-81448e6bf93c
Processing page 4f53078a-0ad7-407b-9417-81448e6bf93c
Processing block c4f16327-da7a-45fc-9ba7-1ff33b736335
Processing page c4f16327-da7a-45fc-9ba7-1ff33b736335
Processing block c5168ec5-68ac-474c-a083-12418642d340
Processing block 7d5c14a5-7893-405d-b3bc-88ded6edc3f5
Processing page 7d5c14a5-7893-405d-b3bc-88ded6edc3f5
Processing block 24d2ddfc-aa7f-4022-899e-fa3674fe0bf5
Processing block 56ab2533-a58a-4dd8-abe1-ea0fd03ce51b
Processing block 3c603145-137b-43a9-ae25-3f3b650ddb03
Processing page 3c603145-137b-43a9-ae25-3f3b650ddb03
Processing block 9584cb94-5aa4-40e4-bc0a-e61e7a6c19fa
Processing block 0f6e5333-8f11-4e93-b9fe-810a4082567a
Processing page 0f6e5333-8f11-4e93-b9fe-810a4082567a
Processing block f8d9c8e5-b6ba-49d3-a66d-2b191464a17b
Processing block 8b97cd88-9eba-4dc0-bda6-15269ff0ab33
Processing page 8b97cd88-9eba-4dc0-bda6-15269ff0ab33
Processing block 23542238-7c16-494e