In [7]:
import os
from notion_client import Client
from helpers import get_driver

# Driver supplied by the local `helpers` module; it is not referenced in the
# cells visible here — TODO confirm it is still needed.
driver = get_driver()
# Notion API client; the token is read from the NOTION_API_KEY environment
# variable. os.getenv returns None when the variable is unset, so a missing
# key is presumably only noticed once the first request is made.
notion = Client(auth=os.getenv("NOTION_API_KEY"))

In [8]:
from pprint import pprint

# Peek at the raw structure of a search response, limited to two results.
pprint(notion.search(query="", page_size=2))

{'has_more': True,
 'next_cursor': '07c54a9c-8302-47e9-a323-1738c6f18836',
 'object': 'list',
 'page_or_database': {},
 'request_id': '9b87a972-6047-4fed-be7f-54284c8acddb',
 'results': [{'archived': False,
              'cover': None,
              'created_by': {'id': '9530c9e2-8f8b-47e4-809f-c3731d5a6035',
                             'object': 'user'},
              'created_time': '2022-06-18T09:50:00.000Z',
              'icon': {'emoji': '🧑🏻\u200d🎤', 'type': 'emoji'},
              'id': '2cf4a74a-c06e-43ae-a8b7-78cb4697d59d',
              'in_trash': False,
              'last_edited_by': {'id': '9530c9e2-8f8b-47e4-809f-c3731d5a6035',
                                 'object': 'user'},
              'last_edited_time': '2024-05-08T07:06:00.000Z',
              'object': 'page',
              'parent': {'type': 'workspace', 'workspace': True},
              'properties': {'title': {'id': 'title',
                                       'title': [{'annotations': {'bold': False,
 

In [33]:
def process_paginated(endpoint_method, **kwargs):
    """Lazily yield every result from a cursor-paginated endpoint.

    The endpoint is called repeatedly, feeding each response's
    ``next_cursor`` back in as ``start_cursor``, until the response no
    longer reports ``has_more``.

    Args:
        endpoint_method: Callable accepting ``start_cursor`` plus the extra
            keyword arguments, and returning a dict with a ``"results"``
            list and optional ``"has_more"`` / ``"next_cursor"`` entries.
        **kwargs: Forwarded unchanged to every call of ``endpoint_method``.

    Yields:
        Each item of every page's ``"results"`` list, in order.
    """
    next_cursor = None
    while True:
        response = endpoint_method(start_cursor=next_cursor, **kwargs)
        yield from response["results"]

        next_cursor = response.get("next_cursor")
        # Stop when the endpoint says it is done, and also when it claims
        # there is more but gives no cursor: continuing with a None cursor
        # would re-request the first page forever.
        if not response.get("has_more", False) or next_cursor is None:
            return

# Materialise every search result up front so the crawl below can show progress.
# NOTE(review): an unfiltered search presumably returns databases as well as
# pages, so `all_pages` may hold both kinds of object — confirm.
all_pages = list(process_paginated(notion.search, query=""))
len(all_pages)

918

In [34]:
# Inspect the last fetched object to see what a raw search result looks like.
all_pages[-1]

{'object': 'page',
 'id': 'f828f0a4-d462-445f-ad0d-0bbc883517bc',
 'created_time': '2020-08-25T08:15:00.000Z',
 'last_edited_time': '2020-08-25T08:15:00.000Z',
 'created_by': {'object': 'user',
  'id': '7c74a801-7cb5-4914-bbbe-2b18cd1ced76'},
 'last_edited_by': {'object': 'user',
  'id': '7c74a801-7cb5-4914-bbbe-2b18cd1ced76'},
 'cover': None,
 'icon': None,
 'parent': {'type': 'database_id',
  'database_id': '64adf49c-9bde-4afd-8440-0ca58f82ecbd'},
 'archived': False,
 'in_trash': False,
 'properties': {'Tags': {'id': "!'(w",
   'type': 'multi_select',
   'multi_select': []},
  'URL': {'id': 'BGD%7B', 'type': 'url', 'url': None},
  'Music': {'id': '_%3BEo', 'type': 'rich_text', 'rich_text': []},
  'Created': {'id': '%7D%25j%7B',
   'type': 'created_time',
   'created_time': '2020-08-25T08:15:00.000Z'},
  'Name': {'id': 'title',
   'type': 'title',
   'title': [{'type': 'text',
     'text': {'content': 'Daily Entry', 'link': None},
     'annotations': {'bold': False,
      'italic': Fa

In [35]:
import sys

# Put the parent directory on the import path (presumably so the
# `knowledge_bridge` package imported further down resolves — confirm).
# Guarded so that re-running this cell does not append duplicate entries.
if "../" not in sys.path:
    sys.path.append("../")

In [None]:
# Graph accumulators filled in by the process_* functions.
# NOTE(review): this cell has no execution count while a later cell's output
# reports already-processed pages, so `nodes` seems to have been populated by
# an earlier run — re-run this cell before starting a fresh crawl.
# nodes: maps node id -> NodeEntity; membership doubles as the "already visited" check.
nodes = {}
# edges: set of (parent id, child id, relation label) tuples.
edges = set()

In [38]:
# Object types: page, database, block, user
# General properties: id, created_time, last_edited_time, text, full_text
# Links: parent, created_by, last_edited_by

import json

from tqdm.notebook import tqdm
from notion_client import APIResponseError

from knowledge_bridge.models import NodeEntity

def process_block(block):
    """Record a Notion block as a graph node and recurse into its children.

    Child pages and child databases are not stored as blocks: the full
    object is retrieved and handed to ``process_page`` /
    ``process_database``. Any other block becomes a ``NodeEntity`` of type
    ``"block"`` in the module-level ``nodes`` dict and, unless its parent is
    the workspace, is linked to that parent by a ``"child_block"`` tuple in
    ``edges``.

    Args:
        block: Block object (dict) as returned by the Notion API.

    Raises:
        APIResponseError: Propagated from the Notion client, except for a
            "not found" response while retrieving a child database, which
            is logged and skipped.
    """
    if block["id"] in nodes:
        tqdm.write(f"Skipping processed block {block['id']}")
        return

    tqdm.write(f"Processing block {block['id']}")
    if block["type"] == "child_page":
        page = notion.pages.retrieve(block["id"])
        process_page(page)
        return

    if block["type"] == "child_database":
        try:
            database = notion.databases.retrieve(block["id"])
        except APIResponseError as e:
            # notion_client reports the HTTP status on `status`; `code` holds
            # the API error-code string (e.g. "object_not_found"). Comparing
            # `code` with the integer 404 never matched, so the skip branch
            # was unreachable and the error was always re-raised.
            not_found = (
                getattr(e, "status", None) == 404
                or e.code == "object_not_found"
            )
            if not_found:
                tqdm.write(f"Database {block['id']} not found, skipping.")
                return
            raise
        process_database(database)
        return

    node = NodeEntity(
        id=block["id"],
        type="block",
        created=block["created_time"],
        edited=block["last_edited_time"],
        # Dump all content as json: the payload lives under the key named
        # after the block's own type.
        text=json.dumps(block[block["type"]]),
        link=None
    )
    # Register before recursing so the visited check above sees this block.
    nodes[node.id] = node

    parent = block.get("parent")
    if parent and parent["type"] != "workspace":
        # The parent id is stored under the key named by parent["type"].
        edge = (parent[parent["type"]], node.id, "child_block")
        edges.add(edge)

    if block.get("has_children"):
        children = process_paginated(notion.blocks.children.list, block_id=block["id"])
        for child in children:
            process_block(child)


def process_page(page):
    """Store a Notion page as a graph node, then walk its block children.

    The page is added to the module-level ``nodes`` dict as a ``NodeEntity``
    of type ``"page"``. Unless the parent is the workspace, a
    ``"child_page"`` tuple linking parent and page is added to ``edges``.
    Pages whose id is already in ``nodes`` are skipped.

    Args:
        page: Page object (dict) as returned by the Notion API.
    """
    already_seen = page["id"] in nodes
    if already_seen:
        tqdm.write(f"Skipping processed page {page['id']}")
        return

    tqdm.write(f"Processing page {page['id']}")

    # The page's properties are serialised wholesale as the node text.
    page_fields = {
        "id": page["id"],
        "type": "page",
        "created": page["created_time"],
        "edited": page["last_edited_time"],
        "text": json.dumps(page["properties"]),
        "link": page["url"],
    }
    page_node = NodeEntity(**page_fields)
    nodes[page_node.id] = page_node

    parent_ref = page.get("parent")
    if parent_ref and parent_ref["type"] != "workspace":
        # The parent id sits under the key named by the parent's type.
        parent_id = parent_ref[parent_ref["type"]]
        edges.add((parent_id, page_node.id, "child_page"))

    for child_block in process_paginated(notion.blocks.children.list, block_id=page["id"]):
        process_block(child_block)


def process_database(database):
    """Store a Notion database as a graph node, then process its pages.

    The database is added to the module-level ``nodes`` dict as a
    ``NodeEntity`` of type ``"database"``. Unless the parent is the
    workspace, a ``"child_database"`` tuple is added to ``edges``. Every page
    returned by querying the database is passed to ``process_page``.
    Databases whose id is already in ``nodes`` are skipped.

    Args:
        database: Database object (dict) as returned by the Notion API.
    """
    if database["id"] not in nodes:
        tqdm.write(f"Processing database {database['id']}")
    else:
        tqdm.write(f"Skipping processed database {database['id']}")
        return

    # The database's property definitions are serialised as the node text.
    db_node = NodeEntity(
        id=database["id"],
        type="database",
        created=database["created_time"],
        edited=database["last_edited_time"],
        text=json.dumps(database["properties"]),
        link=None,
    )
    nodes[db_node.id] = db_node

    container = database.get("parent")
    if container and container["type"] != "workspace":
        # The parent id sits under the key named by the parent's type.
        edges.add((container[container["type"]], db_node.id, "child_database"))

    rows = process_paginated(notion.databases.query, database_id=database["id"])
    for row in rows:
        process_page(row)

In [39]:
# Crawl everything the search returned. An unfiltered search yields databases
# as well as pages, so dispatch on the object type: sending a database through
# process_page would label it "page" and never query its rows.
for page in tqdm(all_pages):
    if page.get("object") == "database":
        process_database(page)
    else:
        process_page(page)

  0%|          | 0/918 [00:00<?, ?it/s]

Skipping processed page 7bd884b0-e535-476a-a9e5-2feafce5afa2
Skipping processed page 2cf4a74a-c06e-43ae-a8b7-78cb4697d59d
Skipping processed page 631ad647-f0c9-4a78-934b-80a49d8bd1f8
Skipping processed page 07c54a9c-8302-47e9-a323-1738c6f18836
Skipping processed page 9e161efb-10e0-42e6-8d79-4d97dd79333a
Skipping processed page f283b734-345b-4939-a72a-f0c544a83899
Skipping processed page eca0882a-b628-4b68-bca2-b2cd7cdafd88
Skipping processed page 9a65c6e4-ee8e-4806-addd-3f434f35e8e3
Skipping processed page b3c79543-1e0f-45f8-9270-8595d627dc92
Skipping processed page 09df4055-1f63-4b7c-bb49-dd958ff6d176
Skipping processed page 0d00d7f2-573e-4fbc-a0f9-fea00afdd36d
Skipping processed page 2fd5f22f-1afb-41f4-a72a-9cc82a8609c0
Skipping processed page 969e273c-4b67-4aa1-ad75-9e38a019d069
Skipping processed page 3b5f6f38-959d-45a5-89dd-678713307897
Skipping processed page 82127e4c-0656-4e1d-97d4-1fb69d86ecf9
Skipping processed page 83a2c178-f7ef-4bef-8152-d7cd8c0bc6ca
Skipping processed page 

In [40]:
# Total number of entities (pages, databases, blocks) collected so far.
len(nodes)

6953

In [41]:
# Total number of parent -> child links collected so far.
len(edges)

6943