## Load catalog

In [1]:
from pyiceberg.catalog import load_catalog

catalog = load_catalog(
    "docs",
    **{
        "uri": "http://127.0.0.1:8181",
        "warehouse": "warehouse",
        "s3.endpoint": "http://127.0.0.1:9000",
        "py-io-impl": "pyiceberg.io.pyarrow.PyArrowFileIO",
        "s3.access-key-id": "admin",
        "s3.secret-access-key": "password",
    }
)

Create a namespace

In [2]:
catalog.create_namespace("docs_test")

List tables in namespace
> It's blank because we haven't created a table yet

In [3]:
catalog.list_tables("docs_test")

[]

## Create table

In [4]:
from llama_index.core import SimpleDirectoryReader
from llama_index.core.node_parser import SentenceSplitter

docs = SimpleDirectoryReader("../data/").load_data()
parser = SentenceSplitter.from_defaults()
nodes = parser.get_nodes_from_documents(docs)

In [5]:
len(nodes)

22

In [6]:
serialized = [node.model_dump() for node in nodes]

In [7]:
import pandas as pd

df = pd.DataFrame(serialized)
df.head()

Unnamed: 0,id_,embedding,metadata,excluded_embed_metadata_keys,excluded_llm_metadata_keys,relationships,metadata_template,metadata_separator,text,mimetype,start_char_idx,end_char_idx,metadata_seperator,text_template,class_name
0,9e26b30c-dbe5-4731-83b1-124d89a63bf3,,{'file_path': '/Users/tituslim/Documents/Perso...,"[file_name, file_type, file_size, creation_dat...","[file_name, file_type, file_size, creation_dat...",{'1': {'node_id': 'ec21c3ff-e429-4b44-9d12-532...,{key}: {value},\n,What I Worked On\n\nFebruary 2021\n\nBefore co...,text/plain,2,4172,\n,{metadata_str}\n\n{content},TextNode
1,7836be7f-4a7a-4d0c-9a7e-e6bfb8547055,,{'file_path': '/Users/tituslim/Documents/Perso...,"[file_name, file_type, file_size, creation_dat...","[file_name, file_type, file_size, creation_dat...",{'1': {'node_id': 'ec21c3ff-e429-4b44-9d12-532...,{key}: {value},\n,All that seemed left for philosophy were edge ...,text/plain,3459,7784,\n,{metadata_str}\n\n{content},TextNode
2,9124bf3e-9238-4923-8ef7-cb4f04200d8b,,{'file_path': '/Users/tituslim/Documents/Perso...,"[file_name, file_type, file_size, creation_dat...","[file_name, file_type, file_size, creation_dat...",{'1': {'node_id': 'ec21c3ff-e429-4b44-9d12-532...,{key}: {value},\n,"Its brokenness did, as so often happens, gener...",text/plain,6984,11177,\n,{metadata_str}\n\n{content},TextNode
3,fe619921-905b-46c9-a03b-2f1445bc6c38,,{'file_path': '/Users/tituslim/Documents/Perso...,"[file_name, file_type, file_size, creation_dat...","[file_name, file_type, file_size, creation_dat...",{'1': {'node_id': 'ec21c3ff-e429-4b44-9d12-532...,{key}: {value},\n,If he even knew about the strange classes I wa...,text/plain,10369,14708,\n,{metadata_str}\n\n{content},TextNode
4,2e4d036b-3ccb-47fe-a82d-c80d632d06f0,,{'file_path': '/Users/tituslim/Documents/Perso...,"[file_name, file_type, file_size, creation_dat...","[file_name, file_type, file_size, creation_dat...",{'1': {'node_id': 'ec21c3ff-e429-4b44-9d12-532...,{key}: {value},\n,The students and faculty in the painting depar...,text/plain,13874,18165,\n,{metadata_str}\n\n{content},TextNode


In [None]:
from pyiceberg.schema import Schema, NestedField
from pyiceberg.types import (
    StringType,
    IntegerType,
    FloatType,
    ListType,
    MapType,
)

iceberg_schema = Schema(
    NestedField(
        1, "id_", field_type=StringType(), required = True,
    ),
    NestedField(
        2,  
        "embedding",
        field_type=ListType(
            element_id= 2,element_type=FloatType(), elemet_required=False,
        ), 
        required=False
    ),
    NestedField(
        3,  
        "excluded_embed_metadata_keys", 
        field_type=ListType(element_id= 3, element_type=StringType()),
        required=True
    ),
    NestedField(
        4,
        "excluded_llm_metadata_keys",
        field_type=ListType(element_id= 4, element_type=StringType()),
        required=True,
    ),
    NestedField(
        5,  
        "relationships", 
        field_type=MapType(key_id = 5, key_type=StringType(), value_id=5, value_type=StringType()),
        rquired = True,
    ),
    NestedField(
        6, "metadata_template", field_type=StringType(), required=True
    ),
    NestedField(
        7, "metadata_separator", field_type=StringType(), required=True
    ),
    NestedField(
        8, "text", field_type=StringType(), required=True
    ),
    NestedField(
        9, "minetype",field_type=StringType(), required=True
    ),
    NestedField(
        10, "start_char_idx", field_type=IntegerType(),required=True
    ),
    NestedField(
        11, "end_char_idx", field_type=IntegerType(),required=True
    ),
    NestedField(
        12, "metadata_seperator", field_type=StringType(),required=True
    ),
    NestedField(
        13, "text_template", field_type=StringType(),required=True
    ),
    NestedField(
        14, "class_name", field_type=StringType(),required=True
    ),
)

In [9]:
tbl = catalog.create_table(
    identifier="docs_test.bids",
    schema=iceberg_schema,
)

## Write data into table

In [10]:
import pyarrow as pa

datatable = pa.Table.from_pandas(df)

In [None]:
tbl.append(datatable)

## Load a table

In [10]:
table = catalog.load_table("docs_test.bids")

In [11]:
table

bids(
  1: id_: required string,
  2: embedding: optional list<float>,
  3: excluded_embed_metadata_keys: required list<string>,
  4: excluded_llm_metadata_keys: required list<string>,
  5: relationships: optional map<string, string>,
  6: metadata_template: required string,
  7: metadata_separator: required string,
  8: text: required string,
  9: minetype: required string,
  10: start_char_idx: required int,
  11: end_char_idx: required int,
  12: metadata_seperator: required string,
  13: text_template: required string,
  14: class_name: required string
),
partition by: [],
sort order: [],
snapshot: null

Inspect schema

In [None]:
schema = table.schema().as_arrow()
schema

id_: large_string not null
  -- field metadata --
  PARQUET:field_id: '1'
embedding: large_list<element: float not null>
  child 0, element: float not null
    -- field metadata --
    PARQUET:field_id: '15'
  -- field metadata --
  PARQUET:field_id: '2'
excluded_embed_metadata_keys: large_list<element: large_string not null> not null
  child 0, element: large_string not null
    -- field metadata --
    PARQUET:field_id: '16'
  -- field metadata --
  PARQUET:field_id: '3'
excluded_llm_metadata_keys: large_list<element: large_string not null> not null
  child 0, element: large_string not null
    -- field metadata --
    PARQUET:field_id: '17'
  -- field metadata --
  PARQUET:field_id: '4'
relationships: map<large_string, large_string>
  child 0, entries: struct<key: large_string not null, value: large_string not null> not null
      child 0, key: large_string not null
      -- field metadata --
      PARQUET:field_id: '18'
      child 1, value: large_string not null
      -- field met

## Inspect snapshots

In [13]:
table.inspect.snapshots()

pyarrow.Table
committed_at: timestamp[ms] not null
snapshot_id: int64 not null
parent_id: int64
operation: string
manifest_list: string not null
summary: map<string, string>
  child 0, entries: struct<key: string not null, value: string> not null
      child 0, key: string not null
      child 1, value: string
----
committed_at: [[]]
snapshot_id: [[]]
parent_id: [[]]
operation: [[]]
manifest_list: [[]]
summary: [[]]