<img src="./images/logo.svg" alt="lakeFS logo" width="300"/> <img src="https://www.apache.org/logos/res/iceberg/iceberg.png" alt="Apache Iceberg logo" width="300"/>

## lakeFS ❤️ Apache Iceberg - an example of the integration using the PyIceberg client

* [📚 lakeFS Apache Iceberg Integration Docs](https://docs.lakefs.io/integrations/iceberg.html)
* [Getting started with PyIceberg](https://py.iceberg.apache.org/)

## Prerequisites

###### This notebook requires a connection to lakeFS Cloud or lakeFS Enterprise.
###### Register for lakeFS Cloud: https://lakefs.cloud/register or contact us for a lakeFS Enterprise key: https://lakefs.io/contact-sales/

## Config

**_If you're not using the provided lakeFS server and MinIO storage then change these values to match your environment_**

### lakeFS endpoint and credentials

In [None]:
# Connection settings for the lakeFS server. The defaults target the lakeFS
# server provided with this notebook; replace all three for your own environment.
# They are exported as LAKECTL_* environment variables and reused for the
# Iceberg catalog credential later in the notebook.
lakefsEndPoint = 'http://lakefs:8000' # e.g. 'https://username.aws_region_name.lakefscloud.io' 
lakefsAccessKey = 'AKIAIOSFOLKFSSAMPLES'
lakefsSecretKey = 'wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY'

### Object Storage

In [None]:
# Base object-store URI. The repository is created with the storage namespace
# f"{storageNamespace}/{repo_name}".
storageNamespace = 's3://example' # e.g. "s3://bucket"

---

## Setup

**(you shouldn't need to change anything in this section, just run it)**

In [None]:
# Name of the lakeFS repository; it is also the first component of every
# Iceberg namespace/table identifier used in this notebook.
repo_name = "lakefs-py-iceberg"

### Versioning Information

In [None]:
# Table identifiers are built as (repo_name, <branch>, icebergNamespace, <table>).
# Default branch of the repository; the tables are created and filled here.
mainBranch = "main"
# Branch created from mainBranch later on and used as a sandbox.
devBranch = "dev"
# Iceberg namespace that holds the demo tables.
icebergNamespace = "lakefs_demo"

### Install and import libraries

In [None]:
!pip install pyarrow==17.0.0
!pip install pyiceberg==0.9.1

In [None]:
import os
import lakefs
from assets.lakefs_demo import print_commit, print_diff
from pyiceberg.catalog.rest import RestCatalog
from pyiceberg.schema import Schema
from pyiceberg.types import NestedField, StringType, IntegerType, DateType, DoubleType
import pyarrow as pa
from datetime import datetime

### Set environment variables

In [None]:
# Export the lakeFS endpoint and key pair as LAKECTL_* environment variables.
os.environ.update({
    "LAKECTL_SERVER_ENDPOINT_URL": lakefsEndPoint,
    "LAKECTL_CREDENTIALS_ACCESS_KEY_ID": lakefsAccessKey,
    "LAKECTL_CREDENTIALS_SECRET_ACCESS_KEY": lakefsSecretKey,
})

#### Verify lakeFS credentials by getting lakeFS version

In [None]:
# Smoke-test the connection by asking the lakeFS server for its version.
# NOTE(review): Client() is presumably configured from the LAKECTL_* environment
# variables exported in the previous cell — confirm against the lakeFS SDK docs.
print("Verifying lakeFS credentials…")
try:
    v = lakefs.client.Client().version
except Exception as e:
    # Catch Exception instead of using a bare `except:` so KeyboardInterrupt and
    # SystemExit still propagate, and report the cause to aid troubleshooting.
    print(f"🛑 failed to get lakeFS version: {e}")
else:
    print(f"…✅lakeFS credentials verified\n\nℹ️lakeFS version {v}")

### Define lakeFS Repository

In [None]:
# Create the repository under "<storageNamespace>/<repo_name>" with mainBranch
# as its default branch, then keep a handle on that branch for the merge and
# revert steps at the end of the notebook.
repo = lakefs.Repository(repo_name).create(
    storage_namespace=f"{storageNamespace}/{repo_name}",
    default_branch=mainBranch,
    exist_ok=True,
)
branchMain = repo.branch(mainBranch)
print(repo)

---

---

### Define function to calculate top selling authors

In [None]:
def top_selling_authors(catalog, repo_name, branchName, icebergNamespace):
    """Rank authors by total sales on one branch.

    Loads the ``books``, ``authors`` and ``book_sales`` tables identified by
    ``(repo_name, branchName, icebergNamespace, <table>)`` from ``catalog``,
    joins sales to books on ``book_id`` and then to authors on ``author_id``,
    and sums ``price`` per author ``name``.

    Returns:
        A pandas DataFrame sorted by ``price_sum`` in descending order.
    """
    def read_arrow(table_name):
        # Unfiltered scan of the table, materialized through to_arrow().
        identifier = (repo_name, branchName, icebergNamespace, table_name)
        return catalog.load_table(identifier).scan().to_arrow()

    # Rename each table's generic `id` column to the foreign-key name used by
    # the table that references it, so the joins can use a single key name.
    books = read_arrow('books').rename_columns({'id': 'book_id'})
    authors = read_arrow('authors').rename_columns({'id': 'author_id'})
    sales = read_arrow('book_sales')

    sales_with_authors = sales.join(books, keys="book_id").join(authors, keys="author_id")
    totals = sales_with_authors.group_by("name").aggregate([("price", "sum")])
    return totals.to_pandas().sort_values(by='price_sum', ascending=False)

### Define Iceberg catalog

**_If you're not using the provided MinIO storage then change S3 endpoint (e.g. http://s3.us-east-1.amazonaws.com) and credentials to match your environment_**

In [None]:
# REST catalog client pointed at the lakeFS server. The options are passed as a
# `**` dict because most keys contain characters that are not valid in Python
# keyword-argument names.
catalog = RestCatalog(
    name = "my_catalog",
    **{
    'prefix': 'lakefs',
    # Catalog API and token endpoint are both derived from the lakeFS endpoint.
    'uri': f'{lakefsEndPoint}/iceberg/api',
    'oauth2-server-uri': f'{lakefsEndPoint}/iceberg/api/v1/oauth/tokens',
    # lakeFS key pair joined as "<access key>:<secret key>".
    'credential': f'{lakefsAccessKey}:{lakefsSecretKey}',
    # Object-store settings; the defaults are for the provided MinIO storage.
    's3.endpoint': 'http://minio:9000',
    's3.access-key-id': 'minioadmin',
    's3.secret-access-key': 'minioadmin',
    's3.region': 'us-east-1',
    # NOTE(review): presumably selects path-style bucket addressing — confirm
    # against the PyIceberg S3 FileIO configuration docs.
    's3.force-virtual-addressing': False,
})

### Create Iceberg namespace

In [None]:
# Echo the three values that make up the namespace identifier used below.
print(f"repo_name: {repo_name}, mainBranch: {mainBranch}, icebergNamespace: {icebergNamespace}")

In [None]:
# The namespace is addressed as a (repository, branch, namespace) tuple.
lakefs_demo_ns = (repo_name, mainBranch, icebergNamespace)
# NOTE(review): presumably raises if the namespace already exists, which would
# make this cell fail on a second run — confirm against the PyIceberg catalog API.
catalog.create_namespace(lakefs_demo_ns)

### List namespaces in the main branch

In [None]:
# Ask the catalog for the namespaces under (repo_name, mainBranch); the result
# is shown as the cell output.
catalog.list_namespaces((repo_name, mainBranch))

---

## Create Iceberg tables in the `main` branch of the lakeFS catalog

In [None]:
# Authors table: integer id plus the author's name, both required.
authors_schema = Schema(
    NestedField(field_id=1, name="id", field_type=IntegerType(), required=True),
    NestedField(field_id=2, name="name", field_type=StringType(), required=True),
)
table_authors = (repo_name, mainBranch, icebergNamespace, 'authors')

# Create the table on the main branch; the call's result is the cell output.
catalog.create_table(identifier=table_authors, schema=authors_schema)

In [None]:
# Books table: integer id, title, and the id of the book's author; all required.
books_schema = Schema(
    NestedField(field_id=1, name="id", field_type=IntegerType(), required=True),
    NestedField(field_id=2, name="title", field_type=StringType(), required=True),
    NestedField(field_id=3, name="author_id", field_type=IntegerType(), required=True),
)
table_books = (repo_name, mainBranch, icebergNamespace, 'books')

# Create the table on the main branch; the call's result is the cell output.
catalog.create_table(identifier=table_books, schema=books_schema)

In [None]:
# Book sales table: one row per sale with its date, the sold book's id and the
# sale price; all fields required.
book_sales_schema = Schema(
    NestedField(field_id=1, name="id", field_type=IntegerType(), required=True),
    NestedField(field_id=2, name="sale_date", field_type=DateType(), required=True),
    NestedField(field_id=3, name="book_id", field_type=IntegerType(), required=True),
    NestedField(field_id=4, name="price", field_type=DoubleType(), required=True),
)
table_book_sales = (repo_name, mainBranch, icebergNamespace, 'book_sales')

# Create the table on the main branch; the call's result is the cell output.
catalog.create_table(identifier=table_book_sales, schema=book_sales_schema)

### List tables in the main branch

In [None]:
# Ask the catalog for the tables in the demo namespace on the main branch; the
# result is shown as the cell output.
catalog.list_tables((repo_name, mainBranch, icebergNamespace))

### Insert data into tables

In [None]:
# Insert data into the authors table
authors_data = [
    {"id": 1, "name": "J.R.R. Tolkien"},
    {"id": 2, "name": "George R.R. Martin"},
    {"id": 3, "name": "Agatha Christie"},
    {"id": 4, "name": "Isaac Asimov"},
    {"id": 5, "name": "Stephen King"},
]

# The Arrow schema mirrors the table's Iceberg schema: IntegerType is a 32-bit
# integer, so the id column is int32 (int8 would not match the declared type
# and caps ids at 127). nullable=False corresponds to required=True.
authors_arrow_schema = pa.schema([
    pa.field("id", pa.int32(), nullable=False),
    pa.field("name", pa.string(), nullable=False),
])
authors_arrow_table = pa.Table.from_pylist(authors_data, schema=authors_arrow_schema)
authors_table = catalog.load_table(table_authors)
authors_table.append(authors_arrow_table)

In [None]:
# Insert data into the books table
books_data = [
    {"id": 1, "title": "The Lord of the Rings","author_id": 1},
    {"id": 2, "title": "The Hobbit","author_id": 1},
    {"id": 3, "title": "A Song of Ice and Fire","author_id": 2},
    {"id": 4, "title": "A Clash of Kings","author_id": 2},
    {"id": 5, "title": "And Then There Were None","author_id": 3},
    {"id": 6, "title": "Murder on the Orient Express","author_id": 3},
    {"id": 7, "title": "Foundation","author_id": 4},
    {"id": 8, "title": "I, Robot","author_id": 4},
    {"id": 9, "title": "The Shining","author_id": 5},
    {"id": 10, "title": "It","author_id": 5},
]

# The Arrow schema mirrors the table's Iceberg schema: IntegerType is a 32-bit
# integer, so both integer columns are int32 (int8 would not match the declared
# type and caps ids at 127). nullable=False corresponds to required=True.
books_arrow_schema = pa.schema([
    pa.field("id", pa.int32(), nullable=False),
    pa.field("title", pa.string(), nullable=False),
    pa.field("author_id", pa.int32(), nullable=False),
])
books_arrow_table = pa.Table.from_pylist(books_data, schema=books_arrow_schema)
books_table = catalog.load_table(table_books)
books_table.append(books_arrow_table)

In [None]:
# Insert data into the book_sales table
book_sales_data = [
    {"id": 1, "sale_date": datetime(2024, 4, 12),"book_id": 1,"price": 25.50},
    {"id": 2, "sale_date": datetime(2024, 4, 11),"book_id": 2,"price": 17.99},
    {"id": 3, "sale_date": datetime(2024, 4, 10),"book_id": 3,"price": 12.95},
    {"id": 4, "sale_date": datetime(2024, 4, 13),"book_id": 4,"price": 32.00},
    {"id": 5, "sale_date": datetime(2024, 4, 12),"book_id": 5,"price": 29.99},
    {"id": 6, "sale_date": datetime(2024, 3, 15),"book_id": 1,"price": 23.99},
    {"id": 7, "sale_date": datetime(2024, 2, 22),"book_id": 2,"price": 19.50},
    {"id": 8, "sale_date": datetime(2024, 1, 10),"book_id": 3,"price": 14.95},
    {"id": 9, "sale_date": datetime(2023, 12, 5),"book_id": 4,"price": 28.00},
    {"id": 10, "sale_date": datetime(2023, 11, 18),"book_id": 5,"price": 27.99},
    {"id": 11, "sale_date": datetime(2023, 10, 26),"book_id": 2,"price": 18.99},
    {"id": 12, "sale_date": datetime(2023, 10, 12),"book_id": 1,"price": 22.50},
    {"id": 13, "sale_date": datetime(2024, 4, 9),"book_id": 3,"price": 11.95},
    {"id": 14, "sale_date": datetime(2024, 3, 28),"book_id": 4,"price": 35.00},
    {"id": 15, "sale_date": datetime(2024, 4, 5),"book_id": 5,"price": 31.99},
    {"id": 16, "sale_date": datetime(2024, 3, 1),"book_id": 1,"price": 27.50},
    {"id": 17, "sale_date": datetime(2024, 2, 14),"book_id": 2,"price": 21.99},
    {"id": 18, "sale_date": datetime(2024, 1, 7),"book_id": 3,"price": 13.95},
    {"id": 19, "sale_date": datetime(2023, 12, 20),"book_id": 4,"price": 29.00},
    {"id": 20, "sale_date": datetime(2023, 11, 3),"book_id": 5,"price": 28.99},
]

# The Arrow schema mirrors the table's Iceberg schema: IntegerType is a 32-bit
# integer, so both integer columns are int32 (int8 would not match the declared
# type and caps ids at 127). nullable=False corresponds to required=True.
book_sales_arrow_schema = pa.schema([
    pa.field("id", pa.int32(), nullable=False),
    pa.field("sale_date", pa.date32(), nullable=False),
    pa.field("book_id", pa.int32(), nullable=False),
    pa.field("price", pa.float64(), nullable=False),
])
book_sales_arrow_table = pa.Table.from_pylist(book_sales_data, schema=book_sales_arrow_schema)
book_sales_table = catalog.load_table(table_book_sales)
book_sales_table.append(book_sales_arrow_table)

# Main demo starts here 🚦 👇🏻

## Read my production data from my main branch

In [None]:
# Load the authors table from the main branch, scan it into Arrow and show it
# as a pandas DataFrame (the cell output).
table_authors = (repo_name, mainBranch, icebergNamespace, 'authors')
authors_table = catalog.load_table(table_authors)
arrow_df = authors_table.scan().to_arrow()
arrow_df.to_pandas()

In [None]:
# Load the books table from the main branch, scan it into Arrow and show it as
# a pandas DataFrame (the cell output).
table_books = (repo_name, mainBranch, icebergNamespace, 'books')
books_table = catalog.load_table(table_books)
arrow_df = books_table.scan().to_arrow()
arrow_df.to_pandas()

In [None]:
# Load the book_sales table from the main branch, scan it into Arrow and show
# it as a pandas DataFrame (the cell output).
table_book_sales = (repo_name, mainBranch, icebergNamespace, 'book_sales')
book_sales_table = catalog.load_table(table_book_sales)
arrow_df = book_sales_table.scan().to_arrow()
arrow_df.to_pandas()

## Mess with the data - Create a development sandbox

In [None]:
# Create the dev branch with the main branch as its source, then print the id
# of the commit the new branch points to.
# NOTE(review): exist_ok=True presumably makes this a no-op when the branch
# already exists — confirm against the lakeFS SDK docs.
branchDev = repo.branch(devBranch).create(source_reference=mainBranch, exist_ok=True)
print(f"{devBranch} ref:", branchDev.get_commit().id)

## Read data from my development sandbox

In [None]:
# Same read as on main, but the identifier uses devBranch, so the table is
# loaded through the dev branch.
table_book_sales = (repo_name, devBranch, icebergNamespace, 'book_sales')
book_sales_table = catalog.load_table(table_book_sales)
arrow_df = book_sales_table.scan().to_arrow()
arrow_df.to_pandas()

## Running pipelines in isolation

### Remove Cancelled Sales

In [None]:
# Delete the listed sales rows from the dev-branch table: the identifier is
# built from devBranch, so the main-branch table is not addressed here.
table_book_sales = (repo_name, devBranch, icebergNamespace, 'book_sales')
book_sales_table = catalog.load_table(table_book_sales)
book_sales_table.delete(delete_filter="id IN (1, 2, 6, 10, 15)")

# Re-scan the same table object and show the remaining rows.
arrow_df = book_sales_table.scan().to_arrow()
arrow_df.to_pandas()

### Who are my top selling authors?

### Compare dev and main

In [None]:
# Ranking computed from the dev branch tables (after the deletion above).
top_selling_authors(catalog, repo_name, devBranch, icebergNamespace)

In [None]:
# Ranking computed from the main branch tables, for comparison with dev.
top_selling_authors(catalog, repo_name, mainBranch, icebergNamespace)

### Merge Changes

In [None]:
# Merge the dev branch into main and print the value returned by merge_into.
res = branchDev.merge_into(branchMain)
print(res)

### If you merged the new branch into the main branch, you can atomically roll back all changes

In [None]:
# Revert, on the main branch, the commit that `mainBranch` currently points to.
# NOTE(review): parent_number=1 presumably selects the first parent, which is
# needed when the reverted commit is the merge commit from the previous cell —
# confirm against the lakeFS SDK docs.
branchMain.revert(parent_number=1, reference=mainBranch)

## More Questions?

###### Join the lakeFS Slack group - https://lakefs.io/slack