In [None]:
import os
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.table import StreamTableEnvironment

from pyflink.table import EnvironmentSettings, TableEnvironment

import get_env
env = get_env.get_remote_env()
t_env = StreamTableEnvironment.create(env)

conf = t_env.get_config().get_configuration()
 

# === Python Exec Location ===
conf.set_string("python.executable", "/usr/bin/python3")
conf.set_string("pipeline.jars", "file:///opt/flink/plugins/gs-fs-hadoop/flink-gs-fs-hadoop-1.20.2.jar")  # client-side path

# === Allow fallback to Hadoop FS for gs:// and s3:// ===
conf.set_string("fs.allowed-fallback-filesystems", "hadoop")
 

t_env.get_config().set("parallelism.default", "1")

In [None]:
conf = t_env.get_config().get_configuration()
conf.set_string("fs.allowed-fallback-filesystems", "hadoop")
conf.set_string("fs.gs.project.id", "flink-demo-470113")
conf.set_string("fs.gs.auth.service.account.json.keyfile", "/etc/gcp/key.json")

In [None]:
# ------------ A: Catalog (Hadoop-style, backing files in GCS) -------------
t_env.execute_sql("DROP CATALOG IF EXISTS ordercat")

t_env.execute_sql("""
CREATE CATALOG IF NOT EXISTS ordercat WITH (
  'type' = 'iceberg',
  'catalog-type' = 'hadoop',
  -- GCS path for metadata/warehouse; ensure the bucket exists and Flink has write access
  'warehouse' = 'gs://gks-datalake/iceberg-warehouse/',
  'property-version' = '1'
);
""")

# Tell Flink to use it
# t_env.execute_sql("USE CATALOG movielens")
t_env.execute_sql("CREATE DATABASE IF NOT EXISTS ordercat.orderdb")
# t_env.execute_sql("USE gold")


In [None]:
t_env.execute_sql("""
DROP TABLE IF  EXISTS ordercat.orderdb.shipments
""").wait()

"""
-- features shown:
--  - format-version = 2 (row-level deletes/updates support)
--  - partitioned by day(customer use-case)
--  - primary key declared (NOT ENFORCED) â€” helpful for upsert semantics
--  - table properties example: enable upsert by default, set snapshot retention hints
"""

t_env.execute_sql("""
CREATE TABLE IF NOT EXISTS ordercat.orderdb.shipments (
  shipment_id      BIGINT,
  origin           STRING,
  destination      STRING,
  shipped_at       TIMESTAMP(3),
  weight_kg        DOUBLE,
  PRIMARY KEY (shipment_id) NOT ENFORCED
)
WITH (
  'format-version' = '2',
  'write.upsert.enabled' = 'true',
  'snapshot.retention.days' = '7',
  'write.metadata.delete-after-commit' = 'false'
);
""").wait()

In [None]:
result = t_env.sql_query("SELECT * FROM ordercat.orderdb.shipments")

result.execute().print()

In [None]:
# streaming INSERT (returns a TableResult)
t_env.execute_sql("""
INSERT INTO ordercat.orderdb.shipments (shipment_id, origin, destination, shipped_at, weight_kg)
VALUES
  (1001, 'BLR', 'DEL', TIMESTAMP '2025-11-01 08:12:00', 12.5),
  (1002, 'MGR', 'HYD', TIMESTAMP '2025-11-02 09:30:00', 7.75)
""").wait()


In [None]:
result = t_env.sql_query("SELECT * FROM ordercat.orderdb.shipments")

result.execute().print()

In [None]:
# SHOW CREATE TABLE (Flink will print DDL)
res = t_env.execute_sql("SHOW CREATE TABLE ordercat.orderdb.shipments")
for r in res.collect():
    print(r)

In [None]:
t_env.execute_sql("""
DROP TEMPORARY VIEW IF EXISTS upsert_stage
""").wait()

t_env.execute_sql("""
CREATE TEMPORARY VIEW upsert_stage AS
SELECT * FROM (
    VALUES
        (1001, 'BLR', 'PUNE', TIMESTAMP '2025-11-01 08:12:00', 12.0),
         (1003, 'MYS', 'BOM', TIMESTAMP '2025-11-02 09:30:00', 7.75)
) AS t (shipment_id, origin, destination, shipped_at, weight_kg)
""").wait()

In [None]:
result = t_env.sql_query("SELECT * FROM upsert_stage")

result.execute().print()

In [None]:
# works only with iceberg version 2 
# if the rows matched, it updates else insert
# king of merge, but no delete or update specific fields, 
# no exclude/include fields like SPARK/DATABRICKS
t_env.execute_sql("""
INSERT INTO ordercat.orderdb.shipments
/*+ OPTIONS('upsert-enabled'='true') */
SELECT shipment_id, origin, destination, shipped_at, weight_kg
FROM upsert_stage
""").wait()

In [None]:
result = t_env.sql_query("SELECT * FROM ordercat.orderdb.shipments")

result.execute().print()

In [None]:
t_env.execute_sql("""
DROP TEMPORARY VIEW IF EXISTS upsert_stage
""")

In [None]:
# SHOW SNAPSHOTS
# special table, <actual-tablename>$snapshots
# $snapshots is keyword
# Like history
# backtick ` not a single quote. it preserve exact match
result = t_env.sql_query("SELECT * FROM ordercat.orderdb.`shipments$snapshots`")

result.execute().print()

In [None]:
# same snapshots with specific columns, same like previous cell, reduced columns
result = t_env.sql_query("""
SELECT snapshot_id,
       parent_id,
       committed_at,
       operation,
       summary
       FROM ordercat.orderdb.`shipments$snapshots`;
""")

result.execute().print()



In [None]:
# Use Iceberg history system table

result = t_env.sql_query("""
SELECT * 
FROM ordercat.orderdb.`shipments$history`
""")

result.execute().print()


In [None]:
# DISCUSS: What is diffrence between snapshots and history?
# SNAPSHOT can be removed like expired, retention policies, manual clean up
# history is maintained in the meta data directory, available always
# if snapshot removed, we CANNOT do ROLLBACK or TIME TRAVEL 

In [19]:
# Let us do time travel, time travel is all about what happend at particualr version
# based on snapshots, NOT A ROLLBACK

# get snapshots

result = t_env.sql_query("""
SELECT snapshot_id, parent_id, committed_at, operation, summary
FROM ordercat.orderdb.`shipments$snapshots`
""")

result.execute().print()


+----+----------------------+----------------------+----------------------------+--------------------------------+--------------------------------+
| op |          snapshot_id |            parent_id |               committed_at |                      operation |                        summary |
+----+----------------------+----------------------+----------------------------+--------------------------------+--------------------------------+
| +I |   749072624420771859 |               <NULL> | 2025-12-02 16:57:40.086000 |                      overwrite | {added-delete-files=1, flin... |
| +I |  1459376408147079364 |   749072624420771859 | 2025-12-02 17:01:34.118000 |                      overwrite | {added-delete-files=1, flin... |
+----+----------------------+----------------------+----------------------------+--------------------------------+--------------------------------+
2 rows in set


In [26]:
# Time travel by snapshot id (exact)

# replace snapshot id with the snapshot_id you found

# replace snapshot id with value show about
query = t_env.sql_query("""
   SELECT * FROM  ordercat.orderdb.shipments 
   /*+ OPTIONS('scan.snapshot-id'='749072624420771859') */;
""")

query.execute().print()

+----+----------------------+--------------------------------+--------------------------------+----------------------------+--------------------------------+
| op |          shipment_id |                         origin |                    destination |                 shipped_at |                      weight_kg |
+----+----------------------+--------------------------------+--------------------------------+----------------------------+--------------------------------+
| +I |                 1001 |                            BLR |                           PUNE | 2025-11-01 08:12:00.000000 |                           12.0 |
| +I |                 1003 |                            MYS |                            BOM | 2025-11-02 09:30:00.000000 |                           7.75 |
| +I |                 1002 |                            MGR |                            HYD | 2025-11-02 09:30:00.000000 |                           7.75 |
+----+----------------------+-----------------------

In [29]:
# Time travel by timestamp

# replace snapshot id with the snapshot_id you found

from datetime import datetime, timezone

def to_millis(dt_str):
    """
    Convert 'YYYY-MM-DD HH:MM:SS[.ffffff]' to epoch milliseconds.
    Assumes the input is in local time; set tzinfo as needed.
    """
    # Try with microseconds first
    try:
        dt = datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S.%f")
    except ValueError:
        # Fallback to seconds only
        dt = datetime.strptime(dt_str, "%Y-%m-%d %H:%M:%S")
    
    # If your timestamp is UTC, set timezone here:
    dt = dt.replace(tzinfo=timezone.utc)

    # Convert to milliseconds
    return int(dt.timestamp() * 1000)

ts = to_millis("2025-12-02 16:50:00")
print ("time ms", ts)

# replace snapshot id with value show about
query = t_env.sql_query(f"""
   SELECT * FROM  ordercat.orderdb.shipments 
   /*+ OPTIONS('scan.timestamp-millis'='{ts}') */;
""")

query.execute().print() 

time ms 1764694200000
+----+----------------------+--------------------------------+--------------------------------+----------------------------+--------------------------------+
| op |          shipment_id |                         origin |                    destination |                 shipped_at |                      weight_kg |
+----+----------------------+--------------------------------+--------------------------------+----------------------------+--------------------------------+
| +I |                 1002 |                            MGR |                            HYD | 2025-11-02 09:30:00.000000 |                           7.75 |
| +I |                 1001 |                            BLR |                           PUNE | 2025-11-01 08:12:00.000000 |                           12.0 |
| +I |                 1003 |                            MYS |                            BOM | 2025-11-02 09:30:00.000000 |                           7.75 |
+----+----------------------+-