In [10]:
# Shell escape: list the contents of the `sample` directory under the warehouse
# root. NOTE(review): presumably this is the on-disk location of the Iceberg
# table `iceberg.default.sample` created further down — confirm.
!ls /usr/local/hadoop/warehouse/sample

data  metadata


In [None]:
from iceberg.hive import HiveTables
from iceberg.api.schema import Schema
from iceberg.api.types import TimestampType, DoubleType, StringType, NestedField
from iceberg.api.partition_spec import PartitionSpecBuilder

# Connection settings for the Hive metastore behind the Iceberg tables.
# The metastore host is spelled "hivemetastore" (no underscore) so that it
# matches the thrift URI used by the Spark catalog configuration in this
# notebook, which is the spelling the executed cells were run against.
hive_conf = {
    "hive.metastore.uris": "thrift://hivemetastore:9083",
    "hive.metastore.warehouse.dir": "/usr/local/hadoop/warehouse",
}
tables = HiveTables(hive_conf)

# Four optional columns: a timezone-aware timestamp, two doubles and a string.
# The leading integer of each NestedField is its field id.
table_schema = Schema(
    NestedField.optional(1, "DateTime", TimestampType.with_timezone()),
    NestedField.optional(2, "Bid", DoubleType.get()),
    NestedField.optional(3, "Ask", DoubleType.get()),
    NestedField.optional(4, "symbol", StringType.get()),
)

# Partition on a "day" transform named "DateTime_day".
# NOTE(review): the arguments look like (source field id, partition field id,
# name, transform), i.e. field 1 ("DateTime") is the source — confirm against
# the iceberg PartitionSpecBuilder API.
partition_spec = (
    PartitionSpecBuilder(table_schema)
    .add(1, 1000, "DateTime_day", "day")
    .build()
)

table = tables.create(table_schema, "iceberg.test_123", partition_spec)

In [2]:
# define Trino client
from trino.dbapi import connect

# Connect to the Trino server without a default catalog or schema; the query
# cells in this notebook use fully qualified table names instead.
trino_connection = connect(host="trino", port=8080, user="iceberg")

# Cursor reused by the Trino query cells.
trino = trino_connection.cursor()

In [3]:
# Connectivity check: fetch every row of Trino's runtime node table.
nodes_sql = "SELECT * FROM system.runtime.nodes"
rows = trino.execute(nodes_sql).fetchall()
print(rows)

[['5bb73f80928c', 'http://172.18.0.4:8080', '406', True, 'active']]


In [7]:
# Read the whole Iceberg sample table through Trino.
sample_sql = "SELECT * FROM iceberg.default.sample"
rows = trino.execute(sample_sql).fetchall()
print(rows)

[[2, 'b'], [2, 'b'], [1, 'a'], [1, 'a']]


In [4]:
# define Spark client
from pyspark.sql import SparkSession

# Settings that register an Iceberg catalog named "iceberg" with Spark, using
# the Hive catalog type and the metastore thrift URI, with catalog caching
# switched off.
spark_settings = {
    "spark.sql.warehouse.dir": "/usr/local/hadoop/warehouse",
    "spark.sql.catalog.iceberg": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.iceberg.type": "hive",
    "spark.sql.catalog.iceberg.uri": "thrift://hivemetastore:9083",
    "spark.sql.catalog.iceberg.cache-enabled": False,
}

# Apply the settings in declaration order, then create (or reuse) the session.
session_builder = SparkSession.builder
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/02/12 13:34:01 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Recreate the sample table: drop any existing one, then create it with an
# (id bigint, data string) schema in Iceberg format.
drop_sample_sql = """
    DROP TABLE if exists iceberg.default.sample
"""
create_sample_sql = """
    CREATE TABLE IF NOT EXISTS iceberg.default.sample (
        id bigint,
        data string
    )
    USING iceberg
"""

spark.sql(drop_sample_sql)
spark.sql(create_sample_sql)

DataFrame[]

In [7]:
# Append two rows to the sample table. INSERT INTO appends, so running this
# cell again adds the same two rows a second time.
insert_sample_sql = """
INSERT INTO iceberg.default.sample VALUES (1, 'a'), (2, 'b')
"""
spark.sql(insert_sample_sql)

DataFrame[]

In [8]:
# Show the file_path column of the table's `all_data_files` metadata table.
data_files_sql = """
SELECT file_path FROM iceberg.default.sample.all_data_files
"""
data_files = spark.sql(data_files_sql)
data_files.show()

+--------------------+
|           file_path|
+--------------------+
|file:/usr/local/h...|
|file:/usr/local/h...|
|file:/usr/local/h...|
|file:/usr/local/h...|
+--------------------+

