In [1]:
# Import required libraries
from pyspark.sql import SparkSession
from pathlib import Path

# Configure paths
# Directory that is scanned for the yellow_tripdata_*.parquet input files.
DATA_PATH = "/home/iceberg/notebooks/data"
# Database and table identifiers interpolated into the Spark SQL statements
# issued by the cells below.
DATABASE_NAME = "nyc"
TABLE_NAME = "taxis"

print("Libraries and paths configured.")

Libraries and paths configured.


In [2]:
# import requests

# # Define the URL of the JAR file
# jar_url = "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.7.1/iceberg-spark-runtime-3.5_2.12-1.7.1.jar"

# # Define the local path to save the JAR file
# jar_path = "iceberg-spark-runtime-3.5_2.12-1.7.1.jar"

# # Download the JAR
# print(f"Downloading {jar_url}...")
# response = requests.get(jar_url, stream=True)
# if response.status_code == 200:
#     with open(jar_path, "wb") as file:
#         for chunk in response.iter_content(chunk_size=1024):
#             file.write(chunk)
#     print(f"Downloaded JAR to {jar_path}")
# else:
#     print(f"Failed to download JAR. HTTP Status Code: {response.status_code}")


In [3]:

# Initialize SparkSession
def init_spark_session():
    """Obtain a SparkSession for this notebook.

    The session is requested through ``getOrCreate()`` with the application
    name "Jupyter". A progress message is printed before and after the
    session is obtained.

    Returns:
        The SparkSession handed back by the builder.
    """
    print("Initializing Spark session...")
    session_builder = SparkSession.builder.appName("Jupyter")
    session = session_builder.getOrCreate()
    print("Spark session initialized.")
    return session

spark = init_spark_session()

Initializing Spark session...
Spark session initialized.


24/12/31 13:57:22 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
# Print every Spark configuration entry as "key = value", one per line
for conf_key, conf_value in spark.sparkContext.getConf().getAll():
    print(f"{conf_key} = {conf_value}")

spark.eventLog.enabled = true
spark.driver.port = 34265
spark.history.fs.logDirectory = /home/iceberg/spark-events
spark.sql.warehouse.dir = file:/home/iceberg/notebooks/notebooks/spark-warehouse
spark.sql.catalog.demo.s3.endpoint = http://minio:9000
spark.eventLog.dir = /home/iceberg/spark-events
spark.app.id = local-1735653439751
spark.serializer.objectStreamReset = 100
spark.master = local[*]
spark.submit.deployMode = client
spark.app.startTime = 1735653438288
spark.driver.extraJavaOptions = -Djava.net.preferIPv6Addresses=false -XX:+IgnoreUnrecognizedVMOptions --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.invoke=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens=java.base/java.io=ALL-UNNAMED --add-opens=java.base/java.net=ALL-UNNAMED --add-opens=java.base/java.nio=ALL-UNNAMED --add-opens=java.base/java.util=ALL-UNNAMED --add-opens=java.base/java.util.concurrent=ALL-UNNAMED --add-opens=java.base/java.util.concurrent.atomic=ALL-UN

In [5]:

# Discover all parquet files in the data directory
def discover_parquet_files(data_path):
    """Find the yellow-taxi parquet files directly inside a directory.

    Prints the number of matches followed by one ``- <file name>`` line per
    file, in the same order as the returned list.

    Args:
        data_path: Directory to search, as a string or path-like object.
            The glob pattern is non-recursive, so subdirectories are not
            searched.

    Returns:
        list[pathlib.Path]: Paths whose names match
        ``yellow_tripdata_*.parquet``, sorted ascending. The list is empty
        when nothing matches.
    """
    # Sort once and reuse the result for both the listing and the return
    # value instead of sorting the same list twice.
    parquet_files = sorted(Path(data_path).glob("yellow_tripdata_*.parquet"))
    print(f"Found {len(parquet_files)} parquet files:")
    for file in parquet_files:
        print(f"- {file.name}")
    return parquet_files

# Discover the input files once; the resulting list is what the load step
# later iterates over.
parquet_files = discover_parquet_files(DATA_PATH)
# Echo the full list of discovered paths (the function above already printed
# the individual file names).
print(f"Parquet files discovered: {parquet_files}")

Found 24 parquet files:
- yellow_tripdata_2022-01.parquet
- yellow_tripdata_2022-02.parquet
- yellow_tripdata_2022-03.parquet
- yellow_tripdata_2022-04.parquet
- yellow_tripdata_2022-05.parquet
- yellow_tripdata_2022-06.parquet
- yellow_tripdata_2022-07.parquet
- yellow_tripdata_2022-08.parquet
- yellow_tripdata_2022-09.parquet
- yellow_tripdata_2022-10.parquet
- yellow_tripdata_2022-11.parquet
- yellow_tripdata_2022-12.parquet
- yellow_tripdata_2023-01.parquet
- yellow_tripdata_2023-02.parquet
- yellow_tripdata_2023-03.parquet
- yellow_tripdata_2023-04.parquet
- yellow_tripdata_2023-05.parquet
- yellow_tripdata_2023-06.parquet
- yellow_tripdata_2023-07.parquet
- yellow_tripdata_2023-08.parquet
- yellow_tripdata_2023-09.parquet
- yellow_tripdata_2023-10.parquet
- yellow_tripdata_2023-11.parquet
- yellow_tripdata_2023-12.parquet
Parquet files discovered: [PosixPath('/home/iceberg/notebooks/data/yellow_tripdata_2022-01.parquet'), PosixPath('/home/iceberg/notebooks/data/yellow_tripdata_20

In [6]:

# Create an Iceberg table
def create_iceberg_table(spark):
    """Drop and recreate the Iceberg taxi table.

    Runs three SQL statements through ``spark.sql``: create the database if
    it is missing, drop the table if it exists, then create the table with
    the fixed taxi schema. The new table is partitioned by
    ``days(tpep_pickup_datetime)`` and created with the table properties
    listed in the statement (format version 2, merge-on-read deletes, hash
    distribution, gzip-compressed parquet, column metrics settings).

    Args:
        spark: Session object whose ``sql`` method executes the statements.
    """
    print("\nCreating Iceberg table...")

    # The database has to exist before the table can be addressed.
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}")

    qualified_name = f"{DATABASE_NAME}.{TABLE_NAME}"

    # Any existing table of the same name is dropped first, so re-running
    # this cell always starts from an empty table definition.
    drop_statement = f"""
    DROP TABLE IF EXISTS {qualified_name}
    """
    spark.sql(drop_statement)

    create_statement = f"""
    CREATE TABLE IF NOT EXISTS {qualified_name} (
        VendorID              BIGINT,
        tpep_pickup_datetime  TIMESTAMP,
        tpep_dropoff_datetime TIMESTAMP,
        passenger_count       DOUBLE,
        trip_distance         DOUBLE,
        RatecodeID            DOUBLE,
        store_and_fwd_flag    STRING,
        PULocationID          BIGINT,
        DOLocationID          BIGINT,
        payment_type          BIGINT,
        fare_amount           DOUBLE,
        extra                 DOUBLE,
        mta_tax               DOUBLE,
        tip_amount            DOUBLE,
        tolls_amount          DOUBLE,
        improvement_surcharge DOUBLE,
        total_amount          DOUBLE,
        congestion_surcharge  DOUBLE,
        airport_fee           DOUBLE
    )
    USING iceberg
    PARTITIONED BY (days(tpep_pickup_datetime))
    TBLPROPERTIES (
        'write.metadata.version-hint.enabled'='true',
        'write.metadata.metrics.default'='truncate(16)',
        'write.metadata.metrics.column.VendorID'='full',
        'write.metadata.metrics.column.tpep_pickup_datetime'='full',
        'format-version'='2',
        'write.delete.mode'='merge-on-read',
        'write.distribution-mode'='hash',
        'write.parquet.compression-codec'='gzip'
    )
    """
    spark.sql(create_statement)
    print(f"Iceberg table '{qualified_name}' created.")

create_iceberg_table(spark)


Creating Iceberg table...
Iceberg table 'nyc.taxis' created.


In [7]:
 print(f"Iceberg table '{DATABASE_NAME}.{TABLE_NAME}' created with version hint configuration.")

# Verify the table properties
properties = spark.sql(f"""
DESCRIBE TABLE EXTENDED {DATABASE_NAME}.{TABLE_NAME}
""").collect()

print("\nVerifying table properties:")
for row in properties:
    if 'Table Properties' in str(row):
        print(f"Table Properties: {row}")

Iceberg table 'nyc.taxis' created with version hint configuration.

Verifying table properties:
Table Properties: Row(col_name='Table Properties', data_type='[current-snapshot-id=none,format=iceberg/parquet,format-version=2,write.delete.mode=merge-on-read,write.distribution-mode=hash,write.metadata.metrics.column.VendorID=full,write.metadata.metrics.column.tpep_pickup_datetime=full,write.metadata.metrics.default=truncate(16),write.metadata.version-hint.enabled=true,write.parquet.compression-codec=gzip]', comment='')


In [8]:

# Load data into Iceberg table
def load_data_to_iceberg(spark, parquet_files):
    """Append every given Parquet file to the Iceberg table, one write per file.

    Each file is read on its own with ``spark.read.parquet`` and appended via
    ``saveAsTable`` in "append" mode; a progress line is printed before and
    after each file.

    Args:
        spark: Session object used to read the Parquet files.
        parquet_files: Iterable of path objects; each must expose ``.name``
            and be convertible with ``str()``.
    """
    print("\nLoading data into Iceberg table...")

    for parquet_path in parquet_files:
        file_name = parquet_path.name
        print(f"Loading {file_name}...")
        frame = spark.read.parquet(str(parquet_path))
        appender = frame.write.mode("append")
        appender.saveAsTable(f"{DATABASE_NAME}.{TABLE_NAME}")
        print(f"File {file_name} loaded.")

load_data_to_iceberg(spark, parquet_files)


Loading data into Iceberg table...
Loading yellow_tripdata_2022-01.parquet...


                                                                                

File yellow_tripdata_2022-01.parquet loaded.
Loading yellow_tripdata_2022-02.parquet...


                                                                                

File yellow_tripdata_2022-02.parquet loaded.
Loading yellow_tripdata_2022-03.parquet...


                                                                                

File yellow_tripdata_2022-03.parquet loaded.
Loading yellow_tripdata_2022-04.parquet...


                                                                                

File yellow_tripdata_2022-04.parquet loaded.
Loading yellow_tripdata_2022-05.parquet...


                                                                                

File yellow_tripdata_2022-05.parquet loaded.
Loading yellow_tripdata_2022-06.parquet...


                                                                                

File yellow_tripdata_2022-06.parquet loaded.
Loading yellow_tripdata_2022-07.parquet...


                                                                                

File yellow_tripdata_2022-07.parquet loaded.
Loading yellow_tripdata_2022-08.parquet...


                                                                                

File yellow_tripdata_2022-08.parquet loaded.
Loading yellow_tripdata_2022-09.parquet...


                                                                                

File yellow_tripdata_2022-09.parquet loaded.
Loading yellow_tripdata_2022-10.parquet...


                                                                                

File yellow_tripdata_2022-10.parquet loaded.
Loading yellow_tripdata_2022-11.parquet...


                                                                                

File yellow_tripdata_2022-11.parquet loaded.
Loading yellow_tripdata_2022-12.parquet...


                                                                                

File yellow_tripdata_2022-12.parquet loaded.
Loading yellow_tripdata_2023-01.parquet...


                                                                                

File yellow_tripdata_2023-01.parquet loaded.
Loading yellow_tripdata_2023-02.parquet...


                                                                                

File yellow_tripdata_2023-02.parquet loaded.
Loading yellow_tripdata_2023-03.parquet...


                                                                                

File yellow_tripdata_2023-03.parquet loaded.
Loading yellow_tripdata_2023-04.parquet...


                                                                                

File yellow_tripdata_2023-04.parquet loaded.
Loading yellow_tripdata_2023-05.parquet...


                                                                                

File yellow_tripdata_2023-05.parquet loaded.
Loading yellow_tripdata_2023-06.parquet...


                                                                                

File yellow_tripdata_2023-06.parquet loaded.
Loading yellow_tripdata_2023-07.parquet...


                                                                                

File yellow_tripdata_2023-07.parquet loaded.
Loading yellow_tripdata_2023-08.parquet...


                                                                                

File yellow_tripdata_2023-08.parquet loaded.
Loading yellow_tripdata_2023-09.parquet...


                                                                                

File yellow_tripdata_2023-09.parquet loaded.
Loading yellow_tripdata_2023-10.parquet...


                                                                                

File yellow_tripdata_2023-10.parquet loaded.
Loading yellow_tripdata_2023-11.parquet...


                                                                                

File yellow_tripdata_2023-11.parquet loaded.
Loading yellow_tripdata_2023-12.parquet...




File yellow_tripdata_2023-12.parquet loaded.


                                                                                

In [9]:

# Verify the data in the Iceberg table
def verify_table_data(spark):
    """Print the table's total row count followed by a five-row sample.

    Args:
        spark: Session object whose ``sql`` method executes the queries.
    """
    print("\nVerifying data in Iceberg table...")

    # Row count: the single result row carries the value under the "count" alias.
    count_rows = spark.sql(
        f"SELECT COUNT(*) as count FROM {DATABASE_NAME}.{TABLE_NAME}"
    ).collect()
    total_rows = count_rows[0]["count"]
    print(f"Total rows in table: {total_rows}")

    # Small preview of the loaded rows.
    print("\nSample data:")
    spark.sql(f"SELECT * FROM {DATABASE_NAME}.{TABLE_NAME} LIMIT 5").show()

verify_table_data(spark)


Verifying data in Iceberg table...
Total rows in table: 77966324

Sample data:
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|VendorID|tpep_pickup_datetime|tpep_dropoff_datetime|passenger_count|trip_distance|RatecodeID|store_and_fwd_flag|PULocationID|DOLocationID|payment_type|fare_amount|extra|mta_tax|tip_amount|tolls_amount|improvement_surcharge|total_amount|congestion_surcharge|airport_fee|
+--------+--------------------+---------------------+---------------+-------------+----------+------------------+------------+------------+------------+-----------+-----+-------+----------+------------+---------------------+------------+--------------------+-----------+
|       2| 2023-10-12 00:11:15|  2023-10-12 00:41:14|            1.0|        10.22|       1.0|             

In [10]:

# Stop Spark session now that the load and verification cells have finished.
spark.stop()
print("Spark session stopped.")

Spark session stopped.
