In [1]:
import os
import logging

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from common.table import table_exists

In [2]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
landing_date = "2025-09-28"
symbol = "ADAUSDT"

PROJECT_PREFIX_UNDERSCORE = os.getenv("PROJECT_PREFIX_UNDERSCORE")
DATA_LAKE_BUCKET = os.getenv("DATA_LAKE_BUCKET")
ICEBERG_LOCK_TABLE = os.getenv("ICEBERG_LOCK_TABLE")

In [4]:
spark = (
    SparkSession.builder
    .appName("Transform Job") # type: ignore
    .master("local[*]")
    .config("spark.sql.session.timeZone", "UTC")
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    .config("spark.sql.catalog.glue_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config(
        "spark.sql.catalog.glue_catalog.catalog-impl",
        "org.apache.iceberg.aws.glue.GlueCatalog",
    )
    .config("spark.sql.catalog.glue_catalog.lock.table", f"{ICEBERG_LOCK_TABLE}")
    # Disable vectorized Parquet reader to avoid off-heap memory issues
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .config("spark.sql.columnVector.offheap.enabled", "false")
    .config("spark.memory.offHeap.enabled", "false")
    .config("spark.sql.catalog.glue_catalog.read.parquet.vectorization.enabled", "false")
    .config("spark.driver.memory", "2g")
    .config("spark.driver.extraJavaOptions", "-XX:MaxDirectMemorySize=1g")
    .config("spark.sql.codegen.wholeStage", "false")
    .config(
        "spark.jars.packages",
        ",".join([
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1",
            "org.apache.iceberg:iceberg-aws-bundle:1.7.1",
            "org.apache.hadoop:hadoop-aws:3.3.4",
        ])
    )
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

25/12/05 16:03:21 WARN Utils: Your hostname, Nguyens-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.29 instead (on interface en0)
25/12/05 16:03:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/anhtu/.ivy2/cache
The jars for the packages stored in: /Users/anhtu/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-aws-bundle added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-7e827d65-9875-435a-b960-dd77b96ddaa4;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/anhtu/Code/example/crypto-cloud/venv/spark_jobs/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.7.1 in central
	found org.apache.iceberg#iceberg-aws-bundle;1.7.1 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 100ms :: artifacts dl 4ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.apache.iceberg#iceberg-aws-bundle;1.7.1 from central in [default]
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.7.1 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-------------------------------

In [5]:
output_path = (
    f"s3://{DATA_LAKE_BUCKET}/landing_zone/spot/daily/aggTrades/{symbol}/{landing_date}"
)
logger.info(f"Reading parquet data from {output_path}")
df = spark.read.parquet(output_path)

INFO:__main__:Reading parquet data from s3://crypto-cloud-dev-650251698703-data-lake-bucket/landing_zone/spot/daily/aggTrades/ADAUSDT/2025-09-28
25/12/05 16:03:24 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

In [6]:
df = (
    df.withColumn("timestamp_date", F.from_unixtime(F.col("timestamp") / 1_000_000))
    .withColumn("timestamp_second", (F.col("timestamp") / 1_000_000).cast("long"))
    .withColumn("group_id", (F.col("timestamp_second") / 900).cast("long"))
    .withColumn("group_date", F.from_unixtime(F.col("group_id") * 900))
    .withColumn("transform_date", F.current_date())
    .withColumn("transform_timestamp", F.current_timestamp())
    .withColumn("landing_date", F.to_date(F.lit(landing_date), "yyyy-MM-dd"))
    .withColumn("symbol", F.lit(symbol))
)

In [7]:
transform_db = f"glue_catalog.{PROJECT_PREFIX_UNDERSCORE}_transform_db"
spark.sql(f"""
CREATE DATABASE IF NOT EXISTS {transform_db}
LOCATION 's3://{DATA_LAKE_BUCKET}/transform_zone/'
""")

DataFrame[]

In [8]:
aggtrade_table = "aggtrades"
if table_exists(spark, transform_db, aggtrade_table):
    df.writeTo(f"{transform_db}.{aggtrade_table}").overwritePartitions()
    logger.info(
        f"Table {transform_db}.{aggtrade_table} overwritten for {symbol} on {landing_date}"
    )
else:
    df.writeTo(f"{transform_db}.{aggtrade_table}").tableProperty(
        "format-version", "2"
    ).partitionedBy("symbol", "landing_date").createOrReplace()
    logger.info(f"Table {transform_db}.{aggtrade_table} created for {symbol} on {landing_date}")

INFO:__main__:Table glue_catalog.crypto_cloud_dev_650251698703_transform_db.aggtrades overwritten for ADAUSDT on 2025-09-28


In [10]:
sql_stmt = f"""
select 
    group_id,
    group_date,
    first(timestamp, true) as open_time,
    round(first(price, true), 4) as open_price,
    round(max(price), 4) as high_price,
    round(min(price), 4) as low_price,
    round(last(price, true), 4) as close_price,
    round(sum(quantity), 1) as volume,
    last(timestamp, true) as close_time,
    landing_date,
    symbol
from {transform_db}.{aggtrade_table}
where landing_date = DATE('{landing_date}') AND symbol = '{symbol}'
group by group_id, group_date, landing_date, symbol
"""
logger.info(f"SQL Statement:\n{sql_stmt}")
df_kline = spark.sql(sql_stmt)

INFO:__main__:SQL Statement:

select 
    group_id,
    group_date,
    first(timestamp, true) as open_time,
    round(first(price, true), 4) as open_price,
    round(max(price), 4) as high_price,
    round(min(price), 4) as low_price,
    round(last(price, true), 4) as close_price,
    round(sum(quantity), 1) as volume,
    last(timestamp, true) as close_time,
    landing_date,
    symbol
from glue_catalog.crypto_cloud_dev_650251698703_transform_db.aggtrades
where landing_date = DATE('2025-09-28') AND symbol = 'ADAUSDT'
group by group_id, group_date, landing_date, symbol



In [None]:
klines_table = "klines"
if table_exists(spark, transform_db, klines_table):
    df_kline.writeTo(f"{transform_db}.{klines_table}").overwritePartitions()
    logger.info(f"Table {transform_db}.{klines_table} overwritten for {symbol} on {landing_date}")
else:
    df_kline.writeTo(f"{transform_db}.{klines_table}").tableProperty(
        "format-version", "2"
    ).partitionedBy("symbol", "landing_date").createOrReplace()
    logger.info(f"Table {transform_db}.{klines_table} created for {symbol} on {landing_date}")


INFO:__main__:Table glue_catalog.crypto_cloud_dev_650251698703_serving_db.klines overwritten for ADAUSDT on 2025-09-28


In [None]:
spark.sql(f"""
select * from {transform_db}.{klines_table} order by group_id desc limit 20
""").show()

                                                                                

+--------+-------------------+----------------+----------+----------+---------+-----------+---------+----------------+------------+-------+
|group_id|         group_date|       open_time|open_price|high_price|low_price|close_price|   volume|      close_time|landing_date| symbol|
+--------+-------------------+----------------+----------+----------+---------+-----------+---------+----------------+------------+-------+
| 1954559|2025-09-28 23:45:00|1759103100242780|    0.8081|     0.809|    0.807|     0.8086| 598970.9|1759103999574241|  2025-09-28|ADAUSDT|
| 1954558|2025-09-28 23:30:00|1759102205055279|    0.8075|    0.8083|   0.8065|     0.8081|1029547.6|1759103099795691|  2025-09-28|ADAUSDT|
| 1954557|2025-09-28 23:15:00|1759101301206536|    0.8096|    0.8096|   0.8067|     0.8076| 730088.8|1759102195688982|  2025-09-28|ADAUSDT|
| 1954556|2025-09-28 23:00:00|1759100400429081|    0.8092|    0.8119|   0.8082|     0.8097|1606695.1|1759101291108358|  2025-09-28|ADAUSDT|
| 1954555|2025-09-28

In [None]:
spark.sql(f"""
select count(*) from {transform_db}.{klines_table}
""").show()

+--------+
|count(1)|
+--------+
|     192|
+--------+

