In [1]:
import os

from pyspark.sql import SparkSession

In [2]:
# Connection settings for the MinIO object store. The fallbacks are the
# local-development values this notebook has always used; export the MINIO_*
# environment variables to target another deployment without committing
# credentials to the notebook.
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "http://localhost:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "admin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "admin123")

# Build (or reuse) a local Spark session wired to an Iceberg catalog named
# `hive_catalog`, backed by a Hive Metastore, with S3A pointed at MinIO.
spark = (
    SparkSession.builder.appName("TransformKlines")  # type: ignore
    .master("local[*]")
    .config("spark.sql.session.timeZone", "UTC")
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    # Hive Catalog
    .config("spark.sql.catalog.hive_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config(
        "spark.sql.catalog.hive_catalog.catalog-impl",
        "org.apache.iceberg.hive.HiveCatalog",
    )
    .config(
        "spark.sql.catalog.hive_catalog.uri",
        "thrift://localhost:9083",
    )
    # minio specific configs
    .config(
        "spark.hadoop.fs.s3a.aws.credentials.provider",
        "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider",
    )
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    # Disable vectorized Parquet reader to avoid off-heap memory issues
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .config("spark.sql.columnVector.offheap.enabled", "false")
    .config("spark.memory.offHeap.enabled", "false")
    # The previous key was scoped to `glue_catalog`, which is not configured in
    # this session (only `hive_catalog` is). Use Iceberg's session-level switch
    # so the setting applies to the Iceberg tables read here.
    .config("spark.sql.iceberg.vectorization.enabled", "false")
    .config("spark.driver.memory", "2g")
    .config("spark.driver.extraJavaOptions", "-XX:MaxDirectMemorySize=1g")
    .config("spark.sql.codegen.wholeStage", "false")
    # Extra JARs resolved through Ivy at session start-up.
    .config(
        "spark.jars.packages",
        ",".join(
            [
                "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1",
                "org.apache.iceberg:iceberg-aws-bundle:1.7.1",
                "org.apache.hadoop:hadoop-aws:3.3.4",
            ]
        ),
    )
    # Route both s3:// and s3a:// URIs through the S3A filesystem; path-style
    # access is enabled for the MinIO endpoint configured above.
    .config("spark.hadoop.fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/Users/anhtu/.pyenv/versions/3.11.11/envs/spark/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/anhtu/.ivy2/cache
The jars for the packages stored in: /Users/anhtu/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-aws-bundle added as a dependency
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-62dd526b-e4e1-42f9-830d-017cd02ee7f8;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.7.1 in central
	found org.apache.iceberg#iceberg-aws-bundle;1.7.1 in central
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 102ms :: artifacts dl 4ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.apache.iceberg#iceberg-aws-bundle;1.7.1 from cent

In [3]:
# List the namespaces registered in the Iceberg catalog `hive_catalog`.
show_databases_sql = "SHOW DATABASES IN hive_catalog"
spark.sql(show_databases_sql).show(truncate=False)

+------------+
|namespace   |
+------------+
|default     |
|transform_db|
+------------+



In [4]:
# List the tables that live in the `transform_db` namespace.
show_tables_sql = "SHOW TABLES IN hive_catalog.transform_db"
spark.sql(show_tables_sql).show(truncate=False)

+------------+-----------+-----------+
|namespace   |tableName  |isTemporary|
+------------+-----------+-----------+
|transform_db|aggtrades  |false      |
|transform_db|klines     |false      |
|transform_db|pattern_two|false      |
+------------+-----------+-----------+



In [5]:
# Create the `transform_db` namespace if it is missing, rooted at the
# transform zone of the data-lake bucket.
create_transform_db_sql = """
CREATE DATABASE IF NOT EXISTS hive_catalog.transform_db
LOCATION 's3a://data-lake-bucket/transform_zone/'
"""
spark.sql(create_transform_db_sql)

DataFrame[]

In [6]:
# Show rows of `pattern_two` whose `pattern` column is populated
# (`show` prints at most its default number of rows).
detected_patterns_sql = (
    "select * from hive_catalog.transform_db.pattern_two where pattern is not null"
)
spark.sql(detected_patterns_sql).show(truncate=False)

26/01/04 21:24:03 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


+--------+-------------------+----------------+----------+----------+---------+-----------+--------+----------------+------------+-------+------+------+---------+-----------------+
|group_id|group_date         |open_time       |open_price|high_price|low_price|close_price|volume  |close_time      |landing_date|symbol |ema7  |ema20 |trend    |pattern          |
+--------+-------------------+----------------+----------+----------+---------+-----------+--------+----------------+------------+-------+------+------+---------+-----------------+
|1954413 |2025-09-27 11:15:00|1758971700405060|0.7837    |0.7853    |0.7837   |0.785      |121594.4|1758972591323187|2025-09-27  |ADAUSDT|0.7835|0.784 |downtrend|bullish engulfing|
|1954445 |2025-09-27 19:15:00|1759000502967967|0.7803    |0.7823    |0.78     |0.7821     |95233.0 |1759001384883232|2025-09-27  |ADAUSDT|0.7808|0.7817|downtrend|bullish engulfing|
|1954493 |2025-09-28 07:15:00|1759043701149668|0.7712    |0.7737    |0.7711   |0.773      |5094

                                                                                

In [7]:
# Show the 20 rows of `pattern_two` with the greatest `group_date`.
latest_rows_sql = (
    "select * from hive_catalog.transform_db.pattern_two order by group_date DESC limit 20"
)
spark.sql(latest_rows_sql).show(truncate=False)

+--------+-------------------+----------------+----------+----------+---------+-----------+---------+----------------+------------+-------+------+------+-------+-------+
|group_id|group_date         |open_time       |open_price|high_price|low_price|close_price|volume   |close_time      |landing_date|symbol |ema7  |ema20 |trend  |pattern|
+--------+-------------------+----------------+----------+----------+---------+-----------+---------+----------------+------------+-------+------+------+-------+-------+
|1954559 |2025-09-28 23:45:00|1759103100242780|0.8081    |0.809     |0.807    |0.8086     |598970.9 |1759103999574241|2025-09-28  |ADAUSDT|0.8068|0.8002|uptrend|NULL   |
|1954558 |2025-09-28 23:30:00|1759102205055279|0.8075    |0.8083    |0.8065   |0.8081     |1029547.6|1759103099795691|2025-09-28  |ADAUSDT|0.8062|0.7993|uptrend|NULL   |
|1954557 |2025-09-28 23:15:00|1759101301206536|0.8096    |0.8096    |0.8067   |0.8076     |730088.8 |1759102195688982|2025-09-28  |ADAUSDT|0.8055|0.79