In [1]:
import os
import logging

from pyspark.sql import SparkSession, types
from common.table import table_exists
from common.ema import make_ema_in_chunks

In [2]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [3]:
landing_date = "2025-09-28"
symbol = "ADAUSDT"
logger.info(f"Transforming symbol={symbol} for date={landing_date}")

PROJECT_PREFIX_UNDERSCORE = os.getenv("PROJECT_PREFIX_UNDERSCORE")
DATA_LAKE_BUCKET = os.getenv("DATA_LAKE_BUCKET")
ICEBERG_LOCK_TABLE = os.getenv("ICEBERG_LOCK_TABLE")

INFO:__main__:Transforming symbol=ADAUSDT for date=2025-09-28


In [4]:
spark = (
    SparkSession.builder
    .appName("Transform Job - Pattern Two") # type: ignore
    .master("local[*]")
    .config("spark.sql.session.timeZone", "UTC")
    .config(
        "spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions",
    )
    .config("spark.sql.catalog.glue_catalog", "org.apache.iceberg.spark.SparkCatalog")
    .config(
        "spark.sql.catalog.glue_catalog.catalog-impl",
        "org.apache.iceberg.aws.glue.GlueCatalog",
    )
    .config("spark.sql.catalog.glue_catalog.lock.table", f"{ICEBERG_LOCK_TABLE}")
    # Disable vectorized reader
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .config("spark.sql.columnVector.offheap.enabled", "false")
    .config("spark.memory.offHeap.enabled", "false")
    .config("spark.sql.catalog.glue_catalog.read.parquet.vectorization.enabled", "false")
    .config("spark.driver.memory", "2g")
    .config("spark.driver.extraJavaOptions", "-XX:MaxDirectMemorySize=1g")
    .config("spark.sql.codegen.wholeStage", "false")
    .config(
        "spark.jars.packages",
        ",".join([
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1",
            "org.apache.iceberg:iceberg-aws-bundle:1.7.1",
        ])
    )
    .getOrCreate()
)

25/12/05 16:04:36 WARN Utils: Your hostname, Nguyens-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.29 instead (on interface en0)
25/12/05 16:04:36 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/anhtu/.ivy2/cache
The jars for the packages stored in: /Users/anhtu/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-aws-bundle added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-04953f11-0d9c-4f9d-bd05-dfe88fecc784;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.7.1 in central
	found org.apache.iceberg#iceberg-aws-bundle;1.7.1 in central
:: resolution report :: resolve 57ms :: artifacts dl 2ms
	:: modules in use:
	org.apache.iceberg#iceberg-aws-bundle;1.7.1 from central in [default]
	org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.7.1 from central in [default]
	---------------

:: loading settings :: url = jar:file:/Users/anhtu/Code/example/crypto-cloud/venv/spark_jobs/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


25/12/05 16:04:37 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [None]:
transform_db = f"glue_catalog.{PROJECT_PREFIX_UNDERSCORE}_transform_db"
klines_table = "klines"
sql_stmt = f"""
select * from {transform_db}.{klines_table}
where landing_date = DATE('{landing_date}') AND symbol = '{symbol}'
"""
df_sorted = (
    spark.sql(sql_stmt)
    .coalesce(1)  # one partition, not shuffle
    .sortWithinPartitions("group_id")
)

logger.info(f"Input rows: {df_sorted.count()}")

INFO:__main__:Input rows: 96                                                    


In [6]:
schema = types.StructType(
    [
        *df_sorted.schema.fields,  # keep all original fields
        types.StructField("ema7", types.DoubleType(), True),
        types.StructField("ema20", types.DoubleType(), True),
    ]
)

In [None]:
pattern_two_table = "pattern_two"
if table_exists(spark, transform_db, pattern_two_table):
    sql_stmt = f"""
    select ema7, ema20 from {transform_db}.{pattern_two_table}
    where landing_date = date_sub(DATE('{landing_date}'), 1) AND symbol = '{symbol}'
    order by group_id desc
    limit 1
    """
    row = spark.sql(sql_stmt).first()
    prev_ema7, prev_ema20 = (row["ema7"], row["ema20"]) if row else (None, None)
else:
    prev_ema7, prev_ema20 = None, None

ema_in_chunks_with_state = make_ema_in_chunks(prev_ema7, prev_ema20)

INFO:common.ema:Using previous EMA values: ema7=0.7825, ema20=0.7825


In [8]:
df = df_sorted.mapInPandas(ema_in_chunks_with_state, schema)

In [9]:
df.createOrReplaceTempView("temp")

df = spark.sql("""
with cte as (
    select
        *,
        case 
            when ema7 > ema20 then 'uptrend' 
            when ema7 < ema20 then 'downtrend' 
            else NULL 
        end as trend,
        LAG(open_price, 1) over(order by group_id) as open_price_prev,
        LAG(close_price, 1) over(order by group_id) as close_price_prev
    from temp
)
select
    group_id,
    group_date,
    open_time,
    open_price,
    high_price,
    low_price,
    close_price,
    volume,
    close_time,
    landing_date,
    symbol,
    ema7,
    ema20,
    trend,
    case 
        when close_price_prev < open_price_prev
            and close_price > open_price
            and open_price < close_price_prev
            and close_price > open_price_prev
            and trend = 'downtrend'
        then 'bullish engulfing'
        when close_price_prev > open_price_prev
            and close_price < open_price
            and open_price > close_price_prev
            and close_price < open_price_prev
            and trend = 'uptrend'
        then 'bearish engulfing'
        else NULL
    end as pattern
from cte
""")

logger.info(f"Output rows: {df.count()}")

INFO:__main__:Output rows: 96                                                   


In [None]:
if table_exists(spark, transform_db, pattern_two_table):
    df.writeTo(f"{transform_db}.{pattern_two_table}").overwritePartitions()
else:
    df.writeTo(f"{transform_db}.{pattern_two_table}").tableProperty(
        "format-version", "2"
    ).partitionedBy("symbol", "landing_date").createOrReplace()

logger.info("✅Transform job completed successfully.")

25/12/05 16:04:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/12/05 16:04:45 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
INFO:__main__:✅Transform job completed successfully.                            


In [None]:
spark.sql(f"""
select * from {transform_db}.{pattern_two_table} where pattern is not null
""").show(20, truncate=False)

                                                                                

+--------+-------------------+----------------+----------+----------+---------+-----------+--------+----------------+------------+-------+------+------+---------+-----------------+
|group_id|group_date         |open_time       |open_price|high_price|low_price|close_price|volume  |close_time      |landing_date|symbol |ema7  |ema20 |trend    |pattern          |
+--------+-------------------+----------------+----------+----------+---------+-----------+--------+----------------+------------+-------+------+------+---------+-----------------+
|1954493 |2025-09-28 07:15:00|1759043701149668|0.7712    |0.7737    |0.7711   |0.773      |509464.1|1759044593275198|2025-09-28  |ADAUSDT|0.7725|0.7739|downtrend|bullish engulfing|
|1954530 |2025-09-28 16:30:00|1759077000870532|0.7876    |0.788     |0.7838   |0.7857     |423035.5|1759077890920769|2025-09-28  |ADAUSDT|0.7833|0.7781|uptrend  |bearish engulfing|
|1954413 |2025-09-27 11:15:00|1758971700405060|0.7837    |0.7853    |0.7837   |0.785      |1215

In [None]:
spark.sql(f"""
select count(*) from {transform_db}.{pattern_two_table}
""").show()

+--------+
|count(1)|
+--------+
|     192|
+--------+



----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 51316)
