In [1]:
import logging
import os

from pyspark.sql import SparkSession, types
from pyspark.sql import functions as F
from shared_lib.s3 import upload_to_s3
from shared_lib.file import download_file, remove_file, extract_file 

In [2]:
# Logger used for this notebook's own progress messages.
logger = logging.getLogger(__name__)

# Configure the root logger at INFO so messages from this notebook and from
# the imported helper modules are shown in the cell output.
logging.basicConfig(level=logging.INFO)

In [None]:
# Run parameters: the trading pair and the day whose archive is landed.
symbol = "ADAUSDT"
landing_date = "2025-09-28"

# Target bucket name is taken from the environment; this is None when the
# DATA_LAKE_BUCKET variable is not set.
DATA_LAKE_BUCKET = os.environ.get("DATA_LAKE_BUCKET")

In [4]:
# Local scratch locations: the archive lands in script_dir and is unpacked
# into a sub-directory of it.
script_dir = "/tmp/data/raw"
extract_dir = os.path.join(script_dir, "unzipped_data")

# Daily aggTrades archive for the configured symbol and landing date.
url = f"https://data.binance.vision/data/spot/daily/aggTrades/{symbol}/{symbol}-aggTrades-{landing_date}.zip"

# Keep the remote file name (last URL segment) for the local copy.
zip_path = os.path.join(script_dir, url.rsplit("/", 1)[-1])

# Fetch the archive, then unpack it; extract_file returns the path of the
# extracted CSV, which later cells upload and finally delete.
download_file(url, zip_path)
csv_path = extract_file(extract_dir, zip_path)


INFO:shared_lib.file:Downloading https://data.binance.vision/data/spot/daily/aggTrades/ADAUSDT/ADAUSDT-aggTrades-2025-09-28.zip -> /tmp/data/raw/ADAUSDT-aggTrades-2025-09-28.zip
INFO:shared_lib.file:Downloaded /tmp/data/raw/ADAUSDT-aggTrades-2025-09-28.zip 0.87MB completed in 0.449 seconds
INFO:shared_lib.file:Extracting /tmp/data/raw/ADAUSDT-aggTrades-2025-09-28.zip -> /tmp/data/raw/unzipped_data
INFO:shared_lib.file:Extracted CSV: /tmp/data/raw/unzipped_data/ADAUSDT-aggTrades-2025-09-28.csv in 0.013 seconds


In [5]:
# Session settings, applied in this order on top of the app name and master.
spark_settings = {
    "spark.sql.session.timeZone": "UTC",
    # local mode optimizations to reduce memory consumption
    "spark.sql.parquet.enableVectorizedReader": "false",
    "spark.sql.columnVector.offheap.enabled": "false",
    "spark.memory.offHeap.enabled": "false",
    "spark.sql.catalog.glue_catalog.read.parquet.vectorization.enabled": "false",
    "spark.driver.memory": "2g",
    "spark.driver.extraJavaOptions": "-XX:MaxDirectMemorySize=1g",
    "spark.sql.codegen.wholeStage": "false",
    # S3 access: pull in hadoop-aws and route both s3:// and s3a:// URIs
    # through the S3A filesystem implementation.
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.4",
    "spark.hadoop.fs.s3.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.path.style.access": "true",
}

session_builder = (
    SparkSession.builder
    .appName("Landing Job") # type: ignore
    .master("local[*]")
)
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

25/12/05 16:01:56 WARN Utils: Your hostname, Nguyens-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.29 instead (on interface en0)
25/12/05 16:01:56 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Ivy Default Cache set to: /Users/anhtu/.ivy2/cache
The jars for the packages stored in: /Users/anhtu/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-e32ad6e8-eba6-4cb6-a783-8ccf8e487985;1.0
	confs: [default]


:: loading settings :: url = jar:file:/Users/anhtu/Code/example/crypto-cloud/venv/spark_jobs/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 87ms :: artifacts dl 3ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   3   |   0   |   0   |   0   ||   3   |   0   |
	---------------------------------------------------------------------
:: retrieving :: org.apache.spark#spark-submit-parent-e32ad6e8-eba6-4cb6-a783-8ccf8e487985
	confs: [default]
	

In [6]:
# Fail fast when the bucket is not configured. DATA_LAKE_BUCKET is read from
# the environment and is None when the variable is unset; without this check
# the None would be handed to the upload helper and later interpolated into
# the output location as "s3://None/...".
if not DATA_LAKE_BUCKET:
    raise RuntimeError(
        "DATA_LAKE_BUCKET environment variable is not set; "
        "cannot upload the raw CSV or write the landing dataset."
    )

# Stage the extracted CSV in the data lake; Spark reads it back from the
# URL returned by the upload helper.
s3_url = upload_to_s3(DATA_LAKE_BUCKET, csv_path)

# Column layout of the aggTrades CSV, in file order. The file is read without
# a header row, so names and types are supplied explicitly. All columns are
# nullable.
# NOTE(review): the sample output below shows 16-digit `timestamp` values,
# which looks like epoch microseconds rather than milliseconds -- confirm
# before converting this column downstream.
agg_trade_columns = [
    ("agg_trade_id", types.LongType()),
    ("price", types.DoubleType()),
    ("quantity", types.DoubleType()),
    ("first_trade_id", types.LongType()),
    ("last_trade_id", types.LongType()),
    ("timestamp", types.LongType()),
    ("is_buyer_maker", types.BooleanType()),
    ("is_best_match", types.BooleanType()),
]
schema = types.StructType(
    [
        types.StructField(column_name, column_type, True)
        for column_name, column_type in agg_trade_columns
    ]
)

df = spark.read.option("header", "false").schema(schema).csv(s3_url)

INFO:botocore.credentials:Found credentials in environment variables.
INFO:shared_lib.s3:Uploaded to s3://crypto-cloud-dev-650251698703-data-lake-bucket/raw_zone/ADAUSDT-aggTrades-2025-09-28.csv
25/12/05 16:01:59 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [7]:
# Stamp every row with ingestion metadata: the current date and the current
# timestamp as evaluated by Spark.
df = df.withColumn("ingest_date", F.current_date())
df = df.withColumn("ingest_timestamp", F.current_timestamp())

In [8]:
# Landing-zone destination, keyed by symbol and landing date.
output_path = f"s3://{DATA_LAKE_BUCKET}/landing_zone/spot/daily/aggTrades/{symbol}/{landing_date}"

# Overwrite mode replaces whatever is already stored at this location, so
# re-running the job for the same symbol/date does not append duplicates.
df.write.parquet(output_path, mode="overwrite")
logger.info(f"Parquet written to: {output_path}")

INFO:__main__:Parquet written to: s3://crypto-cloud-dev-650251698703-data-lake-bucket/landing_zone/spot/daily/aggTrades/ADAUSDT/2025-09-28


In [9]:
# Delete the local scratch files: the extracted CSV first, then the archive.
for local_path in (csv_path, zip_path):
    remove_file(local_path)

logger.info("✅ Landing job completed successfully.")

INFO:shared_lib.file:/tmp/data/raw/unzipped_data/ADAUSDT-aggTrades-2025-09-28.csv removed
INFO:shared_lib.file:/tmp/data/raw/ADAUSDT-aggTrades-2025-09-28.zip removed
INFO:__main__:✅ Landing job completed successfully.


In [10]:
# Sanity check: read the dataset back from the landing zone and print the
# first 20 rows without truncating column values.
landed_df = spark.read.parquet(output_path)
landed_df.show(20, truncate=False)

                                                                                

+------------+------+--------+--------------+-------------+----------------+--------------+-------------+-----------+--------------------------+
|agg_trade_id|price |quantity|first_trade_id|last_trade_id|timestamp       |is_buyer_maker|is_best_match|ingest_date|ingest_timestamp          |
+------------+------+--------+--------------+-------------+----------------+--------------+-------------+-----------+--------------------------+
|411153010   |0.781 |6016.6  |711818877     |711818882    |1759017601892334|false         |true         |2025-12-05 |2025-12-05 09:02:01.509167|
|411153011   |0.7811|4051.1  |711818883     |711818888    |1759017602554502|false         |true         |2025-12-05 |2025-12-05 09:02:01.509167|
|411153012   |0.7811|2424.3  |711818889     |711818891    |1759017602555523|false         |true         |2025-12-05 |2025-12-05 09:02:01.509167|
|411153013   |0.7812|49.1    |711818892     |711818896    |1759017602611301|false         |true         |2025-12-05 |2025-12-05 09