In [1]:
import logging
import os
import time
import zipfile

import requests
from pyspark.sql import SparkSession, types
from pyspark.sql import functions as F

In [2]:
# Module-level logger used by every helper and cell below; the root logger is
# configured at INFO so the progress messages show up in the cell output.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

In [3]:
# --- Run parameters: which daily aggTrades archive to land -------------------
symbol = "ADAUSDT"
landing_date = "2025-09-27"

# --- Deployment settings read from the environment ---------------------------
# Each of these is None when the corresponding variable is not set.
PROJECT_PREFIX = os.environ.get("PROJECT_PREFIX")
PROJECT_PREFIX_UNDERSCORE = os.environ.get("PROJECT_PREFIX_UNDERSCORE")
DATA_LAKE_BUCKET = os.environ.get("DATA_LAKE_BUCKET")
ICEBERG_LOCK_TABLE = os.environ.get("ICEBERG_LOCK_TABLE")

In [4]:
def download_file(url, file_name, timeout=(10, 300)):
    """Stream ``url`` to ``file_name`` unless that file already exists.

    The payload is first written to ``<file_name>.part`` and only moved to
    ``file_name`` once the whole body has been received. A failed or
    interrupted download therefore never leaves a truncated file behind that
    the "already exists" shortcut would later mistake for a complete one.

    Args:
        url: HTTP(S) address of the file to fetch.
        file_name: Local destination path.
        timeout: Passed to ``requests.get``; a ``(connect, read)`` tuple in
            seconds, or a single number. Prevents the call from hanging
            indefinitely on a stalled connection.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If the local file cannot be written or moved.
    """
    if os.path.exists(file_name):
        logger.info(f"{file_name} exists")
        return

    partial_name = f"{file_name}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            with open(partial_name, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:
                        f.write(chunk)
        # Promote the finished download in a single step.
        os.replace(partial_name, file_name)
    finally:
        # After a successful os.replace the partial file is gone; if it is
        # still here the download failed or was interrupted, so drop it.
        if os.path.exists(partial_name):
            os.remove(partial_name)

    logger.info(
        f"Downloaded {file_name} {(os.path.getsize(file_name) / (1024 * 1024)):.2f}MB completed"
    )


def remove_file(file_name):
    """Delete ``file_name`` and log the removal; do nothing if it is absent."""
    if not os.path.exists(file_name):
        return
    os.remove(file_name)
    logger.info(f"{file_name} removed")


def extract_file(extract_dir, zip_path):
    """Unpack ``zip_path`` into ``extract_dir`` and return the extracted file path.

    The returned path is derived from the archive's own member list rather
    than from a directory listing, so files left in ``extract_dir`` by earlier
    runs can never be returned by mistake.

    Args:
        extract_dir: Directory to extract into; created if it does not exist.
        zip_path: Path of the zip archive to unpack.

    Returns:
        The path (``extract_dir`` joined with the member name) of the first
        ``.csv`` member of the archive, or of the first file member when the
        archive holds no ``.csv``. ``None`` when ``zip_path`` does not exist
        or the archive contains no files.

    Raises:
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    if not os.path.exists(zip_path):
        logger.info(f"{zip_path} not found")
        return
    os.makedirs(extract_dir, exist_ok=True)
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)
        # Directory entries in a zip end with "/"; only real files qualify.
        members = [name for name in zip_ref.namelist() if not name.endswith("/")]
    if not members:
        logger.warning(f"{zip_path} contains no files")
        return
    csv_members = [name for name in members if name.lower().endswith(".csv")]
    csv_file = os.path.join(extract_dir, (csv_members or members)[0])
    logger.info(f"Extracted CSV: {csv_file}")
    return csv_file

In [5]:
# Work relative to the current directory: the archive is saved next to the
# notebook and unpacked into ./unzipped_data.
script_dir = "./"
extract_dir = os.path.join(script_dir, "unzipped_data")

url = (
    "https://data.binance.vision/data/spot/daily/aggTrades/"
    f"{symbol}/{symbol}-aggTrades-{landing_date}.zip"
)
# The local archive keeps the last path segment of the URL as its name.
file_name = os.path.join(script_dir, url.rsplit("/", 1)[-1])
logger.info(f"Downloading {url} -> {file_name}")

# Time the download and extraction together.
start_t = time.time()
download_file(url, file_name)
csv_file = extract_file(extract_dir, file_name)
end_t = time.time()
elapsed_seconds = end_t - start_t
logger.info(f"Download + Extract processed in {elapsed_seconds:.3f} seconds")

INFO:__main__:Downloading https://data.binance.vision/data/spot/daily/aggTrades/ADAUSDT/ADAUSDT-aggTrades-2025-09-27.zip -> ./ADAUSDT-aggTrades-2025-09-27.zip
INFO:__main__:./ADAUSDT-aggTrades-2025-09-27.zip exists
INFO:__main__:Extracted CSV: ./unzipped_data/ADAUSDT-aggTrades-2025-09-27.csv
INFO:__main__:Download + Extract processed in 0.019 seconds


In [6]:
# Session options, applied in the order listed. They set dynamic partition
# overwrite and a UTC session time zone, switch off vectorized Parquet reads,
# off-heap memory and whole-stage codegen, size the driver, and pull in the
# hadoop-aws package.
SPARK_CONF = {
    "spark.sql.sources.partitionOverwriteMode": "dynamic",
    "spark.sql.session.timeZone": "UTC",
    "spark.sql.parquet.enableVectorizedReader": "false",
    "spark.sql.columnVector.offheap.enabled": "false",
    "spark.memory.offHeap.enabled": "false",
    "spark.sql.catalog.glue_catalog.read.parquet.vectorization.enabled": "false",
    "spark.driver.memory": "2g",
    "spark.driver.extraJavaOptions": "-XX:MaxDirectMemorySize=1g",
    "spark.sql.codegen.wholeStage": "false",
    "spark.jars.packages": "org.apache.hadoop:hadoop-aws:3.3.4",
}

# Local-mode session using every available core.
builder = SparkSession.builder.appName("Landing Job").master("local[*]")
for conf_key, conf_value in SPARK_CONF.items():
    builder = builder.config(conf_key, conf_value)
spark = builder.getOrCreate()

:: loading settings :: url = jar:file:/Users/anhtu/.pyenv/versions/3.11.11/envs/crypto-cloud/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /Users/anhtu/.ivy2/cache
The jars for the packages stored in: /Users/anhtu/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-a56b5b86-7303-4d37-b8ed-cd732f18154e;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 86ms :: artifacts dl 3ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	----------------------------------

In [7]:
# Column layout applied to the header-less CSV, in file order. Every column
# is declared nullable.
AGG_TRADE_COLUMNS = [
    ("agg_trade_id", types.LongType()),
    ("price", types.DoubleType()),
    ("quantity", types.DoubleType()),
    ("first_trade_id", types.LongType()),
    ("last_trade_id", types.LongType()),
    ("timestamp", types.LongType()),
    ("is_buyer_maker", types.BooleanType()),
    ("is_best_match", types.BooleanType()),
]
schema = types.StructType(
    [
        types.StructField(column_name, column_type, True)
        for column_name, column_type in AGG_TRADE_COLUMNS
    ]
)

# Read with the explicit schema instead of inferring one from the data.
reader = spark.read.schema(schema).option("header", "false")
df = reader.csv(csv_file)

In [8]:
# Stamp every row with the date and timestamp of this ingestion run.
df = df.withColumn("ingest_date", F.current_date())
df = df.withColumn("ingest_timestamp", F.current_timestamp())

In [9]:
# DATA_LAKE_BUCKET comes from os.getenv and is None when the variable is not
# set; formatting it into the path would silently produce "s3a://None/...".
# Stop with a clear message before anything is written.
if not DATA_LAKE_BUCKET:
    raise RuntimeError(
        "DATA_LAKE_BUCKET environment variable is not set; cannot build the landing zone path"
    )

# One folder per symbol and landing date; "overwrite" replaces whatever an
# earlier run wrote to that same folder.
output_path = (
    f"s3a://{DATA_LAKE_BUCKET}/landing_zone/spot/daily/aggTrades/{symbol}/{landing_date}"
)
df.write.mode("overwrite").parquet(output_path)
logger.info(f"Parquet written to: {output_path}")

25/11/19 23:23:20 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
INFO:__main__:Parquet written to: s3a://crypto-cloud-dev-650251698703-data-lake-bucket/landing_zone/spot/daily/aggTrades/ADAUSDT/2025-09-27


In [10]:
# Local copies are no longer needed once the Parquet output has been written:
# remove the extracted CSV first, then the downloaded archive.
for local_path in (csv_file, file_name):
    remove_file(local_path)

INFO:__main__:./unzipped_data/ADAUSDT-aggTrades-2025-09-27.csv removed
INFO:__main__:./ADAUSDT-aggTrades-2025-09-27.zip removed


In [11]:
# Read the written Parquet back and print the first 20 rows untruncated.
written_df = spark.read.parquet(output_path)
written_df.show(20, truncate=False)

                                                                                

+------------+------+--------+--------------+-------------+----------------+--------------+-------------+-----------+--------------------------+
|agg_trade_id|price |quantity|first_trade_id|last_trade_id|timestamp       |is_buyer_maker|is_best_match|ingest_date|ingest_timestamp          |
+------------+------+--------+--------------+-------------+----------------+--------------+-------------+-----------+--------------------------+
|411114767   |0.7918|111.5   |711706932     |711706932    |1758931200686229|true          |true         |2025-11-19 |2025-11-19 16:23:20.973466|
|411114768   |0.7918|252.5   |711706933     |711706933    |1758931200711075|true          |true         |2025-11-19 |2025-11-19 16:23:20.973466|
|411114769   |0.7918|240.6   |711706934     |711706937    |1758931202600822|true          |true         |2025-11-19 |2025-11-19 16:23:20.973466|
|411114770   |0.7917|43.1    |711706938     |711706940    |1758931204869784|true          |true         |2025-11-19 |2025-11-19 16