In [49]:
spark.stop()

# Start Spark session

In [1]:
import os
import uuid
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from datetime import datetime, timezone, timedelta, date

In [2]:
AWS_S3_ENDPOINT = os.getenv("AWS_S3_ENDPOINT")
NESSIE_URI = os.getenv("NESSIE_URI")
WAREHOUSE = os.getenv("WAREHOUSE")
AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID")
AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY")

In [3]:
conf = (
    pyspark.SparkConf()
        .setAppName('app_name')
        .set('spark.jars.packages', ",".join([
            'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.2',
            'org.projectnessie.nessie-integrations:nessie-spark-extensions-3.5_2.12:0.104.3',
            'org.apache.iceberg:iceberg-aws-bundle:1.9.2',
            'org.postgresql:postgresql:42.5.0'
            ]))
        .set('spark.sql.extensions', ",".join([
            'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions',
            'org.projectnessie.spark.extensions.NessieSparkSessionExtensions'
        ]))
        .set('spark.sql.catalog.nessie', 'org.apache.iceberg.spark.SparkCatalog')
        .set('spark.sql.catalog.nessie.uri', NESSIE_URI)
        .set('spark.sql.catalog.nessie.client-api-version', '2')
        .set('spark.sql.catalog.nessie.ref', 'main')
        .set('spark.sql.catalog.nessie.authentication.type', 'NONE')
        .set('spark.sql.catalog.nessie.catalog-impl', 'org.apache.iceberg.nessie.NessieCatalog')
        .set('spark.sql.catalog.nessie.s3.endpoint', AWS_S3_ENDPOINT)
        .set('spark.sql.catalog.nessie.warehouse', WAREHOUSE)
        .set('spark.sql.catalog.nessie.s3.path-style-access', 'true')
        .set('spark.sql.catalog.nessie.s3.access-key-id', AWS_ACCESS_KEY_ID)
        .set('spark.sql.catalog.nessie.s3.secret-access-key', AWS_SECRET_ACCESS_KEY)
        .set('spark.sql.catalog.nessie.io-impl', 'org.apache.iceberg.aws.s3.S3FileIO')
        .set("spark.sql.iceberg.merge-schema", "true")
        .set("spark.executor.memory", "2g")
        .set("spark.driver.memory", "2g")
        .set("spark.executor.cores", "2")
)

# Start Spark Session
spark = SparkSession.builder.master("spark://spark-master:7077").config(conf=conf).getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

print("Spark Running")

/usr/local/lib/python3.8/site-packages/pyspark/bin/load-spark-env.sh: line 68: ps: command not found


:: loading settings :: url = jar:file:/usr/local/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-aws-bundle added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-f1accb44-bdbd-4b5d-8d85-a43abf3e1257;1.0
	confs: [default]
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.9.2 in central
	found org.projectnessie.nessie-integrations#nessie-spark-extensions-3.5_2.12;0.104.3 in central
	found org.apache.iceberg#iceberg-aws-bundle;1.9.2 in central
	found org.postgresql#postgresql;42.5.0 in central
	found org.checkerframework#checker-qual;3.5.0 in central
downloading https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-3.5_2.12/1.9.2/iceberg-spark-runtime-3.5_2.12-1.9.2.jar ...
	[SUCCESSFUL ] org

Spark Running


# DDL

In [5]:
spark.sql("USE REFERENCE main IN nessie")
spark.sql("CREATE DATABASE nessie.bronze")
spark.sql("CREATE DATABASE nessie.silver")
spark.sql("CREATE DATABASE nessie.gold")

spark.sql("""
    CREATE TABLE nessie.bronze.customer (
        id BIGINT,
        name STRING,
        sex STRING,
        mail STRING,
        birthdate DATE,
        login_username STRING,
        login_password STRING,
        created_at DATE,
        updated_at DATE,
        _ingested_at TIMESTAMP,
        _batch_id STRING,
        _is_deleted BOOLEAN
    )
    USING iceberg
    TBLPROPERTIES ('write.spark.accept-any-schema'='true')
""")

spark.sql("""
    CREATE TABLE nessie.bronze.location (
        id STRING,
        street_address STRING,
        city STRING,
        state STRING,
        zipcode STRING,
        country STRING,
        created_at DATE,
        updated_at DATE,
        _ingested_at TIMESTAMP,
        _batch_id STRING,
        _is_deleted BOOLEAN
    )
    USING iceberg
    TBLPROPERTIES ('write.spark.accept-any-schema'='true')
""")

spark.sql("""
    CREATE TABLE nessie.bronze.customer_location (
        id BIGINT,
        customer_id BIGINT,
        location_id STRING,
        created_at DATE,
        updated_at DATE,
        _ingested_at TIMESTAMP,
        _batch_id STRING,
        _is_deleted BOOLEAN
    )
    USING iceberg
    TBLPROPERTIES ('write.spark.accept-any-schema'='true')
""")

spark.sql("""
    CREATE TABLE nessie.bronze.customer_phone (
        id BIGINT,
        customer_id BIGINT,
        phone_number STRING,
        created_at DATE,
        updated_at DATE,
        _ingested_at TIMESTAMP,
        _batch_id STRING,
        _is_deleted BOOLEAN
    )
    USING iceberg
    TBLPROPERTIES ('write.spark.accept-any-schema'='true')
""")

spark.sql("""
    CREATE TABLE nessie.bronze.shadow_product (
        id STRING,
        product_id STRING,
        product_title STRING,
        currency STRING,
        price DECIMAL(10, 2),
        created_at DATE,
        updated_at DATE,
        _ingested_at TIMESTAMP,
        _batch_id STRING,
        _is_deleted BOOLEAN
    )
    USING iceberg
    TBLPROPERTIES ('write.spark.accept-any-schema'='true')
""")

spark.sql("""
    CREATE TABLE nessie.bronze.category (
        id BIGINT,
        category_name STRING,
        created_at DATE,
        updated_at DATE,
        _ingested_at TIMESTAMP,
        _batch_id STRING,
        _is_deleted BOOLEAN
    )
    USING iceberg
    TBLPROPERTIES ('write.spark.accept-any-schema'='true')
""")

spark.sql("""
    CREATE TABLE nessie.bronze.product_category (
        id BIGINT,
        product_id STRING,
        category_id BIGINT,
        created_at DATE,
        updated_at DATE,
        _ingested_at TIMESTAMP,
        _batch_id STRING,
        _is_deleted BOOLEAN
    )
    USING iceberg
    TBLPROPERTIES ('write.spark.accept-any-schema'='true')
""")

spark.sql("""
    CREATE TABLE nessie.bronze.review (
        id STRING,
        customer_id BIGINT,
        product_id STRING,
        star_rating STRING,
        helpful_votes INT,
        total_votes INT,
        marketplace STRING,
        verified_purchase STRING,
        review_headline STRING,
        review_body STRING,
        created_at DATE,
        updated_at DATE,
        _ingested_at TIMESTAMP,
        _batch_id STRING,
        _is_deleted BOOLEAN
    )
    USING iceberg
    TBLPROPERTIES ('write.spark.accept-any-schema'='true')
""")

DataFrame[]

# ETL processing

In [11]:
ts = datetime.now(timezone(timedelta(hours=7))).replace(tzinfo=None).replace(microsecond=0)
ts = datetime(2010, 1, 1, 1, 1, 1)

table_names = [row["tableName"] for row in spark.sql("SHOW TABLES IN nessie.bronze").select("tableName").collect()]
today = str(ts.date())

etl_processing_branch_name = f"feat/etl-processing-{ts.strftime('%Y-%m-%d-%H-%M-%S')}"
spark.sql(f"""
    CREATE BRANCH
    IF NOT EXISTS {etl_processing_branch_name}
    IN nessie
    FROM main
""")

DataFrame[refType: string, name: string, hash: string]

## Bronze Layer

Create new branch for etl:

    - If success, merge branch
    - If fail, nothing change

In [12]:
bronze_layer_branch_name = f"feat/bronze-layer-{ts.strftime('%Y-%m-%d-%H-%M-%S')}"
spark.sql(f"""
    CREATE BRANCH
    IF NOT EXISTS {bronze_layer_branch_name}
    IN nessie
    FROM {etl_processing_branch_name}
""")

spark.sql(f"USE REFERENCE {bronze_layer_branch_name} IN nessie;")
spark.sql("LIST REFERENCES IN nessie").show(truncate=False)

+-------+---------------------------------------+----------------------------------------------------------------+
|refType|name                                   |hash                                                            |
+-------+---------------------------------------+----------------------------------------------------------------+
|Branch |feat/bronze-layer-2010-01-01-01-01-01  |75ef377c8f6e6eab2f130210679b25982671e2843d15e212ff72d084e2f2f055|
|Branch |feat/etl-processing-2010-01-01-01-01-01|75ef377c8f6e6eab2f130210679b25982671e2843d15e212ff72d084e2f2f055|
|Branch |main                                   |75ef377c8f6e6eab2f130210679b25982671e2843d15e212ff72d084e2f2f055|
+-------+---------------------------------------+----------------------------------------------------------------+



In [13]:
def ingest(table_name: str, today: date) -> bool:
    try:
        # Lấy thời gian gần nhất mà bronze layer được update
        max_updated_at = spark.sql(f"SELECT max(updated_at) as max_updated_at FROM nessie.bronze.{table_name}").collect()[0]["max_updated_at"]
        max_updated_at = "1999-01-01" if max_updated_at is None else str(max_updated_at)
        print(f"max_updated_at: {max_updated_at}")
        
        query = f"""
        (
            SELECT 
                min(updated_at) AS min_date, 
                max(updated_at) AS max_date 
            FROM {table_name} 
            WHERE updated_at > '{max_updated_at}' AND updated_at <= '{today}'
        ) tmp
        """
        bounds = spark.read \
            .format("jdbc") \
            .option("url", "jdbc:postgresql://postgres:5432/oltp") \
            .option("dbtable", query) \
            .option("user", "postgres") \
            .option("password", "postgres") \
            .option("driver", "org.postgresql.Driver") \
            .load() \
            .collect()[0]
        
        lower, upper = bounds["min_date"], bounds["max_date"]
        print(f"lower: {lower}, upper: {upper}")
        
        # Chỉ lấy những record ở source có giá trị updated_at >= giá trị max(updated_at) ở bronze
        query = f"""
        (
            SELECT * 
            FROM {table_name} 
            WHERE updated_at > '{max_updated_at}' AND updated_at <= '{today}'
        ) AS {table_name}
        """
        df = (spark.read
            .format("jdbc")
            .option("url", "jdbc:postgresql://postgres:5432/oltp")
            .option("dbtable", query)
            .option("user", "postgres")
            .option("password", "postgres")
            .option("driver", "org.postgresql.Driver")
            .option("partitionColumn", "id")
            .option("lowerBound", lower)
            .option("upperBound", upper)
            .option("numPartitions", "8")
            .option("partitionColumn", "updated_at")
            .option("lowerBound", lower)
            .option("upperBound", upper)
            .option("numPartitions", "8")
            .option("fetchsize", "10000")
            .load()
        )
        
        # Thêm các cột cần thiết để theo dõi
        batch_id = str(uuid.uuid4())
        df_bronze = (df
          .withColumn("_ingested_at", F.lit(ts))
          .withColumn("_batch_id", F.lit(batch_id))
          .withColumn("_is_deleted", F.lit(False))
        )
        
        # Ghi vào Iceberg
        df_bronze.writeTo(f"nessie.bronze.{table_name}") \
            .option("mergeSchema", "true") \
            .append()
    except Exception as e:
        print(e)
        return False
    return True


success = True
for table_name in table_names:
    success = True if ingest(table_name=table_name, today=today) else False
    print(f"table_name: {table_name}, success: {success}")

max_updated_at: 1999-01-01


                                                                                

lower: 2009-12-13, upper: 2009-12-13


                                                                                

table_name: category, success: True
max_updated_at: 1999-01-01


                                                                                

lower: 2009-12-13, upper: 2009-12-29


                                                                                

table_name: customer, success: True
max_updated_at: 1999-01-01


                                                                                

lower: 2009-12-13, upper: 2009-12-29


                                                                                

table_name: customer_location, success: True
max_updated_at: 1999-01-01


                                                                                

lower: 2009-12-13, upper: 2009-12-29


                                                                                

table_name: customer_phone, success: True
max_updated_at: 1999-01-01


                                                                                

lower: 2009-12-13, upper: 2009-12-29
table_name: location, success: False
max_updated_at: 1999-01-01


                                                                                

lower: 2009-12-13, upper: 2009-12-17


                                                                                

table_name: product_category, success: True
max_updated_at: 1999-01-01


[Stage 35:>                                                         (0 + 1) / 1]

lower: 2009-12-13, upper: 2009-12-29


                                                                                

table_name: review, success: True
max_updated_at: 1999-01-01


                                                                                

lower: 2009-12-13, upper: 2009-12-17


                                                                                

table_name: shadow_product, success: True


In [14]:
spark.sql("USE REFERENCE main IN nessie")
spark.sql(f"MERGE BRANCH {bronze_layer_branch_name} INTO {etl_processing_branch_name} IN nessie")
spark.sql(f"DROP BRANCH {bronze_layer_branch_name} IN nessie")

DataFrame[status: string]

## Silver Layer

In [15]:
silver_layer_branch_name = f"feat/silver-layer-{ts.strftime('%Y-%m-%d-%H-%M-%S')}"
spark.sql(f"""
    CREATE BRANCH
    IF NOT EXISTS {silver_layer_branch_name}
    IN nessie
    FROM {etl_processing_branch_name};
""")

spark.sql(f"USE REFERENCE {silver_layer_branch_name} IN nessie;")
spark.sql("LIST REFERENCES IN nessie").show(truncate=False)

+-------+---------------------------------------+----------------------------------------------------------------+
|refType|name                                   |hash                                                            |
+-------+---------------------------------------+----------------------------------------------------------------+
|Branch |feat/etl-processing-2010-01-01-01-01-01|f13b382fc7e7b250bfefd261a77b6344583f2c4becc18480943f441f6c605580|
|Branch |feat/silver-layer-2010-01-01-01-01-01  |f13b382fc7e7b250bfefd261a77b6344583f2c4becc18480943f441f6c605580|
|Branch |main                                   |75ef377c8f6e6eab2f130210679b25982671e2843d15e212ff72d084e2f2f055|
+-------+---------------------------------------+----------------------------------------------------------------+



In [37]:
customer = spark.table("nessie.bronze.customer")
location = spark.table("nessie.bronze.location")
customer_location = spark.table("nessie.bronze.customer_location")
customer_phone = spark.table("nessie.bronze.customer_phone")
shadow_product = spark.table("nessie.bronze.shadow_product")
category = spark.table("nessie.bronze.category")
review = spark.table("nessie.bronze.review")
product_category = spark.table("nessie.bronze.product_category")

### Schema validation

#### customer

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Data types validation

#### customer

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Deduplicate

#### customer

In [38]:
print(f"Num of records before deduplicate: {customer.count()}")
customer = customer.drop_duplicates()
print(f"Num of records after deduplicate: {customer.count()}")

Num of records before deduplicate: 575
Num of records after deduplicate: 575


#### location

In [None]:
print(f"Num of records before deduplicate: {location.count()}")
location = location.drop_duplicates()
print(f"Num of records after deduplicate: {location.count()}")

#### customer_location

In [None]:
print(f"Num of records before deduplicate: {customer_location.count()}")
customer_location = customer_location.drop_duplicates()
print(f"Num of records after deduplicate: {customer_location.count()}")

#### customer_phone

In [None]:
print(f"Num of records before deduplicate: {customer_phone.count()}")
customer_phone = customer_phone.drop_duplicates()
print(f"Num of records after deduplicate: {customer_phone.count()}")

#### shadow_product

In [None]:
print(f"Num of records before deduplicate: {shadow_product.count()}")
shadow_product = shadow_product.drop_duplicates()
print(f"Num of records after deduplicate: {shadow_product.count()}")

#### category

In [None]:
print(f"Num of records before deduplicate: {category.count()}")
category = category.drop_duplicates()
print(f"Num of records after deduplicate: {category.count()}")

#### review

In [None]:
print(f"Num of records before deduplicate: {review.count()}")
review = review.drop_duplicates()
print(f"Num of records after deduplicate: {review.count()}")

#### product_category

In [34]:
print(f"Num of records before deduplicate: {product_category.count()}")
product_category = product_category.drop_duplicates()
print(f"Num of records after deduplicate: {product_category.count()}")

Num of records before deduplicate: 575


[Stage 67:>                                                         (0 + 1) / 1]

Num of records after deduplicate: 575


                                                                                

### Filtering

#### customer

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Missing data processing

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

In [35]:
df.show()

+--------+--------------------+---+--------------------+----------+--------------+--------------+----------+----------+-------------------+--------------------+-----------+
|      id|                name|sex|                mail| birthdate|login_username|login_password|created_at|updated_at|       _ingested_at|           _batch_id|_is_deleted|
+--------+--------------------+---+--------------------+----------+--------------+--------------+----------+----------+-------------------+--------------------+-----------+
| 8130334|      Michael Valdez|  M|zacharyalexander@...|1985-12-13|      zmarquez|  om%d+8iL1C?x|2009-12-13|2009-12-13|2010-01-01 01:01:01|0cedbf83-747a-41b...|      false|
|15938823|Dr. Melanie Ander...|  F|    cadams@gmail.com|1983-12-13|       bgarner|  :2W76-=a2ZcB|2009-12-13|2009-12-13|2010-01-01 01:01:01|0cedbf83-747a-41b...|      false|
|23830874|     Rachel Anderson|  F|christopherbraun@...|1989-12-13|     chadperez|  ;P2e&rBU#fj8|2009-12-13|2009-12-13|2010-01-01 01:01

In [32]:
df.na.drop(subset=[]).count()

300

### Loại bỏ khoảng trắng không cần thiết.

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Sửa invalid values

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Phát hiện và xử lý outliers

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Data Standardization & Normalization

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Late deduplicate

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Data Enrichment & Derived Columns

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Conformance (chuẩn chung enterprise)

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category

### Slowly Changing Dimension (SCD)

#### customer

In [None]:
df = spark.table("nessie.bronze.customer")
df.printSchema()

#### location

#### customer_location

#### customer_phone

#### shadow_product

#### category

#### review

#### product_category