In [1]:
# author: haidb
# update date: 2024-01-01 00:49:00
# v2.0.5

import psycopg2
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql import Window
from pyspark.sql.types import IntegerType, StringType
import pandas as pd
import numpy as np
import datetime
import pyspark.sql.dataframe
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.functions import get_json_object, from_json, schema_of_json, lit, when
from pyspark.sql import Window
import pandas as pd
import numpy as np
import datetime
import traceback
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import logging
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, BinaryType, LongType, ArrayType, BooleanType, DoubleType, DecimalType
from pyspark.sql.avro.functions import from_avro, to_avro
import json
from yody_function import last_modify_time
import pytz
from yody_function.deltalake import upsert_deltalake
from yody_function.support_function import list_files_gcs, print_log, remove_file_gcs
from yody_function.bigquery import upsert_bigquery, delete_table_bigquery_from_dataframe
import time
from delta.tables import DeltaTable

In [2]:
# Deployment target. Any value other than "prod" (e.g. "dev") selects the
# development paths and dataset in the environment-resolution cell.
ENV = "prod"

# Root of the lakehouse bucket; environment-specific paths are appended to it.
HDFS_MASTER = "gs://yody-lakehouse"

# Passed to SparkSession.builder.master(...) when the session is created.
SPARK_HOME = "yarn"

# Upper bound on rows per partition used by this job.
max_row_per_partition = 100_000

# Local (UTC+7) timezone used for the "*_07" time columns.
timezone_07 = pytz.timezone("Asia/Ho_Chi_Minh")

In [3]:
# Spark session for the fact_order_line job: runs on the master named by
# SPARK_HOME, loads the BigQuery connector and Delta Lake jars, and enables the
# Delta SQL extension / catalog.
sparkSession = (
    SparkSession.builder.appName("fact_order_line_7")
    .master(SPARK_HOME)
    # Set once; the original configured this key twice with the same value.
    .config("spark.sql.debug.maxToStringFields", 255)
    .config("spark.sql.parquet.enableVectorizedReader", "false")
    .config(
        "spark.jars",
        "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.28.0.jar,gs://yody-lakehouse/job/jar_file/delta-core_2.12-1.0.1.jar",
    )
    # NOTE(review): "spark.executor.max" is not a documented Spark property and
    # is presumably ignored. If the intent is to cap executors at 2, the
    # documented keys are "spark.executor.instances" or
    # "spark.dynamicAllocation.maxExecutors" - confirm before changing, since
    # that would alter cluster sizing.
    .config("spark.executor.max", "2")
    .config("spark.executor.memory", "2g")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.databricks.delta.retentionDurationCheck.enabled", "false")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/25 04:32:21 INFO org.apache.spark.SparkEnv: Registering MapOutputTracker
24/05/25 04:32:21 INFO org.apache.spark.SparkEnv: Registering BlockManagerMaster
24/05/25 04:32:21 INFO org.apache.spark.SparkEnv: Registering BlockManagerMasterHeartbeat
24/05/25 04:32:21 INFO org.apache.spark.SparkEnv: Registering OutputCommitCoordinator


In [4]:
# Resolve the environment-specific storage locations and BigQuery dataset.
# Only the values that actually differ between prod and dev live in the branches.
# NOTE(review): "lading-zone" is the literal prefix used by this job; confirm it
# matches the bucket layout (it may have been meant as "landing-zone").
if ENV == "prod":
    DATA_SOURCE = "/lading-zone/prod/prod_order_service/"
    DATA_STORE = "/dwh/prod/fact/"
    gs_path_save_data = f"{HDFS_MASTER}/staging/prod/fact_order_line_history"
    dataset = "prod_yody_analytics"
else:
    DATA_SOURCE = "/lading-zone/dev/dev_order_service/"
    DATA_STORE = "/dwh/dev/fact/"
    gs_path_save_data = f"{HDFS_MASTER}/staging/dev/fact_order_line_history"
    dataset = "dev_yody_analytics"

# The raw order log sits under the environment's source folder.
gcs_path_read_data = f"{HDFS_MASTER}{DATA_SOURCE}order_log_test/"

# The target BigQuery table name is the same in every environment.
table_bigquery_name = "fact_sales"

### Definitions

In [5]:
# Known sales channels: numeric channel_id -> channel_code fallback.
list_channel = [
    {"channel_id": 1, "channel_code": "pos"},
    {"channel_id": 2, "channel_code": "FB"},
    {"channel_id": 3, "channel_code": "Shopee"},
    {"channel_id": 4, "channel_code": "WEBSITE"},
    {"channel_id": 5, "channel_code": "APP"},
    {"channel_id": 13, "channel_code": "admin"},
    {"channel_id": 15, "channel_code": "lazada"},
    {"channel_id": 17, "channel_code": "tiki"},
    {"channel_id": 20, "channel_code": "tiktok"},
    {"channel_id": 21, "channel_code": "api"},
    {"channel_id": 33, "channel_code": "o2o"},
]

# Order-group labels (Vietnamese literals are data values, kept verbatim):
# purchase order / return order / exchange order.
GROUP_DON_MUA = "đơn mua"
GROUP_DON_TRA = "đơn trả"
GROUP_DON_DOI = "đơn đổi"

# Column expression mapping channel_id to its channel_code, NULL when the id is
# not listed. Every branch, including the first, is derived from list_channel so
# the mapping cannot drift from the table above.
_first_channel, *_other_channels = list_channel
fill_channel = F.when(F.col("channel_id") == _first_channel["channel_id"], _first_channel["channel_code"])
for _channel in _other_channels:
    fill_channel = fill_channel.when(F.col("channel_id") == _channel["channel_id"], _channel["channel_code"])
fill_channel = fill_channel.otherwise(None)

### Schema

In [6]:
# Schema used to parse the JSON order-log payload with from_json.
#
# Sub-structures that occur more than once in the payload are declared once
# below and reused, so the copies cannot drift apart. All fields rely on the
# PySpark defaults (StructField nullable=True, ArrayType containsNull=True),
# which is what every explicit flag in the previous version spelled out.

# {id, code, name, deleted}: shape of `reason` and `sub_reason`, on orders and returns.
_REASON_STRUCT = StructType(
    [
        StructField("id", IntegerType()),
        StructField("code", StringType()),
        StructField("name", StringType()),
        StructField("deleted", BooleanType()),
    ]
)

# Key/value metadata entry: shape of `nts_metafields` and `metafields` elements.
_METAFIELD_STRUCT = StructType(
    [
        StructField("id", IntegerType()),
        StructField("owner_resource", StringType()),
        StructField("owner_id", IntegerType()),
        StructField("key", StringType()),
        StructField("value", StringType()),
        StructField("deleted", BooleanType()),
    ]
)

# Order-level discount: shape of `discounts` elements on orders and returns.
_ORDER_DISCOUNT_STRUCT = StructType(
    [
        StructField("id", IntegerType()),
        StructField("rate", DoubleType()),
        StructField("value", DoubleType()),
        StructField("amount", DoubleType()),
        StructField("taxable_amount", DoubleType()),
        StructField("promotion_id", IntegerType()),
        StructField("promotion_title", StringType()),
        StructField("discount_code", StringType()),
        StructField("taxable", BooleanType()),
        StructField("deleted", BooleanType()),
    ]
)

# Line-level discount: shape of `items[].discount_items` elements.
_LINE_DISCOUNT_STRUCT = StructType(
    [
        StructField("id", IntegerType()),
        StructField("rate", DoubleType()),
        StructField("value", DoubleType()),
        StructField("promotion_id", IntegerType()),
        StructField("promotion_title", StringType()),
        StructField("discount_code", StringType()),
        StructField("amount", DoubleType()),
        StructField("type", StringType()),
        StructField("taxable", BooleanType()),
        StructField("deleted", BooleanType()),
    ]
)

# Order line: shape of `items` elements on orders and on returns.
_ORDER_ITEM_STRUCT = StructType(
    [
        StructField("id", IntegerType()),
        StructField("created_date", LongType()),
        StructField("updated_date", LongType()),
        StructField("order_id", IntegerType()),
        StructField("order_return_id", IntegerType()),
        StructField("sku", StringType()),
        StructField("variant_id", IntegerType()),
        StructField("product_id", IntegerType()),
        StructField("quantity", IntegerType()),
        StructField("price", DoubleType()),
        StructField("amount", DoubleType()),
        StructField("line_amount_after_line_discount", DoubleType()),
        StructField("discount_rate", DoubleType()),
        StructField("discount_value", DoubleType()),
        StructField("discount_amount", DoubleType()),
        StructField("total_tax_line", DoubleType()),
        StructField("taxable", BooleanType()),
        StructField("distributed_order_discount", DoubleType()),
        StructField("deleted", BooleanType()),
        StructField("discount_items", ArrayType(_LINE_DISCOUNT_STRUCT)),
    ]
)

# One element of `order_returns`. Note that `sub_reason` precedes `reason` here,
# and that the payment struct differs from the order-level one.
_ORDER_RETURN_STRUCT = StructType(
    [
        StructField("id", IntegerType()),
        StructField("created_date", LongType()),
        StructField("updated_date", LongType()),
        StructField("created_on", LongType()),
        StructField("created_by", StringType()),
        StructField("updated_by", StringType()),
        StructField("created_name", StringType()),
        StructField("updated_name", StringType()),
        StructField("marketer_code", StringType()),
        StructField("code", StringType()),
        StructField("order_id", IntegerType()),
        StructField("order_type", StringType()),
        StructField("reference_code", StringType()),
        StructField("company_id", IntegerType()),
        StructField("company", StringType()),
        StructField("store_id", IntegerType()),
        StructField("returned_store_id", IntegerType()),
        StructField("status", StringType()),
        StructField("source_id", IntegerType()),
        StructField("financial_store_id", IntegerType()),
        StructField("financial_source_id", IntegerType()),
        StructField("source", StringType()),
        StructField("customer_id", IntegerType()),
        StructField("account_code", StringType()),
        StructField("assignee_code", StringType()),
        StructField("sub_status", StringType()),
        StructField("sub_status_code", StringType()),
        StructField("channel_id", IntegerType()),
        StructField("channel", StringType()),
        StructField("channel_code", StringType()),
        StructField("payment_status", StringType()),
        StructField("tags", StringType()),
        StructField("total_line_amount_after_line_discount", DoubleType()),
        StructField("customer_phone_number", StringType()),
        StructField("total", DoubleType()),
        StructField("total_quantities", IntegerType()),
        StructField("total_discount", DoubleType()),
        StructField("total_tax", DoubleType()),
        StructField("receive_date", LongType()),
        StructField("point_refund", IntegerType()),
        StructField("money_refund", DoubleType()),
        StructField("order_exchange_id", IntegerType()),
        StructField("sub_reason", _REASON_STRUCT),
        StructField("reason", _REASON_STRUCT),
        StructField("items", ArrayType(_ORDER_ITEM_STRUCT)),
        StructField("discounts", ArrayType(_ORDER_DISCOUNT_STRUCT)),
        StructField(
            "payments",
            ArrayType(
                StructType(
                    [
                        StructField("created_date", LongType()),
                        StructField("paid_amount", DoubleType()),
                        StructField("payment_method_code", StringType()),
                        StructField("status", StringType()),
                        StructField("deleted", BooleanType()),
                    ]
                )
            ),
        ),
    ]
)

SCHEMA_ORDER_LOG = StructType(
    [
        # --- scalar order attributes ---
        StructField("has_contract", BooleanType()),
        StructField("has_invoice", BooleanType()),
        StructField("id", IntegerType()),
        StructField("code", StringType()),
        StructField("version", IntegerType()),
        StructField("created_by", StringType()),
        StructField("created_name", StringType()),
        StructField("split_order_type", StringType()),
        StructField("pick_store_type", StringType()),
        StructField("confirm_type", StringType()),
        StructField("updated_by", StringType()),
        StructField("updated_name", StringType()),
        StructField("created_date", LongType()),
        StructField("created_on", LongType()),
        StructField("updated_date", LongType()),
        StructField("reference_code", StringType()),
        StructField("company_id", IntegerType()),
        StructField("company", StringType()),
        StructField("store_id", IntegerType()),
        StructField("financial_store_id", IntegerType()),
        StructField("store", StringType()),
        StructField("status", StringType()),
        StructField("source_id", IntegerType()),
        StructField("source", StringType()),
        StructField("customer_id", IntegerType()),
        StructField("account_code", StringType()),
        StructField("account", StringType()),
        StructField("marketer_code", StringType()),
        StructField("marketer", StringType()),
        StructField("coordinator_code", StringType()),
        StructField("coordinator", StringType()),
        StructField("assignee_code", StringType()),
        StructField("origin_order", StringType()),
        StructField("assignee", StringType()),
        StructField("channel_id", IntegerType()),
        StructField("channel_code", StringType()),
        StructField("note", StringType()),
        StructField("customer_note", StringType()),
        StructField("channel", StringType()),
        StructField("sub_status_code", StringType()),
        StructField("sub_status", StringType()),
        StructField("type", StringType()),
        StructField("customer_phone_number", StringType()),
        StructField("tags", StringType()),
        StructField("campaign_id", IntegerType()),
        StructField("total_line_amount_after_line_discount", DoubleType()),
        StructField("total", DoubleType()),
        StructField("shipping_fee_informed_to_customer", DoubleType()),
        StructField("delivery_fee_charged_by_platform", DoubleType()),
        StructField("uniform", BooleanType()),
        StructField("deleted", BooleanType()),
        # --- nested collections and structs ---
        StructField(
            "payments",
            ArrayType(
                StructType(
                    [
                        StructField("paid_amount", DoubleType()),
                        StructField("point", DoubleType()),
                        StructField("payment_method_code", StringType()),
                        StructField("deleted", BooleanType()),
                    ]
                )
            ),
        ),
        StructField("nts_metafields", ArrayType(_METAFIELD_STRUCT)),
        StructField("metafields", ArrayType(_METAFIELD_STRUCT)),
        StructField("discounts", ArrayType(_ORDER_DISCOUNT_STRUCT)),
        StructField(
            "fulfillments",
            ArrayType(
                StructType(
                    [
                        StructField("id", IntegerType()),
                        StructField("created_date", LongType()),
                        StructField("updated_date", LongType()),
                        StructField("status", StringType()),
                        StructField("sku", StringType()),
                        StructField("packer_code", StringType()),
                        StructField("shipped_on", LongType()),
                        StructField("deleted", BooleanType()),
                    ]
                )
            ),
        ),
        StructField(
            "special_order",
            StructType(
                [
                    StructField("id", IntegerType()),
                    StructField("created_date", LongType()),
                    StructField("updated_date", LongType()),
                    StructField("order_id", IntegerType()),
                    StructField("order_original_code", StringType()),
                    StructField("order_carer_code", StringType()),
                    StructField("order_carer_name", StringType()),
                    StructField("order_return_code", StringType()),
                    StructField("amount", DoubleType()),
                    StructField("ecommerce", StringType()),
                    StructField("type", StringType()),
                    StructField("reason", StringType()),
                ]
            ),
        ),
        StructField(
            "shipping_address",
            StructType(
                [
                    StructField("id", IntegerType()),
                    StructField("order_id", IntegerType()),
                    StructField("ward_id", IntegerType()),
                    StructField("full_address", StringType()),
                    StructField("deleted", BooleanType()),
                ]
            ),
        ),
        StructField("reason", _REASON_STRUCT),
        StructField("sub_reason", _REASON_STRUCT),
        StructField(
            "utm_tracking",
            StructType(
                [
                    StructField("id", IntegerType()),
                    StructField("created_date", LongType()),
                    StructField("updated_date", LongType()),
                    StructField("order_id", IntegerType()),
                    StructField("utm_id", StringType()),
                    StructField("utm_source", StringType()),
                    StructField("utm_medium", StringType()),
                    StructField("utm_campaign", StringType()),
                    StructField("utm_term", StringType()),
                    StructField("utm_content", StringType()),
                    StructField("deleted", BooleanType()),
                ]
            ),
        ),
        StructField("items", ArrayType(_ORDER_ITEM_STRUCT)),
        StructField("order_returns", ArrayType(_ORDER_RETURN_STRUCT)),
    ]
)

In [7]:
# Output contract of the fact table: (column name, target Spark type) in the
# exact order the columns are emitted.
_FACT_ORDER_LINE_COLUMN_TYPES = [
    ("order_type", StringType()),
    ("order_status", StringType()),
    ("order_source", StringType()),
    ("reference_code", StringType()),
    ("source_store_key", StringType()),
    ("sub_status_code", StringType()),
    ("created_by", StringType()),
    ("channel_code", StringType()),
    ("is_uniform", StringType()),
    ("account_code", StringType()),
    ("assignee_code", StringType()),
    ("sub_status", StringType()),
    ("customer_phone_number", StringType()),
    ("marketer_code", StringType()),
    ("coordinator_code", StringType()),
    ("order_code", StringType()),
    ("price_rule_order_line_title", StringType()),
    ("price_rule_order_title", StringType()),
    ("reason", StringType()),
    ("note", StringType()),
    ("customer_note", StringType()),
    ("discount_code_order_line", StringType()),
    ("order_tags", StringType()),
    ("discount_code_order", StringType()),
    ("order_id", LongType()),
    ("order_line_id", LongType()),
    ("variant_id", LongType()),
    ("inventory_id", LongType()),
    ("order_date_key_07", LongType()),
    ("source_store_id", LongType()),
    ("inventory_key", LongType()),
    ("financial_store_id", LongType()),
    ("financial_source_store_key", StringType()),
    ("accounting_source_store_key", StringType()),
    ("operation_source_store_key", StringType()),
    ("order_return_id", LongType()),
    ("sub_reason_id", LongType()),
    ("order_finished_date_key_07", LongType()),
    ("out_of_stock", LongType()),
    ("quantity", LongType()),
    ("campaign_key", LongType()),
    ("customer_id", LongType()),
    ("customer_key", LongType()),
    ("price_rule_order_line_id", LongType()),
    ("price_rule_order_id", LongType()),
    ("total", DecimalType(19, 2)),
    ("total_after_discount", DecimalType(19, 2)),
    ("total_discount_amount", DecimalType(19, 2)),
    ("price_no_vat", DecimalType(19, 2)),
    ("amount_no_vat", DecimalType(19, 2)),
    ("product_discount", DecimalType(19, 2)),
    ("distributed_shipping_after_point", DecimalType(19, 2)),
    ("distributed_point", DecimalType(19, 2)),
    ("distributed_ship_value", DecimalType(19, 2)),
    ("distributed_order_discount", DecimalType(19, 2)),
    ("tax_line", DecimalType(19, 2)),
    ("product_amount_after_discount", DecimalType(19, 2)),
    ("order_amount_after_discount", DecimalType(19, 2)),
    ("net_amount_no_vat", DecimalType(19, 2)),
    ("price", DecimalType(19, 2)),
    ("price_in_order", DecimalType(21, 2)),
    ("paid_amount", DecimalType(19, 2)),
    ("cogs_price", DecimalType(19, 2)),
    ("cogs_price_v1", DecimalType(19, 2)),
    ("cogs_price_v2", DecimalType(21, 2)),
    ("order_created_time_07", TimestampType()),
    ("order_finished_time_07", TimestampType()),
    ("__updated_at", TimestampType()),
    ("order_log_id", LongType()),
    ("is_deleted", IntegerType()),
    ("referral_phone_number", StringType()),
    ("special_order_type", StringType()),
    ("special_order_carer_code", StringType()),
    ("special_order_carer_name", StringType()),
    ("special_order_original_code", StringType()),
    ("special_order_return_code", StringType()),
    ("special_order_amount", DecimalType(19, 2)),
    ("special_order_reason", StringType()),
    ("special_order_e_commerce", StringType()),
    ("order_group", StringType()),
    ("transaction_type", StringType()),
    ("packer_code", StringType()),
    ("origin_order_code", StringType()),
    ("is_accessory_suggestion", BooleanType()),
    ("company_code", StringType()),
    ("has_invoice", IntegerType()),
    ("has_contract", IntegerType()),
    ("distributed_fee_marketplace", DecimalType(19, 2)),
    ("split_order_type", StringType()),
    ("pick_store_type", StringType()),
    ("confirm_type", StringType()),
]

# Cast expressions, one per fact column, ready to be passed to DataFrame.select.
SCHEMA_FACT_ORDER_LINE = [
    col(column_name).cast(spark_type) for column_name, spark_type in _FACT_ORDER_LINE_COLUMN_TYPES
]

# Additional columns (UTM tracking and shipping address) with the Spark SQL type
# name each one is cast to, in output order.
_SAVE_ADD_COLUMN_TYPES = [
    ("utm_id", "string"),
    ("utm_source", "string"),
    ("utm_medium", "string"),
    ("utm_campaign", "string"),
    ("utm_term", "string"),
    ("utm_content", "string"),
    ("ward_id", "int"),
    ("shipping_address_id", "int"),
    ("shipping_full_address", "string"),
]

SCHEMA_SAVE_ADD = [col(column_name).cast(type_name) for column_name, type_name in _SAVE_ADD_COLUMN_TYPES]

### ETL logic

In [8]:
def base(df_raw):
    """Parse the raw order-log rows and derive the columns shared by later steps.

    The input is first passed through ``last_modify_time(df_raw, "root_id", "id")``
    (presumably keeping the most recent log row per ``root_id`` - confirm against
    that helper). The JSON in ``data`` is then parsed with ``SCHEMA_ORDER_LOG``
    into the struct column ``c``, ``data`` is dropped, and a fixed set of columns
    is derived from ``c``.

    Args:
        df_raw: Spark DataFrame with at least the ``data`` (JSON string) and
            ``created_date`` columns, plus whatever ``last_modify_time`` needs.

    Returns:
        The DataFrame with the parsed struct ``c`` and the derived columns added.
    """

    def _metafield_value(key, default):
        # Value of the non-deleted metafield whose upper-cased key equals `key`;
        # `default` is used both as the fold's start value and as the NULL fallback.
        lookup_sql = (
            "aggregate(filter(nts_metafields, x -> ((x.deleted == 0) and "
            f"(upper(x.key) == '{key}'))), '{default}', (acc, x) -> x.value)"
        )
        return F.coalesce(F.expr(lookup_sql), F.lit(default))

    parsed = (
        last_modify_time(df_raw, "root_id", "id")
        .withColumn("c", from_json(F.col("data"), SCHEMA_ORDER_LOG))
        .drop("data")
    )

    # Applied strictly in this order: the metafield lookups read the
    # `nts_metafields` column created a few entries earlier.
    derived_columns = [
        ("items", F.col("c.items")),
        ("utm_tracking", F.col("c.utm_tracking")),
        ("order_returns", F.col("c.order_returns")),
        ("is_uniform", F.col("c.uniform")),
        # Shift the row's created_date by +7 hours.
        ("__updated_at", F.col("created_date") + F.expr("INTERVAL 7 HOURS")),
        # Use `nts_metafields` when present, otherwise fall back to `metafields`.
        ("nts_metafields", F.coalesce(F.col("c.nts_metafields"), F.col("c.metafields"))),
        ("origin_order_code", F.coalesce(F.col("c.origin_order"), F.lit(""))),
        ("has_contract", F.col("c.has_contract")),
        ("has_invoice", F.col("c.has_invoice")),
        ("delivery_fee_charged_by_platform", F.coalesce(F.col("c.delivery_fee_charged_by_platform"), F.lit(0.0))),
        ("referral_phone_number", _metafield_value("REFERRAL", "")),
        ("is_accessory_suggestion", _metafield_value("IS_ACCESSORY_SUGGESTION", "false")),
        ("company_code", F.lit("YODY")),
    ]

    df_base = parsed
    for column_name, expression in derived_columns:
        df_base = df_base.withColumn(column_name, expression)

    return df_base

#### Đơn tạo

In [9]:
def don_tao(df_base):
    """Build the order-line rows for purchase orders (order group ``GROUP_DON_MUA``).

    The function flattens the parsed order struct ``c`` into order-level columns,
    derives the order type / store keys / per-unit allocation ratios, and then
    hands over to ``chi_so_cung_logic`` to explode the order into order lines.

    Args:
        df_base: output of ``base``; must contain the struct column ``c`` plus
            ``items``, ``order_returns``, ``id``, ``is_deleted`` and
            ``delivery_fee_charged_by_platform``.

    Returns:
        DataFrame produced by ``chi_so_cung_logic(..., 1)`` (values keep their sign).
    """
    # out_of_stock is True for cancelled orders with sub-status "out_of_stock",
    # False for finished orders and NULL otherwise.
    cond_out_of_stock_true = (col("order_status") == lit("cancelled")) & (col("sub_status_code") == lit("out_of_stock"))
    cond_out_of_stock_false = col("order_status") == lit("finished")

    # Flatten the fields we need out of the parsed struct.
    df_don_tao_order = (
        df_base.drop(*["order_returns"])
        .withColumnRenamed("id", "order_log_id")
        .withColumnRenamed("is_deleted", "orders_deleted")
        # sum of the amounts of the non-deleted order-level discounts
        .withColumn(
            "amount", F.aggregate(F.filter(F.col("c.discounts"), lambda x: x.deleted == 0), F.lit(0).cast("double"), lambda acc, x: acc + x.amount)
        )
        # last non-deleted payment
        .withColumn("payments", F.element_at(F.expr("filter(c.payments, d -> !d.deleted)"), -1))
        # last non-deleted fulfillment that has a shipped_on value
        .withColumn("fulfillments", F.element_at(F.expr("filter(c.fulfillments, d -> (!d.deleted) and (d.shipped_on is not null))"), -1))
        # last non-deleted, non-cancelled fulfillment that has a packer
        .withColumn(
            "fulfillments_active",
            F.element_at(F.expr("filter(c.fulfillments, d -> (!d.deleted) and (d.status != 'cancelled') and (d.packer_code is not null))"), -1),
        )
        .withColumn("source", F.col("c.source"))
        .withColumn("order_status", F.col("c.status"))
        .withColumn("store_id", F.col("c.store_id"))
        .withColumn("source_id", F.col("c.source_id"))
        .withColumn("channel_id", F.col("c.channel_id"))
        .withColumn("channel_code", F.upper(F.coalesce(F.col("c.channel_code"), fill_channel)))
        .withColumn("channel", F.col("c.channel"))
        .withColumn("financial_store_id", F.upper(F.col("c.financial_store_id")))
        .withColumn("transaction_type", F.coalesce(F.upper(F.col("c.type")), F.lit("")))
        .withColumn("reference_code", F.coalesce(F.col("c.reference_code"), F.lit("")))
        .withColumn("created_by", F.upper(F.col("c.created_by")))
        .withColumn("order_id", F.col("c.id"))
        .withColumn("sub_status_code", F.col("c.sub_status_code"))
        .withColumn("assignee_code", F.upper(F.col("c.assignee_code")))
        .withColumn("account_code", F.upper(F.col("c.account_code")))
        .withColumn("sub_status", F.col("c.sub_status"))
        .withColumn("inventory_id", F.col("c.store_id"))
        .withColumn("inventory_key", F.col("c.store_id"))
        .withColumn("customer_phone_number", F.col("c.customer_phone_number"))
        .withColumn("marketer_code", F.upper(F.col("c.marketer_code")))
        .withColumn("coordinator_code", F.upper(F.col("c.coordinator_code")))
        .withColumn("order_code", F.col("c.code"))
        .withColumn("campaign_key", F.col("c.campaign_id"))
        .withColumn("customer_id", F.col("c.customer_id"))
        .withColumn("customer_key", F.col("c.customer_id"))
        .withColumn("order_amount_after_discount", F.coalesce(F.col("c.total_line_amount_after_line_discount"), F.lit(0)))
        .withColumn("order_tags", F.col("c.tags"))
        .withColumn("reason", F.col("c.reason.name"))
        .withColumn("sub_reason_id", F.col("c.sub_reason.id"))
        .withColumn("note", F.col("c.note"))
        .withColumn("customer_note", F.col("c.customer_note"))
        .withColumn("total_line_amount_after_line_discount", F.coalesce(F.col("c.total_line_amount_after_line_discount"), F.lit(0)))
        .withColumn("shipping_fee_informed_to_customer", F.coalesce(F.col("c.shipping_fee_informed_to_customer"), F.lit(0)))
        # shipped_on is divided by 1000 before the timestamp cast (presumably epoch milliseconds)
        .withColumn("shipped_on", (F.col("fulfillments.shipped_on") / 1000).cast("timestamp"))
        # creation time in Asia/Ho_Chi_Minh; created_on wins, created_date is the fallback
        .withColumn(
            "order_created_time_07",
            F.coalesce(
                F.from_utc_timestamp(F.to_timestamp(F.col("c.created_on") / 1000), timezone_07.zone),
                F.from_utc_timestamp(F.to_timestamp(F.col("c.created_date") / 1000), timezone_07.zone),
            ),
        )
        # total paid with loyalty points (non-deleted, non-zero "point" payments)
        .withColumn(
            "paid_amount_point",
            F.coalesce(
                F.expr(
                    "aggregate(filter(c.payments, x -> ((x.payment_method_code == 'point') and (x.deleted == 0) and (x.paid_amount != 0))), double(0), (acc, x) -> acc + double(x.paid_amount))"
                ),
                F.lit(0),
            ),
        )
        # total quantity over the items whose deleted flag is not 1
        .withColumn(
            "total_quantities",
            F.coalesce(F.expr("aggregate(filter(items, x -> ((x.deleted != 1))), double(0), (acc, x) -> acc + double(x.quantity))"), F.lit(0)),
        )
        .withColumn("packer_code", F.coalesce(F.upper(F.col("fulfillments_active.packer_code")), F.lit("")))
        # purchase orders carry no return id
        .withColumn("order_return_id", F.lit(-1))
        .withColumn("order_group", F.lit(GROUP_DON_MUA))
        .withColumn("split_order_type", F.coalesce(F.col("c.split_order_type"), F.lit("")))
        .withColumn("pick_store_type", F.coalesce(F.col("c.pick_store_type"), F.lit("")))
        .withColumn("confirm_type", F.coalesce(F.col("c.confirm_type"), F.lit("")))
    )

    # Special-order attributes (empty string / 0 when absent).
    df_don_tao_order = (
        df_don_tao_order.withColumn("special_order_type", F.coalesce(F.col("c.special_order.type"), F.lit("")))
        .withColumn("special_order_carer_code", F.coalesce(F.col("c.special_order.order_carer_code"), F.lit("")))
        .withColumn("special_order_carer_name", F.coalesce(F.col("c.special_order.order_carer_name"), F.lit("")))
        .withColumn("special_order_original_code", F.coalesce(F.col("c.special_order.order_original_code"), F.lit("")))
        .withColumn("special_order_return_code", F.coalesce(F.col("c.special_order.order_return_code"), F.lit("")))
        .withColumn("special_order_amount", F.coalesce(F.col("c.special_order.amount"), F.lit(0)))
        .withColumn("special_order_reason", F.coalesce(F.col("c.special_order.reason"), F.lit("")))
        .withColumn("special_order_e_commerce", F.coalesce(F.col("c.special_order.ecommerce"), F.lit("")))
    )

    # Shipping address attributes (-1 / empty string when absent).
    df_don_tao_order = (
        df_don_tao_order.withColumn("ward_id", F.coalesce(F.col("c.shipping_address.ward_id"), F.lit(-1)))
        .withColumn("shipping_address_id", F.coalesce(F.col("c.shipping_address.id"), F.lit(-1)))
        .withColumn("shipping_full_address", F.coalesce(F.col("c.shipping_address.full_address"), F.lit("")))
    )

    # Level-2 calculations: everything below only needs the flattened columns,
    # so the parsed struct and the helper structs are dropped first.
    df_don_tao_order = (
        df_don_tao_order.drop(*["c", "payments", "fulfillments", "discounts"])
        .withColumn("order_source", F.when(col("source") == "POS", "OFFLINE").when(col("source").like("%Ecom%"), "ECOM").otherwise("ONLINE"))
        .withColumn(
            "order_type",
            F.when(F.upper(col("channel")).isin(["POS", "O2O"]), "OFFLINE").when(col("channel").startswith("Web"), "ECOM").otherwise("ONLINE"),
        )
        # POS -> selling store, O2O -> financial store, anything else -> source
        .withColumn(
            "accounting_source_store_id",
            F.when(F.upper(col("channel")) == "POS", col("store_id"))
            .when(F.upper(col("channel")) == "O2O", col("financial_store_id"))
            .otherwise(col("source_id")),
        )
        # key prefix: "ST" for OFFLINE orders, "SO" otherwise
        .withColumn(
            "accounting_source_store_key",
            F.when(col("order_type") == "OFFLINE", F.concat(F.lit("ST"), col("accounting_source_store_id").cast(StringType()))).otherwise(
                F.concat(F.lit("SO"), col("accounting_source_store_id").cast(StringType()))
            ),
        )
        .withColumn("out_of_stock", F.when(cond_out_of_stock_true, True).when(cond_out_of_stock_false, False).otherwise(None))
        .withColumn("order_date_key_07", F.date_format(F.col("order_created_time_07"), "yyyyMMdd").cast(LongType()))
        .withColumn("order_finished_time_07", F.from_utc_timestamp(F.col("shipped_on"), timezone_07.zone))
        .withColumn("order_finished_date_key_07", F.date_format(F.col("order_finished_time_07"), "yyyyMMdd").cast(LongType()))
        .withColumn("total_line_full_discount", F.col("order_amount_after_discount") - F.col("amount"))
        # points paid beyond the discounted goods value are attributed to shipping ...
        .withColumn(
            "shipping_after_point",
            F.when(
                F.col("paid_amount_point") > F.col("total_line_full_discount"), F.col("paid_amount_point") - F.col("total_line_full_discount")
            ).otherwise(0),
        )
        # ... and the points attributed to goods are capped at the discounted goods value
        .withColumn(
            "paid_amount_point",
            F.when(F.col("paid_amount_point") <= F.col("total_line_full_discount"), F.col("paid_amount_point")).otherwise(
                F.col("total_line_full_discount")
            ),
        )
        # Allocation ratios used later to distribute order-level amounts onto
        # lines; each one is 0 when its denominator is 0.
        .withColumn(
            "average_discount_raw",
            F.when(F.col("total_line_amount_after_line_discount") == 0, 0).otherwise(
                F.col("amount") / F.col("total_line_amount_after_line_discount")
            ),
        )
        .withColumn(
            "average_ship_raw",
            F.when(F.col("total_quantities") == 0, 0).otherwise(F.col("shipping_fee_informed_to_customer") / F.col("total_quantities")),
        )
        .withColumn(
            "average_delivery_fee_charged_by_platform",
            F.when(F.col("total_quantities") == 0, 0).otherwise(F.col("delivery_fee_charged_by_platform") / F.col("total_quantities")),
        )
        .withColumn(
            "average_point_raw",
            F.when(F.col("total_line_amount_after_line_discount") == 0, 0).otherwise(
                F.col("paid_amount_point") / F.col("total_line_amount_after_line_discount")
            ),
        )
        .withColumn(
            "average_shipping_after_point",
            F.when(F.col("total_line_amount_after_line_discount") == 0, 0).otherwise(
                F.col("shipping_after_point") / F.col("total_line_amount_after_line_discount")
            ),
        )
    )

    # Level 3: the finished time wins over the created time; for purchase orders
    # the operation store key is the same as the accounting store key.
    df_don_tao_order = df_don_tao_order.withColumn(
        "_final_update_time", F.coalesce(F.col("order_finished_time_07"), F.col("order_created_time_07"))
    ).withColumn("operation_source_store_key", F.col("accounting_source_store_key"))

    # Logic shared with the return flow (explode into order lines, COGS, ...).
    df_don_tao_order = chi_so_cung_logic(df_don_tao_order, 1)
    return df_don_tao_order

#### Đơn trả

In [10]:
def don_tra(df_base):
    """Build the order-line rows for returns and exchanges.

    Every element of ``order_returns`` becomes one return record. A record with
    no ``order_exchange_id`` is tagged ``GROUP_DON_TRA`` (return), otherwise
    ``GROUP_DON_DOI`` (exchange). Columns that only exist for purchase orders
    are filled with neutral defaults so the result has the same columns as
    ``don_tao`` before it is passed to ``chi_so_cung_logic``.

    Args:
        df_base: output of ``base``; must contain the struct column ``c``,
            ``order_returns`` and the order-level columns selected below.

    Returns:
        DataFrame produced by ``chi_so_cung_logic(..., -1)`` (amounts and
        quantities are negated there).
    """
    # out_of_stock is False for finished returns and NULL otherwise.
    cond_out_of_stock_false_return = col("order_status") == lit("finished")
    # non-deleted, paid payments made with points (either as payment or refund)
    lambda_payment_created_date = (
        lambda x: (x.deleted == 0) & (F.lower(x.status) == "paid") & (F.lower(x.payment_method_code).isin("point_refund", "point"))
    )

    # Flatten the fields we need; `or` is one exploded order_returns element.
    df_don_tra = (
        df_base.withColumnRenamed("id", "order_log_id")
        .withColumn("order_channel_code", F.upper(col("c.channel_code")))
        .withColumn("order_source_id", col("c.source_id"))
        .withColumn("reference_code", F.coalesce(F.col("c.reference_code"), F.lit("")))
        .drop(*["items"])
        .select(
            "__updated_at",
            "is_uniform",
            "reference_code",
            "order_channel_code",
            "order_source_id",
            "order_log_id",
            "referral_phone_number",
            "utm_tracking",
            "origin_order_code",
            "company_code",
            "has_invoice",
            "has_contract",
            F.explode(F.col("order_returns")).alias("or"),
        )
        .withColumn("order_group", F.when(col("or.order_exchange_id").isNull(), GROUP_DON_TRA).otherwise(GROUP_DON_DOI))
        # sum of the amounts of the non-deleted discounts on the return
        .withColumn(
            "amount", F.aggregate(F.filter(F.col("or.discounts"), lambda x: x.deleted == 0), F.lit(0).cast("double"), lambda acc, x: acc + x.amount)
        )
        .withColumn("first_payment", F.element_at(F.filter(F.col("or.payments"), lambda_payment_created_date), 1))
        .withColumn(
            "first_payment_created_date_07", F.from_utc_timestamp(F.to_timestamp(F.col("first_payment.created_date") / 1000), timezone_07.zone)
        )
        .withColumn("order_tags", F.col("or.tags"))
        .withColumn("financial_store_id", F.col("or.financial_store_id"))
        .withColumn("financial_source_id", F.col("or.financial_source_id"))
        # the source of the return wins, the source of the original order is the fallback
        .withColumn("source_id", F.coalesce(col("or.source_id"), col("order_source_id")))
        .withColumn("source", F.col("or.source"))
        .withColumn("store_id_tranf", F.coalesce(F.col("or.financial_store_id"), F.col("or.store_id")))
        .withColumn("order_return_id", F.col("or.id"))
        # total refunded as points (non-deleted "point_refund" payments)
        .withColumn(
            "paid_amount_point",
            F.coalesce(
                F.expr(
                    "aggregate(filter(or.payments, x -> ((x.payment_method_code == 'point_refund') and (x.deleted == 0))), double(0), (acc, x) -> acc + double(x.paid_amount))"
                ),
                F.lit(0),
            ),
        )
        # NOTE(review): despite its name this column holds the SUM of paid_amount
        # over all non-deleted payments, not a date. Left unchanged because
        # downstream usage is not visible here -- confirm the intended meaning.
        .withColumn(
            "first_payment_created_date",
            F.coalesce(F.expr("aggregate(filter(or.payments, x -> (x.deleted == 0)), double(0), (acc, x) -> acc + double(x.paid_amount))"), F.lit(0)),
        )
        .withColumn("order_created_time", F.when(F.col("or.created_on").isNull(), F.col("or.created_date")).otherwise(F.col("or.created_on")))
        .withColumn("order_id", F.col("or.order_id"))
        .withColumn("order_code", F.col("or.code"))
        .withColumn("order_status", F.col("or.status"))
        .withColumn("channel_id", F.col("or.channel_id"))
        .withColumn("channel_code", F.upper(F.coalesce(F.col("or.channel_code"), fill_channel)))
        .withColumn("channel", F.col("or.channel"))
        .withColumn("inventory_id", F.col("or.store_id"))
        .withColumn("store_id", F.col("or.store_id"))
        .withColumn("inventory_key", F.col("inventory_id"))
        .withColumn("customer_id", F.col("or.customer_id"))
        .withColumn("customer_key", F.col("or.customer_id"))
        .withColumn("campaign_key", F.lit(None))
        .withColumn("transaction_type", F.coalesce(F.upper(F.col("or.order_type")), F.lit("")))
        .withColumn("receive_date_07", F.from_utc_timestamp(F.to_timestamp(F.col("or.receive_date") / 1000), timezone_07.zone))
        .withColumn("total_line_amount_after_line_discount", F.coalesce(F.col("or.total_line_amount_after_line_discount"), F.lit(0)))
        .withColumn("sub_status", F.lit(""))
        .withColumn("coordinator_code", F.lit(""))
        .withColumn("sub_status_code", F.lit(""))
        .withColumn("created_by", F.upper(F.col("or.created_by")))
        .withColumn("assignee_code", F.upper(F.col("or.assignee_code")))
        .withColumn("account_code", F.upper(F.col("or.account_code")))
        .withColumn("customer_phone_number", F.col("or.customer_phone_number"))
        # NOTE(review): not upper-cased here, unlike in don_tao -- confirm whether intended
        .withColumn("marketer_code", F.col("or.marketer_code"))
        .withColumn("average_shipping_after_point", F.lit(0))
        .withColumn("sub_reason_id", F.col("or.sub_reason.id"))
        .withColumn("reason", F.col("or.reason.name"))
        .withColumn("note", F.lit(""))
        .withColumn("customer_note", F.lit(""))
        .withColumn("split_order_type", F.lit(""))
        .withColumn("pick_store_type", F.lit(""))
        .withColumn("confirm_type", F.lit(""))
        .withColumn("is_accessory_suggestion", F.lit(False))
        # from here on `items` are the returned items, not the purchased ones
        .withColumn("items", F.col("or.items"))
        .drop(*["or"])
    )

    # Special-order attributes do not apply to returns: neutral defaults.
    df_don_tra = (
        df_don_tra.withColumn("special_order_type", F.lit(""))
        .withColumn("special_order_carer_code", F.lit(""))
        .withColumn("special_order_carer_name", F.lit(""))
        .withColumn("special_order_original_code", F.lit(""))
        .withColumn("special_order_return_code", F.lit(""))
        .withColumn("special_order_amount", F.lit(0))
        .withColumn("special_order_reason", F.lit(""))
        .withColumn("special_order_e_commerce", F.lit(""))
        .withColumn("packer_code", F.lit(""))
    )

    # Shipping attributes do not apply to returns either.
    df_don_tra = (
        df_don_tra.withColumn("ward_id", F.lit(-1)).withColumn("shipping_address_id", F.lit(-1)).withColumn("shipping_full_address", F.lit(""))
    )

    # Return-level timestamps, order type and allocation ratios.
    df_don_tra = (
        df_don_tra.withColumn("order_created_time_07", F.from_utc_timestamp(F.to_timestamp(F.col("order_created_time") / 1000), timezone_07.zone))
        .withColumn("order_date_key_07", F.date_format(F.col("order_created_time_07"), "yyyyMMdd").cast(LongType()))
        # a return counts as finished when the goods are received
        .withColumn("order_finished_time_07", F.col("receive_date_07"))
        .withColumn("order_finished_date_key_07", F.date_format(F.col("order_finished_time_07"), "yyyyMMdd").cast(LongType()))
        .withColumn(
            "order_type",
            F.when(F.upper(col("channel")).isin(["POS", "O2O"]), "OFFLINE").when(col("channel").startswith("Web"), "ECOM").otherwise("ONLINE"),
        )
        .withColumn("out_of_stock", F.when(cond_out_of_stock_false_return, False).otherwise(None))
        # ratios are 0 when the denominator is 0
        .withColumn(
            "average_point_raw",
            F.when(F.col("total_line_amount_after_line_discount") == 0, 0).otherwise(
                F.col("paid_amount_point") / F.col("total_line_amount_after_line_discount")
            ),
        )
        .withColumn("order_amount_after_discount", F.col("total_line_amount_after_line_discount"))
        .withColumn("average_ship_raw", F.lit(0.0))
        .withColumn("order_source", F.when(col("source") == "POS", "OFFLINE").when(col("source").like("%Ecom%"), "ECOM").otherwise("ONLINE"))
        .withColumn(
            "average_discount_raw",
            F.when(F.col("total_line_amount_after_line_discount") == 0, 0).otherwise(
                F.col("amount") / F.col("total_line_amount_after_line_discount")
            ),
        )
        .withColumn("average_delivery_fee_charged_by_platform", F.lit(0.0))
    )

    # ######### store-key expressions #########
    # The accounting rule for returns of O2O orders changes for records whose
    # final update time is after 2023-12-11 23:59:59.
    COND_BEFORE_20231212 = col("_final_update_time") <= "2023-12-11 23:59:59"

    # accounting key for returns
    FUNC_ACCOUNTING_DON_TRA = (
        F.when(
            F.col("order_channel_code") == "O2O",
            F.when(COND_BEFORE_20231212, F.concat(F.lit("ST"), F.col("financial_store_id"))).otherwise(
                F.when(col("channel_code").isin("O2O", "POS"), F.concat(F.lit("ST"), col("store_id"))).otherwise(
                    F.concat(F.lit("SO"), col("source_id"))
                )
            ),
        )
        .when(F.upper(F.col("channel_code")) == "POS", F.concat(F.lit("ST"), F.col("store_id")))
        .otherwise(F.concat(F.lit("SO"), F.col("source_id").cast("string")))
    )
    # accounting key for exchanges
    FUNC_ACCOUNTING_DON_DOI = F.when(F.col("channel_code").isin("POS", "O2O"), F.concat(F.lit("ST"), F.col("store_id"))).otherwise(
        F.concat(F.lit("SO"), col("source_id"))
    )

    # operation key for returns: driven by the channel of the ORIGINAL order
    FUN_OPERATION_DON_TRA = (
        F.when(F.upper(F.col("order_channel_code")) == "O2O", F.concat(F.lit("ST"), F.col("financial_store_id")))
        .when(F.upper(F.col("order_channel_code")) == "POS", F.concat(F.lit("ST"), F.col("financial_store_id")))
        .otherwise(F.concat(F.lit("SO"), F.col("financial_source_id").cast("string")))
    )
    # operation key for exchanges
    FUN_OPERATION_DON_DOI = F.when(F.col("channel_code").isin("POS", "O2O"), F.concat(F.lit("ST"), F.col("store_id"))).otherwise(
        F.concat(F.lit("SO"), col("source_id"))
    )

    # ######### apply #########
    # _final_update_time must be added first: COND_BEFORE_20231212 reads it.
    df_don_tra = (
        df_don_tra.withColumn("_final_update_time", F.coalesce(F.col("order_finished_time_07"), F.col("order_created_time_07")))
        .withColumn(
            "accounting_source_store_key", F.when(col("order_group") == GROUP_DON_TRA, FUNC_ACCOUNTING_DON_TRA).otherwise(FUNC_ACCOUNTING_DON_DOI)
        )
        .withColumn("operation_source_store_key", F.when(col("order_group") == GROUP_DON_TRA, FUN_OPERATION_DON_TRA).otherwise(FUN_OPERATION_DON_DOI))
    )

    df_don_tra = chi_so_cung_logic(df_don_tra, -1)
    return df_don_tra

#### Cùng logic

In [11]:
def chi_so_cung_logic(df, _type=1):
    """Explode orders into order lines and compute the metrics shared by purchases and returns.

    Relies on the module-level frames ``df_product`` and ``df_cogs_v2`` and on
    the column lists ``SCHEMA_FACT_ORDER_LINE`` and ``SCHEMA_SAVE_ADD``.

    Args:
        df: order-level DataFrame from ``don_tao`` or ``don_tra``; must contain
            ``items``, ``utm_tracking``, the ``average_*`` allocation ratios,
            the order timestamps and both ``*_source_store_key`` columns.
        _type: ``-1`` negates the amounts and the quantity (used for
            returns/exchanges); any other value leaves the values untouched.

    Returns:
        DataFrame with one row per order line, projected onto
        ``SCHEMA_FACT_ORDER_LINE + SCHEMA_SAVE_ADD``.
    """
    # Records up to this moment use the accounting store key / COGS v1,
    # later records use the operation store key / COGS v2.
    LAST_DATE_2023 = "2023-12-31 23:59:59"

    # One row per order line (`ol`).
    df_temp = (
        df.select("*", F.explode(F.col("items")).alias("ol"))
        .withColumn("variant_id", F.col("ol.variant_id"))
        .join(df_product, "variant_id", "left")
        .fillna(0, subset=["retail_price"])
        .drop("items")
        # last non-deleted line-level discount
        .withColumn("discount_items", F.element_at(F.expr("filter(ol.discount_items, d -> !d.deleted)"), -1))
        .withColumn("price_in_order", F.col("ol.price"))
        # presumably strips a 10% VAT from the price -- TODO confirm
        .withColumn("price_no_vat", F.col("price_in_order") * F.lit(10 / 11))
        # zero-priced lines of orders dated 2023-08-01 or later are valued at the retail price
        .withColumn(
            "price",
            F.when((F.col("price_in_order") == 0) & (F.col("order_date_key_07") >= 20230801), F.col("retail_price")).otherwise(
                F.col("price_in_order")
            ),
        )
        .withColumn("quantity", F.col("ol.quantity"))
        .withColumn("order_line_deleted", F.col("ol.deleted"))
        .withColumn("amount_no_vat", F.col("price_no_vat") * F.col("ol.quantity"))
        .withColumn("total", F.col("quantity") * F.col("price"))
        .withColumn("order_line_id", F.col("ol.id"))
        .withColumn("tax_line", F.col("ol.total_tax_line"))
        .withColumn("line_amount_after_line_discount", F.col("ol.line_amount_after_line_discount"))
        .withColumn("discount_rate", F.col("ol.discount_rate"))
        .withColumn("discount_item_type", F.col("discount_items.type"))
        .withColumn("price_rule_order_line_id", F.col("discount_items.promotion_id"))
        .withColumn("price_rule_order_line_title", F.col("discount_items.promotion_title"))
        .withColumn("discount_code_order_line", F.col("discount_items.discount_code"))
        .withColumn("taxable", F.col("discount_items.taxable").cast("int"))
        # order-level amounts distributed onto the line: shipping and platform
        # fee by quantity, points and discounts by line amount
        .withColumn("distributed_ship_value", F.col("average_ship_raw") * F.col("quantity"))
        .withColumn("distributed_fee_marketplace", F.col("average_delivery_fee_charged_by_platform") * F.col("quantity"))
        .withColumn("distributed_point", F.coalesce(F.col("average_point_raw") * F.col("line_amount_after_line_discount"), F.lit(0)))
        .withColumn("distributed_order_discount", F.coalesce(F.col("average_discount_raw") * F.col("line_amount_after_line_discount"), F.lit(0)))
        .withColumn("distributed_shipping_after_point", F.col("average_shipping_after_point") * F.col("line_amount_after_line_discount"))
        .withColumn("product_discount", F.col("total") - F.col("line_amount_after_line_discount"))
        .withColumn("total_discount_amount", F.col("product_discount") + F.col("distributed_point") + F.col("distributed_order_discount"))
        .withColumn("total_after_discount", F.when(F.col("total") == 0, 0).otherwise(F.col("total") - F.col("total_discount_amount")))
        .withColumn("product_amount_after_discount", F.col("line_amount_after_line_discount"))
        .withColumn("net_amount_no_vat", F.col("total_after_discount") - F.col("tax_line"))
        .withColumn("paid_amount", F.col("total_after_discount") + F.col("distributed_ship_value"))
        .withColumn("is_deleted", F.col("ol.deleted"))
        .withColumn("price_rule_order_id", F.lit(-1))
        .withColumn("price_rule_order_title", F.lit(""))
        .withColumn("discount_code_order", F.lit(""))
        .withColumn("financial_source_store_key", F.lit(""))
    )

    # UTM tracking attributes (empty string when absent).
    df_temp = (
        df_temp.withColumn("utm_id", F.coalesce(F.col("utm_tracking.utm_id"), F.lit("")))
        .withColumn("utm_source", F.coalesce(F.col("utm_tracking.utm_source"), F.lit("")))
        .withColumn("utm_medium", F.coalesce(F.col("utm_tracking.utm_medium"), F.lit("")))
        .withColumn("utm_campaign", F.coalesce(F.col("utm_tracking.utm_campaign"), F.lit("")))
        .withColumn("utm_term", F.coalesce(F.col("utm_tracking.utm_term"), F.lit("")))
        .withColumn("utm_content", F.coalesce(F.col("utm_tracking.utm_content"), F.lit("")))
    )

    # COGS join: same variant, and the reference time falls inside the validity
    # window [start_date_key_07, end_date_key_07). The reference time is the
    # finished time when it exists, otherwise the created time.
    condition_join_cogs = (F.col("d.variant_id") == F.col("c.variant_id")) & (
        (
            F.col("order_finished_time_07").isNull()
            & (F.col("order_created_time_07") >= F.col("start_date_key_07"))
            & (F.col("order_created_time_07") < F.col("end_date_key_07"))
        )
        | (
            F.col("order_finished_time_07").isNotNull()
            & (F.col("order_finished_time_07") >= F.col("start_date_key_07"))
            & (F.col("order_finished_time_07") < F.col("end_date_key_07"))
        )
    )

    # Both cogs_price_v1 and cogs_price_v2 are read from the same `mac` column.
    df_temp = (
        df_temp.alias("d")
        .join(df_cogs_v2.alias("c"), condition_join_cogs, "left")
        .selectExpr("d.*", "c.mac as cogs_price_v2", "c.mac as cogs_price_v1")
        .fillna(0, subset=["cogs_price_v1", "cogs_price_v2"])
    )

    # Switch the store key and the COGS version on LAST_DATE_2023.
    df_temp = (
        df_temp.withColumn(
            "source_store_key",
            F.when(F.col("_final_update_time") <= LAST_DATE_2023, F.col("accounting_source_store_key")).otherwise(
                F.col("operation_source_store_key")
            ),
        )
        # drop the two-character "ST"/"SO" prefix
        .withColumn("source_store_id", F.substring(F.col("source_store_key"), 3, 255))
        .withColumn("cogs_price", F.when(F.col("_final_update_time") <= LAST_DATE_2023, F.col("cogs_price_v1")).otherwise(F.col("cogs_price_v2")))
    )

    df_temp2 = df_temp.select(SCHEMA_FACT_ORDER_LINE + SCHEMA_SAVE_ADD)

    # Returns: multiply every decimal column and `quantity` by -1, except the
    # unit prices which always stay positive.
    COLUMN_POSITIVE = ["price", "price_in_order", "price_no_vat", "cogs_price_v1", "cogs_price_v2", "cogs_price"]
    if _type == -1:
        # A single projection instead of one withColumn call per column, which
        # would stack one projection per negated column in the plan.
        df_temp2 = df_temp2.select(
            *[
                (F.col(field) * F.lit(-1)).alias(field)
                if (("decimal" in dtype) or (field == "quantity")) and (field not in COLUMN_POSITIVE)
                else F.col(field)
                for field, dtype in df_temp2.dtypes
            ]
        )
    return df_temp2

#### Save data

In [12]:
def save_data(df_save):
    """Upsert the order lines into BigQuery and append a snapshot to the Delta history table.

    Args:
        df_save: order-line DataFrame (union of ``don_tao`` and ``don_tra``
            results). It must contain the ``SCHEMA_FACT_ORDER_LINE`` columns
            plus ``order_log_id`` and ``order_id``.

    Returns:
        None. The function only performs writes.
    """
    # A source row is only passed on as an update when it is at least as recent
    # as the target row.
    update_condition = "(source.__updated_at >= target.__updated_at)"
    # Clean-up run after the upsert: drop target lines that are older than the
    # newest incoming version of their order, plus lines flagged as deleted.
    query_after_upsert = """
        DELETE FROM `target`
        WHERE order_line_id IN (
            SELECT t.order_line_id FROM `target` as t
            JOIN (
                    SELECT 
                        order_id, 
                        max(__updated_at) as __updated_at
                    FROM `source` 
                    GROUP BY order_id
                ) as s ON 
                        t.order_id = s.order_id 
                        and t.__updated_at < s.__updated_at
        ) or (is_deleted = 1)
    """

    # --- BigQuery ---
    json_info = {"project_id": "yody-data-platform", "dataset": dataset, "table_name": table_bigquery_name}

    upsert_bigquery(
        df_upsert=df_save.select(SCHEMA_FACT_ORDER_LINE),
        json_info=json_info,
        columns_key=["order_line_id", "variant_id"],
        update_condition=update_condition,
        query_after_upsert=query_after_upsert,
        mode="upsert",
    )

    # --- Delta history ---
    # One row per (order_log_id, order_id); every other column of each order
    # line is packed into a struct and the structs are collected into `data`.
    # The batch is stamped with the current Asia/Ho_Chi_Minh time as a
    # yyyyMMddHHmmss integer, which is also the partition column.
    now_ = int(datetime.datetime.now(tz=timezone_07).strftime("%Y%m%d%H%M%S"))
    cols_group = [c for c in df_save.columns if c not in ["order_log_id", "order_id"]]
    df_write_temp = (
        df_save.withColumn("data", F.struct(*cols_group))
        .withColumn("pyspark_time_created", F.lit(now_).cast("long"))
        .select("order_log_id", "order_id", "pyspark_time_created", "data")
        .groupby("order_log_id", "order_id", "pyspark_time_created")
        .agg(F.collect_list("data").alias("data"))
    )

    print_log("add fact_order_line_history_1day")
    # Target at most `max_row_per_partition` rows per output partition.
    n_partition = (df_write_temp.count() // max_row_per_partition) + 1
    df_write_temp.repartition(n_partition).write.option("mergeSchema", "true").partitionBy("pyspark_time_created").format("delta").mode(
        "append"
    ).save(gs_path_save_data)

def run():
    """Process the pending order-log files once, then delete them.

    Steps:
        1. List the objects under ``gcs_path_read_data`` and keep the paths that
           contain ``.parquet`` or ``_SUCCESS``.
        2. If no parquet file is among them, log ``no files`` and stop.
        3. Read the files, build the created-order and returned-order frames
           with ``base`` / ``don_tao`` / ``don_tra``, union them and hand the
           result to ``save_data``.
        4. Delete the listed files only after ``save_data`` succeeded, so a
           failed run leaves its input in place for the next attempt.

    Returns:
        None.

    Raises:
        Whatever ``save_data`` or ``remove_file_gcs`` raise; the cached
        DataFrame is released before the exception propagates.
    """
    print_log(f"start {ENV}")

    # "gs://<bucket>/<prefix>" -> bucket name and object prefix.
    bucket_name = gcs_path_read_data.split("/")[2]
    path_filter = gcs_path_read_data.replace(f"gs://{bucket_name}/", "")

    # _SUCCESS markers stay in the list so they are deleted together with the
    # data files once the batch has been saved.
    list_files = [
        path
        for path in list_files_gcs(bucket_name, path_filter)
        if ".parquet" in path or "_SUCCESS" in path
    ]

    # Marker files alone carry no rows to process, so treat a listing without
    # any parquet file the same as an empty listing.
    if not any(".parquet" in path for path in list_files):
        print_log("no files")
        return

    df = sparkSession.read.parquet(*list_files)

    df_base = base(df)
    df_don_tao = don_tao(df_base)
    df_don_tra = don_tra(df_base)

    # Cached because save_data consumes the frame several times.
    df_save = df_don_tao.union(df_don_tra).cache()
    try:
        save_data(df_save)

        # Remove the files that have been processed.
        remove_file_gcs(list_files)
        print_log("end")
    finally:
        # Release the cache even when saving or deleting fails.
        df_save.unpersist()

### Main

In [13]:
# Retail price per variant, read from the product dimension. Products whose
# level-1 category is 'Quà tặng' (gift) are excluded, and a single row is kept
# for each variant_id.
# NOTE(review): this path is pinned to ".../prod/..." regardless of ENV, unlike
# the DATA_STORE-based path below - confirm that is intended.
df_product = (
    sparkSession.read.load(HDFS_MASTER + "/dwh/prod/dim/dim_product", format="delta")
    .where("product_category_lv1<>'Quà tặng'")
    .select("variant_id", "retail_price")
    .dropDuplicates(["variant_id"])
)

# Non-deleted rows of fact_cogs_v2, reduced to the id, variant, `mac` and
# date-key range columns.
df_cogs_v2 = (
    sparkSession.read.load(HDFS_MASTER + DATA_STORE + "fact_cogs_v2", format="delta")
    .where("is_deleted == 0")
    .select("mac", "cogs_id", "start_date_key_07", "end_date_key_07", "variant_id")
)

                                                                                

In [14]:
# # Test
# df_temp = sparkSession.read.parquet(f"gs://yody-lakehouse/lading-zone/{ENV}/{ENV}_order_service/order_log")\
#     .filter(f"date_crawler >= '2024-02-29'")

# df_base = base(df_temp)
# df_don_tao = don_tao(df_base)
# df_don_tra = don_tra(df_base)
# df_save = df_don_tao.union(df_don_tra)
# save_data(df_save)
# df = sparkSession.read.parquet(gcs_path_read_data)
# df_base = base(df).cache()
# df_don_tao = don_tao(df_base)
# df_don_tra = don_tra(df_base)
# df_save = df_don_tao.union(df_don_tra)

In [15]:
# df_save.select("distributed_fee_marketplace", "order_id", "order_return_id","order_line_id")\
#     .filter("distributed_fee_marketplace > 0").show()

In [16]:
# Run the job once: process the pending input files, save the result, then
# delete the processed files.
run()

2024-05-25 04:33:13.721 INFO - start prod


                                                                                

2024-05-25 04:33:35.779 INFO - write tempory fact_sales__ef1b5f81-6165-40a5-9c68-a91bab69caa7


                                                                                

2024-05-25 04:36:17.865 INFO - merge from yody-data-platform.prod_yody_analytics.fact_sales to yody-data-platform.prod_yody_analytics.fact_sales__ef1b5f81-6165-40a5-9c68-a91bab69caa7
run: 
 
        DELETE FROM `yody-data-platform.prod_yody_analytics.fact_sales`
        WHERE order_line_id IN (
            SELECT t.order_line_id FROM `yody-data-platform.prod_yody_analytics.fact_sales` as t
            JOIN (
                    SELECT 
                        order_id, 
                        max(__updated_at) as __updated_at
                    FROM `yody-data-platform.prod_yody_analytics.fact_sales__ef1b5f81-6165-40a5-9c68-a91bab69caa7` 
                    GROUP BY order_id
                ) as s ON 
                        t.order_id = s.order_id 
                        and t.__updated_at < s.__updated_at
        ) or (is_deleted = 1)
    
2024-05-25 04:36:37.144 INFO - delete table prod_yody_analytics.fact_sales__ef1b5f81-6165-40a5-9c68-a91bab69caa7
2024-05-25 04:36:37.732 INFO 

                                                                                

2024-05-25 04:37:05.748 INFO - File gs://yody-lakehouse/lading-zone/prod/prod_order_service/order_log_test/20240525040659_dc8ea0ae-1776-4ef5-b3e1-f5c093059442.parquet đã bị xóa
2024-05-25 04:37:05.805 INFO - File gs://yody-lakehouse/lading-zone/prod/prod_order_service/order_log_test/20240525040727_2a0dc89c-84e4-48cc-9e7f-d6c51d82eba0.parquet đã bị xóa
2024-05-25 04:37:05.864 INFO - File gs://yody-lakehouse/lading-zone/prod/prod_order_service/order_log_test/20240525040905_8674c509-d401-449e-a03b-612362b59487.parquet đã bị xóa
2024-05-25 04:37:05.934 INFO - File gs://yody-lakehouse/lading-zone/prod/prod_order_service/order_log_test/20240525040929_08b22bc9-b857-434e-993a-e92d4edbe1bf.parquet đã bị xóa
2024-05-25 04:37:05.995 INFO - File gs://yody-lakehouse/lading-zone/prod/prod_order_service/order_log_test/20240525041111_7c812878-6cfb-4625-8458-7dd66bf2902e.parquet đã bị xóa
2024-05-25 04:37:06.051 INFO - File gs://yody-lakehouse/lading-zone/prod/prod_order_service/order_log_test/20240525

In [17]:
# Stop the Spark session once the job has run.
sparkSession.stop()

In [18]:
# %%html
# <style>
# div.output_area pre {
#     white-space: pre;
# }
# </style>