In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import xml.etree.ElementTree as ET

In [0]:
xml_schema = StructType([
    StructField("invoice_number", StringType(), True),
    StructField("unique_invoice_identifier", StringType(), True),
    StructField("account_number", StringType(), True),
    StructField("invoice_hash", StringType(), True),
    StructField("invoice_date", StringType(), True),
    StructField("customer_name", StringType(), True),
    StructField("customer_state", StringType(), True),
    StructField("customer_city", StringType(), True),
    #StructField("customer_pincode", StringType(), True),
    StructField("company_name", StringType(), True),
    StructField("company_state", StringType(), True),
    StructField("company_city", StringType(), True),
    #StructField("company_pincode", StringType(), True),
    StructField("vat_amount", IntegerType(), True),
    StructField("vat_breakdown_amount", IntegerType(), True),
    StructField("tax_exemption_code", StringType(), True),
   StructField("tax_exemption_amount", IntegerType(), True),
    StructField("invoice_line_identifier", StringType(), True),
    StructField("invoice_line_amount", IntegerType(), True),
    StructField("price", IntegerType(), True),
    StructField("invoice_start_date", StringType(), True),
    StructField("invoice_end_date", StringType(), True),
    StructField("invoice_total_amount", IntegerType(), True)
])

In [0]:
xml_df = spark.readStream \
    .format("text") \
    .load("/FileStore/tables/invoices/xml_landing") \
    .withColumn("file_name",input_file_name()) \
    .withColumn("load_time", expr("cast(current_timestamp() as timestamp)")) 

In [0]:
xml_df.display()

value,file_name,load_time
,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
123462837-SS,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
f237165c-6dc2-4695-9635-6245f8913c54,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
123-456-658,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
18-05-2024,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
Joseph Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000
Pebbas,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,2024-07-21T07:38:55.495+0000


In [0]:
# Group by filename and aggregate XML content
xml_blocks = xml_df \
    .withColumn("block", concat_ws("\n", col("value"))) \
    .groupBy("file_name") \
    .agg(concat_ws("\n", collect_list("block")).alias("xml_content"))

# Select both file_name and xml_content columns
df = xml_blocks.select("file_name", "xml_content")


In [0]:
df.display()

file_name,xml_content
dbfs:/FileStore/tables/invoices/xml_landing/xml_06.xml,233462839-SS  k237165c-6dc2-4695-9635-6245f8913c99  123-456-680  SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==  30-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  461.0  630.0  ES  85.0  1  690.0  290.0  30-05-2024  05-06-2024  880.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_03.xml,123462835-SS  f237165c-6dc2-4695-9635-6245f8913c65  123-456-680  QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  351.0  400.0  ES  55.0  1  600.0  200.0  20-05-2024  31-05-2024  750.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_05.xml,223462839-SS  g237165c-6dc2-4695-9635-6245f8913c95  123-456-680  SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==  29-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  451.0  600.0  ES  75.0  1  650.0  240.0  29-05-2024  03-06-2024  780.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0


In [0]:
# Define parsing function
def parse_xml(xml_string):
    root = ET.fromstring(xml_string)
    invoice_number = root.findtext("invoice_number")
    unique_invoice_identifier = root.findtext("unique_invoice_identifier")
    account_number = root.findtext("account_number")
    invoice_hash = root.findtext("invoice_hash")
    invoice_date = root.findtext("invoice_date")
    customer_name = root.findtext("customer_address/customer_name")
    customer_state = root.findtext("customer_address/customer_state")
    customer_city = root.findtext("customer_address/customer_city")
    company_name = root.findtext("company_address/company_name")
    company_state = root.findtext("company_address/company_state")
    company_city = root.findtext("company_address/company_city")
    vat_amount = root.findtext("tax_amount/vat_amount")
    vat_breakdown_amount = root.findtext("tax_amount/vat_breakdown_amount")
    tax_exemption_code = root.findtext("tax_amount/tax_exemption_code")
    tax_exemption_amount = root.findtext("tax_amount/tax_exemption_amount")
    
    invoice_lines = []
    for line in root.findall("invoice_line"):
        line_data = {
            "invoice_line_identifier": int(line.findtext("invoice_line_identifier")),
            "invoice_line_amount": line.findtext("invoice_line_amount"),
            "price" : line.findtext("price"),
            "invoice_start_date": line.findtext("invoice_start_date"),
            "invoice_end_date": line.findtext("invoice_end_date"),
            "invoice_total_amount": line.findtext("invoice_total_amount")
        }
        invoice_lines.append(line_data)
    
    return {
        "invoice_number": invoice_number,
        "unique_invoice_identifier": unique_invoice_identifier,
        "account_number": account_number,
        "invoice_hash": invoice_hash,
        "invoice_date": invoice_date,
        "customer_name": customer_name,
        "customer_state": customer_state,
        "customer_city": customer_city,
        "company_name": company_name,
        "company_state": company_state,
        "company_city": company_city,
        "vat_amount" : vat_amount,
        "vat_breakdown_amount" : vat_breakdown_amount,
        "tax_exemption_code" : tax_exemption_code,
        "tax_exemption_amount" : tax_exemption_amount,
        "invoice_lines": invoice_lines
    }

# Register UDF
parse_xml_udf = udf(parse_xml, returnType=StructType([
    StructField("invoice_number", StringType(), True),
    StructField("unique_invoice_identifier", StringType(), True),
    StructField("account_number", StringType(), True),
    StructField("invoice_hash", StringType(), True),
    StructField("invoice_date", StringType(), True),
    StructField("customer_name", StringType(), True),
    StructField("customer_state", StringType(), True),
    StructField("customer_city", StringType(), True),
    StructField("company_name", StringType(), True),
    StructField("company_state", StringType(), True),
    StructField("company_city", StringType(), True),
    StructField("vat_amount", StringType(), True),
    StructField("vat_breakdown_amount", StringType(), True),
    StructField("tax_exemption_code", StringType(), True),
   StructField("tax_exemption_amount", StringType(), True),
    StructField("invoice_lines", ArrayType(StructType([
        StructField("invoice_line_identifier", IntegerType(), True),
        StructField("invoice_line_amount", StringType(), True),
        StructField("price", StringType(), True),
        StructField("invoice_start_date", StringType(), True),
        StructField("invoice_end_date", StringType(), True),
        StructField("invoice_total_amount", StringType(), True)
    ])), True)
]))

# Apply UDF to parse XML content
parsed_df = df.withColumn("parsed_data", parse_xml_udf(df["xml_content"]))



In [0]:
# Select columns of interest from parsed data
parsed_xml_df = parsed_df.select("file_name","xml_content", "parsed_data.*")

In [0]:
parsed_df.display()

file_name,xml_content,parsed_data
dbfs:/FileStore/tables/invoices/xml_landing/xml_06.xml,233462839-SS  k237165c-6dc2-4695-9635-6245f8913c99  123-456-680  SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==  30-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  461.0  630.0  ES  85.0  1  690.0  290.0  30-05-2024  05-06-2024  880.0,"List(233462839-SS, k237165c-6dc2-4695-9635-6245f8913c99, 123-456-680, SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==, 30-05-2024, Alpha, Canes, North, Southern Investors, Texas, Atlanta, 461.0, 630.0, ES, 85.0, List(List(1, 690.0, 290.0, 30-05-2024, 05-06-2024, 880.0)))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,"List(123462868-SS, f237165c-6dc2-4695-9635-6245f8913c94, 123-456-659, GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==, 21-05-2024, Jane Christ, Texas, North, Southern Investors, Texas, Atlanta, 551.0, 200.0, ES, 65.0, List(List(1, 400.0, 250.0, 21-05-2024, 31-05-2024, 760.0), List(2, 200.0, 280.0, 21-05-2024, 04-06-2024, 890.0)))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,"List(123462838-SS, f237165c-6dc2-4695-9635-6245f8913c54, 123-456-659, GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==, 20-05-2024, Jane Christ, Texas, North, Southern Investors, Texas, Atlanta, 251.0, 400.0, ES, 55.0, List(List(1, 300.0, 200.0, 20-05-2024, 31-05-2024, 750.0), List(2, 200.0, 250.0, 20-05-2024, 04-06-2024, 850.0)))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_03.xml,123462835-SS  f237165c-6dc2-4695-9635-6245f8913c65  123-456-680  QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  351.0  400.0  ES  55.0  1  600.0  200.0  20-05-2024  31-05-2024  750.0,"List(123462835-SS, f237165c-6dc2-4695-9635-6245f8913c65, 123-456-680, QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==, 20-05-2024, Alpha, Canes, North, Southern Investors, Texas, Atlanta, 351.0, 400.0, ES, 55.0, List(List(1, 600.0, 200.0, 20-05-2024, 31-05-2024, 750.0)))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,"List(123462837-SS, f237165c-6dc2-4695-9635-6245f8913c54, 123-456-658, MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==, 18-05-2024, Joseph Christ, Pebbas, North, Southern Investors, Texas, Atlanta, 250.0, 100.0, ES, 50.0, List(List(1, 200.0, 200.0, 18-05-2024, 31-05-2024, 450.0), List(2, 200.0, 200.0, 19-05-2024, 01-06-2024, 450.0)))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_05.xml,223462839-SS  g237165c-6dc2-4695-9635-6245f8913c95  123-456-680  SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==  29-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  451.0  600.0  ES  75.0  1  650.0  240.0  29-05-2024  03-06-2024  780.0,"List(223462839-SS, g237165c-6dc2-4695-9635-6245f8913c95, 123-456-680, SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==, 29-05-2024, Alpha, Canes, North, Southern Investors, Texas, Atlanta, 451.0, 600.0, ES, 75.0, List(List(1, 650.0, 240.0, 29-05-2024, 03-06-2024, 780.0)))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,"List(123462869-SS, l237165c-6dc2-4695-9635-6245f8913g57, 123-456-659, KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==, 26-05-2024, Jane Christ, SA, North, Southern Investors, Texas, Atlanta, 251.0, 450.0, ES, 55.0, List(List(1, 300.0, 200.0, 20-05-2024, 31-05-2024, 750.0), List(2, 200.0, 250.0, 20-05-2024, 04-06-2024, 850.0)))"


In [0]:
parsed_xml_df.display()

file_name,xml_content,invoice_number,unique_invoice_identifier,account_number,invoice_hash,invoice_date,customer_name,customer_state,customer_city,company_name,company_state,company_city,vat_amount,vat_breakdown_amount,tax_exemption_code,tax_exemption_amount,invoice_lines
dbfs:/FileStore/tables/invoices/xml_landing/xml_06.xml,233462839-SS  k237165c-6dc2-4695-9635-6245f8913c99  123-456-680  SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==  30-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  461.0  630.0  ES  85.0  1  690.0  290.0  30-05-2024  05-06-2024  880.0,233462839-SS,k237165c-6dc2-4695-9635-6245f8913c99,123-456-680,SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==,30-05-2024,Alpha,Canes,North,Southern Investors,Texas,Atlanta,461.0,630.0,ES,85.0,"List(List(1, 690.0, 290.0, 30-05-2024, 05-06-2024, 880.0))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,123-456-659,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,21-05-2024,Jane Christ,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,"List(List(1, 400.0, 250.0, 21-05-2024, 31-05-2024, 760.0), List(2, 200.0, 280.0, 21-05-2024, 04-06-2024, 890.0))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,123-456-659,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Jane Christ,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,"List(List(1, 300.0, 200.0, 20-05-2024, 31-05-2024, 750.0), List(2, 200.0, 250.0, 20-05-2024, 04-06-2024, 850.0))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_03.xml,123462835-SS  f237165c-6dc2-4695-9635-6245f8913c65  123-456-680  QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  351.0  400.0  ES  55.0  1  600.0  200.0  20-05-2024  31-05-2024  750.0,123462835-SS,f237165c-6dc2-4695-9635-6245f8913c65,123-456-680,QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Alpha,Canes,North,Southern Investors,Texas,Atlanta,351.0,400.0,ES,55.0,"List(List(1, 600.0, 200.0, 20-05-2024, 31-05-2024, 750.0))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,123-456-658,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,18-05-2024,Joseph Christ,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,"List(List(1, 200.0, 200.0, 18-05-2024, 31-05-2024, 450.0), List(2, 200.0, 200.0, 19-05-2024, 01-06-2024, 450.0))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_05.xml,223462839-SS  g237165c-6dc2-4695-9635-6245f8913c95  123-456-680  SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==  29-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  451.0  600.0  ES  75.0  1  650.0  240.0  29-05-2024  03-06-2024  780.0,223462839-SS,g237165c-6dc2-4695-9635-6245f8913c95,123-456-680,SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==,29-05-2024,Alpha,Canes,North,Southern Investors,Texas,Atlanta,451.0,600.0,ES,75.0,"List(List(1, 650.0, 240.0, 29-05-2024, 03-06-2024, 780.0))"
dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462869-SS,l237165c-6dc2-4695-9635-6245f8913g57,123-456-659,KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==,26-05-2024,Jane Christ,SA,North,Southern Investors,Texas,Atlanta,251.0,450.0,ES,55.0,"List(List(1, 300.0, 200.0, 20-05-2024, 31-05-2024, 750.0), List(2, 200.0, 250.0, 20-05-2024, 04-06-2024, 850.0))"


In [0]:
exploded_df = parsed_xml_df \
    .withColumn("invoice_lines", explode(parsed_xml_df["invoice_lines"])) \
    .select("file_name", "xml_content", "invoice_number", "unique_invoice_identifier", 
            "account_number", "invoice_hash", "invoice_date",
            "customer_name", "customer_state", "customer_city",
            "company_name", "company_state", "company_city", "vat_amount", "vat_breakdown_amount", "tax_exemption_code", "tax_exemption_amount",
            "invoice_lines.*")

In [0]:
exploded_df.display()

file_name,xml_content,invoice_number,unique_invoice_identifier,account_number,invoice_hash,invoice_date,customer_name,customer_state,customer_city,company_name,company_state,company_city,vat_amount,vat_breakdown_amount,tax_exemption_code,tax_exemption_amount,invoice_line_identifier,invoice_line_amount,price,invoice_start_date,invoice_end_date,invoice_total_amount
dbfs:/FileStore/tables/invoices/xml_landing/xml_06.xml,233462839-SS  k237165c-6dc2-4695-9635-6245f8913c99  123-456-680  SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==  30-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  461.0  630.0  ES  85.0  1  690.0  290.0  30-05-2024  05-06-2024  880.0,233462839-SS,k237165c-6dc2-4695-9635-6245f8913c99,123-456-680,SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==,30-05-2024,Alpha,Canes,North,Southern Investors,Texas,Atlanta,461.0,630.0,ES,85.0,1,690.0,290.0,30-05-2024,05-06-2024,880.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,123-456-659,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,21-05-2024,Jane Christ,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,1,400.0,250.0,21-05-2024,31-05-2024,760.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,123-456-659,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,21-05-2024,Jane Christ,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,2,200.0,280.0,21-05-2024,04-06-2024,890.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,123-456-659,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Jane Christ,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,1,300.0,200.0,20-05-2024,31-05-2024,750.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,123-456-659,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Jane Christ,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,2,200.0,250.0,20-05-2024,04-06-2024,850.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_03.xml,123462835-SS  f237165c-6dc2-4695-9635-6245f8913c65  123-456-680  QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  351.0  400.0  ES  55.0  1  600.0  200.0  20-05-2024  31-05-2024  750.0,123462835-SS,f237165c-6dc2-4695-9635-6245f8913c65,123-456-680,QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Alpha,Canes,North,Southern Investors,Texas,Atlanta,351.0,400.0,ES,55.0,1,600.0,200.0,20-05-2024,31-05-2024,750.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,123-456-658,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,18-05-2024,Joseph Christ,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,1,200.0,200.0,18-05-2024,31-05-2024,450.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,123-456-658,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,18-05-2024,Joseph Christ,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,2,200.0,200.0,19-05-2024,01-06-2024,450.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_05.xml,223462839-SS  g237165c-6dc2-4695-9635-6245f8913c95  123-456-680  SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==  29-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  451.0  600.0  ES  75.0  1  650.0  240.0  29-05-2024  03-06-2024  780.0,223462839-SS,g237165c-6dc2-4695-9635-6245f8913c95,123-456-680,SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==,29-05-2024,Alpha,Canes,North,Southern Investors,Texas,Atlanta,451.0,600.0,ES,75.0,1,650.0,240.0,29-05-2024,03-06-2024,780.0
dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462869-SS,l237165c-6dc2-4695-9635-6245f8913g57,123-456-659,KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==,26-05-2024,Jane Christ,SA,North,Southern Investors,Texas,Atlanta,251.0,450.0,ES,55.0,1,300.0,200.0,20-05-2024,31-05-2024,750.0


In [0]:
static_inv_table_df = spark.read.format("delta").load("/FileStore/tables/invoice_tables/invoice_cust_data_stg")
static_inv_table_df = static_inv_table_df.select("account_number","customer_name")

In [0]:
#invoice_status is EDIT for matching records
joined_df = exploded_df.join(static_inv_table_df, on=["account_number","customer_name"], how="inner")
streaming_match_df = joined_df.withColumn("invoice_status", lit("EDIT"))

In [0]:
streaming_match_df.display()

account_number,customer_name,file_name,xml_content,invoice_number,unique_invoice_identifier,invoice_hash,invoice_date,customer_state,customer_city,company_name,company_state,company_city,vat_amount,vat_breakdown_amount,tax_exemption_code,tax_exemption_amount,invoice_line_identifier,invoice_line_amount,price,invoice_start_date,invoice_end_date,invoice_total_amount,invoice_status
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,21-05-2024,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,1,400.0,250.0,21-05-2024,31-05-2024,760.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,21-05-2024,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,2,200.0,280.0,21-05-2024,04-06-2024,890.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,1,300.0,200.0,20-05-2024,31-05-2024,750.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,2,200.0,250.0,20-05-2024,04-06-2024,850.0,EDIT
123-456-658,Joseph Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,18-05-2024,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,1,200.0,200.0,18-05-2024,31-05-2024,450.0,EDIT
123-456-658,Joseph Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,18-05-2024,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,2,200.0,200.0,19-05-2024,01-06-2024,450.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462869-SS,l237165c-6dc2-4695-9635-6245f8913g57,KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==,26-05-2024,SA,North,Southern Investors,Texas,Atlanta,251.0,450.0,ES,55.0,1,300.0,200.0,20-05-2024,31-05-2024,750.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462869-SS,l237165c-6dc2-4695-9635-6245f8913g57,KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==,26-05-2024,SA,North,Southern Investors,Texas,Atlanta,251.0,450.0,ES,55.0,2,200.0,250.0,20-05-2024,04-06-2024,850.0,EDIT


In [0]:
#invoice_status is ADD for new records
joined_df1 = exploded_df.join(static_inv_table_df, on=["account_number","customer_name"], how="left_anti")
streaming_non_match_df = joined_df1.withColumn("invoice_status", lit("ADD"))

In [0]:
streaming_non_match_df.display()

account_number,customer_name,file_name,xml_content,invoice_number,unique_invoice_identifier,invoice_hash,invoice_date,customer_state,customer_city,company_name,company_state,company_city,vat_amount,vat_breakdown_amount,tax_exemption_code,tax_exemption_amount,invoice_line_identifier,invoice_line_amount,price,invoice_start_date,invoice_end_date,invoice_total_amount,invoice_status
123-456-680,Alpha,dbfs:/FileStore/tables/invoices/xml_landing/xml_06.xml,233462839-SS  k237165c-6dc2-4695-9635-6245f8913c99  123-456-680  SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==  30-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  461.0  630.0  ES  85.0  1  690.0  290.0  30-05-2024  05-06-2024  880.0,233462839-SS,k237165c-6dc2-4695-9635-6245f8913c99,SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==,30-05-2024,Canes,North,Southern Investors,Texas,Atlanta,461.0,630.0,ES,85.0,1,690.0,290.0,30-05-2024,05-06-2024,880.0,ADD
123-456-680,Alpha,dbfs:/FileStore/tables/invoices/xml_landing/xml_03.xml,123462835-SS  f237165c-6dc2-4695-9635-6245f8913c65  123-456-680  QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  351.0  400.0  ES  55.0  1  600.0  200.0  20-05-2024  31-05-2024  750.0,123462835-SS,f237165c-6dc2-4695-9635-6245f8913c65,QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Canes,North,Southern Investors,Texas,Atlanta,351.0,400.0,ES,55.0,1,600.0,200.0,20-05-2024,31-05-2024,750.0,ADD
123-456-680,Alpha,dbfs:/FileStore/tables/invoices/xml_landing/xml_05.xml,223462839-SS  g237165c-6dc2-4695-9635-6245f8913c95  123-456-680  SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==  29-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  451.0  600.0  ES  75.0  1  650.0  240.0  29-05-2024  03-06-2024  780.0,223462839-SS,g237165c-6dc2-4695-9635-6245f8913c95,SPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1LTYyNDPmODkxM2K1NA==,29-05-2024,Canes,North,Southern Investors,Texas,Atlanta,451.0,600.0,ES,75.0,1,650.0,240.0,29-05-2024,03-06-2024,780.0,ADD


In [0]:
# Union the matching and non-matching DataFrames
streaming_df = streaming_match_df.union(streaming_non_match_df)

In [0]:
streaming_df.display()

account_number,customer_name,file_name,xml_content,invoice_number,unique_invoice_identifier,invoice_hash,invoice_date,customer_state,customer_city,company_name,company_state,company_city,vat_amount,vat_breakdown_amount,tax_exemption_code,tax_exemption_amount,invoice_line_identifier,invoice_line_amount,price,invoice_start_date,invoice_end_date,invoice_total_amount,invoice_status
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,21-05-2024,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,1,400.0,250.0,21-05-2024,31-05-2024,760.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,21-05-2024,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,2,200.0,280.0,21-05-2024,04-06-2024,890.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,1,300.0,200.0,20-05-2024,31-05-2024,750.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,2,200.0,250.0,20-05-2024,04-06-2024,850.0,EDIT
123-456-658,Joseph Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,18-05-2024,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,1,200.0,200.0,18-05-2024,31-05-2024,450.0,EDIT
123-456-658,Joseph Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,18-05-2024,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,2,200.0,200.0,19-05-2024,01-06-2024,450.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462869-SS,l237165c-6dc2-4695-9635-6245f8913g57,KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==,26-05-2024,SA,North,Southern Investors,Texas,Atlanta,251.0,450.0,ES,55.0,1,300.0,200.0,20-05-2024,31-05-2024,750.0,EDIT
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462869-SS,l237165c-6dc2-4695-9635-6245f8913g57,KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==,26-05-2024,SA,North,Southern Investors,Texas,Atlanta,251.0,450.0,ES,55.0,2,200.0,250.0,20-05-2024,04-06-2024,850.0,EDIT
123-456-680,Alpha,dbfs:/FileStore/tables/invoices/xml_landing/xml_06.xml,233462839-SS  k237165c-6dc2-4695-9635-6245f8913c99  123-456-680  SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==  30-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  461.0  630.0  ES  85.0  1  690.0  290.0  30-05-2024  05-06-2024  880.0,233462839-SS,k237165c-6dc2-4695-9635-6245f8913c99,SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==,30-05-2024,Canes,North,Southern Investors,Texas,Atlanta,461.0,630.0,ES,85.0,1,690.0,290.0,30-05-2024,05-06-2024,880.0,ADD
123-456-680,Alpha,dbfs:/FileStore/tables/invoices/xml_landing/xml_03.xml,123462835-SS  f237165c-6dc2-4695-9635-6245f8913c65  123-456-680  QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  351.0  400.0  ES  55.0  1  600.0  200.0  20-05-2024  31-05-2024  750.0,123462835-SS,f237165c-6dc2-4695-9635-6245f8913c65,QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,20-05-2024,Canes,North,Southern Investors,Texas,Atlanta,351.0,400.0,ES,55.0,1,600.0,200.0,20-05-2024,31-05-2024,750.0,ADD


In [0]:
final_df = streaming_df \
    .withColumn("target_committime", current_timestamp()) \
    .withColumn("load_month", to_date(date_format(current_timestamp(), "yyyy-MM-01"), "yyyy-MM-dd")) \
    .withColumn("load_date", to_date(date_format(current_timestamp(), "yyyy-MM-dd"), "yyyy-MM-dd")) \
    .withColumn("invoice_start_date", to_date(streaming_df['invoice_start_date'], "dd-MM-yyyy")) \
    .withColumn("invoice_end_date", to_date(streaming_df['invoice_end_date'], "dd-MM-yyyy")) \
    .withColumn("batch_id", expr(f"concat('BATCH-', date_format(current_timestamp(), 'yyyy-MM-dd-HH_mm_ss'))")) \
    .withColumn("invoice_date", to_date(streaming_df['invoice_date'], "dd-MM-yyyy")) \
    .withColumn("vat_amount",streaming_df['vat_amount'].cast(DecimalType(10, 2))) \
    .withColumn("vat_breakdown_amount",streaming_df['vat_breakdown_amount'].cast(DecimalType(10, 2))) \
    .withColumn("tax_exemption_amount",streaming_df['tax_exemption_amount'].cast(DecimalType(10, 2))) \
    .withColumn("invoice_line_amount",streaming_df['invoice_line_amount'].cast(DecimalType(10, 2))) \
    .withColumn("price",streaming_df['price'].cast(DecimalType(10, 2))) \
    .withColumn("invoice_total_amount",streaming_df['invoice_total_amount'].cast(DecimalType(10, 2))) 

In [0]:
final_df.display()

account_number,customer_name,file_name,xml_content,invoice_number,unique_invoice_identifier,invoice_hash,invoice_date,customer_state,customer_city,company_name,company_state,company_city,vat_amount,vat_breakdown_amount,tax_exemption_code,tax_exemption_amount,invoice_line_identifier,invoice_line_amount,price,invoice_start_date,invoice_end_date,invoice_total_amount,invoice_status,target_committime,load_month,load_date,batch_id
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,2024-05-21,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,1,400.0,250.0,2024-05-21,2024-05-31,760.0,EDIT,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml,123462868-SS  f237165c-6dc2-4695-9635-6245f8913c94  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==  21-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  551.0  200.0  ES  65.0  1  400.0  250.0  21-05-2024  31-05-2024  760.0  2  200.0  280.0  21-05-2024  04-06-2024  890.0,123462868-SS,f237165c-6dc2-4695-9635-6245f8913c94,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1KA==,2024-05-21,Texas,North,Southern Investors,Texas,Atlanta,551.0,200.0,ES,65.0,2,200.0,280.0,2024-05-21,2024-06-04,890.0,EDIT,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,2024-05-20,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,1,300.0,200.0,2024-05-20,2024-05-31,750.0,EDIT,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml,123462838-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-659  GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Jane Christ  Texas  North  Southern Investors  Texas  Atlanta  251.0  400.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462838-SS,f237165c-6dc2-4695-9635-6245f8913c54,GHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,2024-05-20,Texas,North,Southern Investors,Texas,Atlanta,251.0,400.0,ES,55.0,2,200.0,250.0,2024-05-20,2024-06-04,850.0,EDIT,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-658,Joseph Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,2024-05-18,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,1,200.0,200.0,2024-05-18,2024-05-31,450.0,EDIT,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-658,Joseph Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml,123462837-SS  f237165c-6dc2-4695-9635-6245f8913c54  123-456-658  MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  18-05-2024  Joseph Christ  Pebbas  North  Southern Investors  Texas  Atlanta  250.0  100.0  ES  50.0  1  200.0  200.0  18-05-2024  31-05-2024  450.0  2  200.0  200.0  19-05-2024  01-06-2024  450.0,123462837-SS,f237165c-6dc2-4695-9635-6245f8913c54,MTIzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,2024-05-18,Pebbas,North,Southern Investors,Texas,Atlanta,250.0,100.0,ES,50.0,2,200.0,200.0,2024-05-19,2024-06-01,450.0,EDIT,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462869-SS,l237165c-6dc2-4695-9635-6245f8913g57,KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==,2024-05-26,SA,North,Southern Investors,Texas,Atlanta,251.0,450.0,ES,55.0,1,300.0,200.0,2024-05-20,2024-05-31,750.0,EDIT,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-659,Jane Christ,dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml,123462869-SS  l237165c-6dc2-4695-9635-6245f8913g57  123-456-659  KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==  26-05-2024  Jane Christ  SA  North  Southern Investors  Texas  Atlanta  251.0  450.0  ES  55.0  1  300.0  200.0  20-05-2024  31-05-2024  750.0  2  200.0  250.0  20-05-2024  04-06-2024  850.0,123462869-SS,l237165c-6dc2-4695-9635-6245f8913g57,KHTzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NtM1LTYyNDVmODkxM2M1MP==,2024-05-26,SA,North,Southern Investors,Texas,Atlanta,251.0,450.0,ES,55.0,2,200.0,250.0,2024-05-20,2024-06-04,850.0,EDIT,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-680,Alpha,dbfs:/FileStore/tables/invoices/xml_landing/xml_06.xml,233462839-SS  k237165c-6dc2-4695-9635-6245f8913c99  123-456-680  SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==  30-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  461.0  630.0  ES  85.0  1  690.0  290.0  30-05-2024  05-06-2024  880.0,233462839-SS,k237165c-6dc2-4695-9635-6245f8913c99,SPcvNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS15NjM1TTYhNDPmODkxM2K1FG==,2024-05-30,Canes,North,Southern Investors,Texas,Atlanta,461.0,630.0,ES,85.0,1,690.0,290.0,2024-05-30,2024-06-05,880.0,ADD,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08
123-456-680,Alpha,dbfs:/FileStore/tables/invoices/xml_landing/xml_03.xml,123462835-SS  f237165c-6dc2-4695-9635-6245f8913c65  123-456-680  QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==  20-05-2024  Alpha  Canes  North  Southern Investors  Texas  Atlanta  351.0  400.0  ES  55.0  1  600.0  200.0  20-05-2024  31-05-2024  750.0,123462835-SS,f237165c-6dc2-4695-9635-6245f8913c65,QPxzNDYyODM3LVNTX2YyMzcxNjVjLTZkYzItNDY5NS05NjM1LTYyNDVmODkxM2M1NA==,2024-05-20,Canes,North,Southern Investors,Texas,Atlanta,351.0,400.0,ES,55.0,1,600.0,200.0,2024-05-20,2024-05-31,750.0,ADD,2024-07-21T08:35:08.232+0000,2024-07-01,2024-07-21,BATCH-2024-07-21-08_35_08


In [0]:

def archive_files(final_temp,batch_id):

    invoice_df = final_temp.select("xml_content", "invoice_number", "unique_invoice_identifier", 
            "account_number", "invoice_hash", "invoice_date",
            "customer_name","customer_state", "customer_city",
            "company_name", "company_state", "company_city","invoice_status","file_name",
            "target_committime","load_month","load_date","batch_id").dropDuplicates(["invoice_number", "unique_invoice_identifier", 
            "account_number"])

    vat_df = final_temp.select("invoice_number", "unique_invoice_identifier", 
            "account_number","customer_name","vat_amount", "vat_breakdown_amount", "tax_exemption_code", "tax_exemption_amount","invoice_status",
            "target_committime","load_month","load_date","batch_id").dropDuplicates(["invoice_number", "unique_invoice_identifier", 
            "account_number"])

    
    invoice_line_df = final_temp.select("invoice_number", "unique_invoice_identifier", 
            "account_number","customer_name","invoice_line_identifier", "invoice_line_amount", "price", "invoice_start_date",
            "invoice_end_date", "invoice_total_amount","invoice_status","target_committime","load_month","load_date","batch_id").dropDuplicates(["invoice_number", "unique_invoice_identifier", "account_number", "invoice_line_identifier"])
    

    invoice_df.createOrReplaceTempView("invoice_temp")
    merge_statement = """merge into invoice_db.invoice_cust_data_stg t using invoice_temp s
    on t.account_number == s.account_number and t.invoice_number == s.invoice_number and t.unique_invoice_identifier == s.unique_invoice_identifier 
    when matched then
    update set
    t.invoice_date = s.invoice_date, t.customer_name = s.customer_name, t.customer_state = s.customer_state, t.customer_city = s.customer_city, t.company_name = s.company_name,
    t.company_state = s.company_state, t.company_city = s.company_city, t.invoice_status = s.invoice_status, t.file_name = s.file_name, t.target_committime = s.target_committime, 
    t.load_month = s.load_month, t.load_date = s.load_date, t.batch_id = s.batch_id
    when not matched then
    insert *
    """
    invoice_df._jdf.sparkSession().sql(merge_statement)


    vat_df.createOrReplaceTempView("vat_temp")
    merge_statement = """merge into invoice_db.invoice_vat_data_stg t using vat_temp s
    on t.account_number == s.account_number and t.invoice_number == s.invoice_number and t.unique_invoice_identifier == s.unique_invoice_identifier 
    when matched then
    update set
    t.customer_name = s.customer_name, t.vat_amount = s.vat_amount, t.vat_breakdown_amount = s.vat_breakdown_amount, t.tax_exemption_code = s.tax_exemption_code,
    t.tax_exemption_amount = s.tax_exemption_amount, t.invoice_status = s.invoice_status, t.target_committime = s.target_committime, 
    t.load_month = s.load_month, t.load_date = s.load_date, t.batch_id = s.batch_id
    when not matched then
    insert *
    """
    vat_df._jdf.sparkSession().sql(merge_statement)


    invoice_line_df.createOrReplaceTempView("invoice_line_temp")
    merge_statement = """merge into invoice_db.invoice_line_data_stg t using invoice_line_temp s
    on t.account_number == s.account_number and t.invoice_number == s.invoice_number and t.unique_invoice_identifier == s.unique_invoice_identifier 
    and t.invoice_line_identifier == s.invoice_line_identifier
    when matched then
    update set 
    t.invoice_line_amount == s.invoice_line_amount , t.price == s.price , t.invoice_start_date == s.invoice_start_date,t.invoice_end_date == s.invoice_end_date , 
    t.invoice_total_amount == s.invoice_total_amount , t.invoice_status = s.invoice_status, t.target_committime = s.target_committime, 
    t.load_month = s.load_month, t.load_date = s.load_date, t.batch_id = s.batch_id
    when not matched then
    insert *
    """
    invoice_line_df._jdf.sparkSession().sql(merge_statement)


    # List all files in the DBFS directory
    files = dbutils.fs.ls("/FileStore/tables/invoices/xml_landing")

    # Filter the list to include only .xml files
    xml_files = [file.path for file in files if file.path.endswith(".xml")]

    # Copy each XML file to the local directory
    for xml_file in xml_files:
        archive_path = "/FileStore/tables/invoices/xml_archive"  
        dbutils.fs.mv(xml_file, archive_path)
        print(f"Moved {xml_file} to {archive_path}")
    

In [0]:
'''streaming_query = invoice_df \
.writeStream \
.format("delta") \
.outputMode("complete") \
.option("checkpointLocation","/FileStore/tables/invoices/invoice_cust_data_chkpt") \
.toTable("invoice_db.invoice_cust_data_stg")'''

Out[46]: 'streaming_query = invoice_df .writeStream .format("delta") .outputMode("complete") .option("checkpointLocation","/FileStore/tables/invoices/invoice_cust_data_chkpt") .toTable("invoice_db.invoice_cust_data_stg")'

In [0]:
query = final_df.writeStream \
    .foreachBatch(archive_files) \
    .outputMode("update") \
    .option("checkpointLocation", "/FileStore/tables/invoices/invoice_cust_data_chkpt1") \
    .start()

query.awaitTermination()


Moved dbfs:/FileStore/tables/invoices/xml_landing/xml_file_01.xml to /FileStore/tables/invoices/xml_archive
Moved dbfs:/FileStore/tables/invoices/xml_landing/xml_file_02.xml to /FileStore/tables/invoices/xml_archive
Moved dbfs:/FileStore/tables/invoices/xml_landing/xml_03.xml to /FileStore/tables/invoices/xml_archive
Moved dbfs:/FileStore/tables/invoices/xml_landing/xml_04.xml to /FileStore/tables/invoices/xml_archive
Moved dbfs:/FileStore/tables/invoices/xml_landing/xml_05.xml to /FileStore/tables/invoices/xml_archive
Moved dbfs:/FileStore/tables/invoices/xml_landing/xml_06.xml to /FileStore/tables/invoices/xml_archive
Moved dbfs:/FileStore/tables/invoices/xml_landing/xml_07.xml to /FileStore/tables/invoices/xml_archive
