In [0]:
pip install lxml

Python interpreter will be restarted.
Collecting lxml
  Downloading lxml-5.2.2-cp39-cp39-manylinux_2_28_x86_64.whl (5.0 MB)
Installing collected packages: lxml
Successfully installed lxml-5.2.2
Python interpreter will be restarted.


In [0]:
import os
from lxml import etree
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
from pyspark.sql.functions import current_timestamp, input_file_name
import os
import subprocess
import smtplib
from email.mime.text import MIMEText
from email.mime.multipart import MIMEMultipart

In [0]:
import smtplib
from email.mime.text import MIMEText

# SMTP server configuration
host = "smtp.gmail.com"
port = 587  # TLS port

# Notebook widgets supply the Google app password and the sender/receiver
# addresses, so none of them are hard-coded in the notebook source.
# NOTE(review): a Databricks secret scope would likely be a safer home for the
# app password than a text widget - confirm with the workspace owner.
dbutils.widgets.text("app_password", "")
dbutils.widgets.text("sender_mailId", "")
dbutils.widgets.text("receiver_mailId", "")

# Email content
FROM = dbutils.widgets.get("sender_mailId")
TO = dbutils.widgets.get("receiver_mailId")
SUBJECT = "XML file got rejected"
# The body is sent with the 'html' MIME subtype, where plain newlines collapse
# into ordinary whitespace, so paragraph breaks are written as <br> tags.
BODY = (
    "Hi Team,<br><br>"
    "XML files have been rejected. Kindly check the reject table "
    "<b>invoice_db.xml_reject_audit_table</b> for more information.<br><br>"
    "Thanks."
)

def send_alert_mail(FROM,TO,BODY,SUBJECT,host,port):
    """Send an HTML alert email through an SMTP server upgraded with STARTTLS.

    The sender authenticates with the app password read from the
    ``app_password`` notebook widget. Any failure (connection, TLS, login,
    send) is reported on stdout and is not raised to the caller.

    Args:
        FROM: Sender address; also used as the SMTP login name.
        TO: Recipient address handed to ``sendmail``.
        BODY: Message body, sent with the ``html`` MIME subtype.
        SUBJECT: Subject line of the message.
        host: SMTP server host name.
        port: SMTP server port.

    Returns:
        None.
    """
    # Build the message before touching the network.
    msg = MIMEText(BODY, 'html')
    msg['Subject'] = SUBJECT
    msg['From'] = FROM
    msg['To'] = TO
    try:
        # The connection is opened inside the try block and managed by a
        # context manager: a failed connection is reported like any other
        # error, and the session is closed without referencing a `server`
        # variable that was never bound.
        with smtplib.SMTP(host, port) as server:
            server.starttls()  # Upgrade the connection to secure TLS

            # Login to the SMTP server using app-specific password
            server.login(FROM, dbutils.widgets.get("app_password"))

            # Send email
            server.sendmail(FROM, TO, msg.as_string())
            print("Email sent successfully!")

    except Exception as e:
        print(f"Failed to send email: {e}")

In [0]:
# Create every DBFS working directory the pipeline uses. The staging directory
# is included because it is both listed and streamed from later on; without it
# a run on a fresh workspace fails before any file is processed.
dbutils.fs.mkdirs("/FileStore/tables/invoices/staging")
dbutils.fs.mkdirs("/FileStore/tables/invoices/xml_landing")
dbutils.fs.mkdirs("/FileStore/tables/invoices/xml_reject")
dbutils.fs.mkdirs("/FileStore/tables/invoices/xsd")
dbutils.fs.mkdirs("/FileStore/tables/invoices/xml_checkpoint")

Out[4]: True

In [0]:
# DBFS locations used by the pipeline, all rooted under one invoices folder.
_invoices_root = "/FileStore/tables/invoices"

source_directory = _invoices_root + "/staging"          # XML files waiting for validation
landing_directory = _invoices_root + "/xml_landing"     # destination for valid files
rejects_directory = _invoices_root + "/xml_reject"      # destination for invalid files
xsd_path = _invoices_root + "/xsd/xsd_for_generated_xml.xsd"  # schema used for validation
checkpoint_dir = _invoices_root + "/xml_checkpoint"     # streaming checkpoint location

In [0]:
# Load XSD schema from DBFS.
# dbutils.fs.head returns at most maxBytes bytes, so the file's real size is
# requested; a fixed 10000-byte cap would silently truncate a larger schema and
# make parsing fail. The 10000 floor keeps the previous request size for small
# or empty files.
xsd_size = dbutils.fs.ls(xsd_path)[0].size
xsd_content = dbutils.fs.head(xsd_path, max(xsd_size, 10000))
# Parse from bytes rather than from the decoded string.
schema_root = etree.XML(xsd_content.encode('utf-8'))
schema = etree.XMLSchema(schema_root)

In [0]:
# Schema of the reject audit rows: two nullable string columns holding the
# rejected file's name and the validation error text.
audit_schema = StructType(
    [
        StructField(column_name, StringType(), True)
        for column_name in ("filename", "error_message")
    ]
)

In [0]:
# Function to validate XML and capture error message
def validate_xml(xml_content):
    """Check an XML document against the module-level XSD ``schema``.

    Args:
        xml_content: The XML document to parse and validate.

    Returns:
        ``(True, None)`` when the document parses and conforms to the schema,
        otherwise ``(False, message)`` where ``message`` is the text of the
        syntax or validation error.
    """
    try:
        schema.assertValid(etree.XML(xml_content))
    except (etree.XMLSyntaxError, etree.DocumentInvalid) as validation_error:
        outcome = (False, str(validation_error))
    else:
        outcome = (True, None)
    return outcome

In [0]:
def process_batch(df, epoch_id):
    """Validate every staged XML file and move it to landing or reject.

    Intended as a ``foreachBatch`` callback. The micro-batch itself is only a
    trigger: the staging directory is listed directly and each ``.xml`` file in
    it is validated against the XSD. Valid files are moved to the landing
    directory; invalid files are moved to the reject directory and one row per
    file is appended to ``invoice_db.xml_reject_audit_table``. A single alert
    email is sent when at least one file in the batch was rejected.

    Args:
        df: Micro-batch DataFrame supplied by the stream; not read here.
        epoch_id: Micro-batch identifier supplied by the stream; not used.

    Returns:
        None. Per-file errors are printed and do not stop the batch.
    """
    # Keep the FileInfo entries (not just names): each file's size is needed
    # below to read it in full.
    staged_files = [f for f in dbutils.fs.ls(source_directory) if f.name.endswith('.xml')]
    reject_flag = False
    for file_info in staged_files:
        file_name = os.path.join(source_directory, file_info.name)
        source_path = file_name.replace("dbfs:", "")
        try:
            # dbutils.fs.head returns at most maxBytes bytes. Request the real
            # file size so files larger than 10000 bytes are not truncated and
            # then wrongly rejected as malformed XML; the 10000 floor keeps the
            # previous request size for small or empty files.
            xml_content = dbutils.fs.head(file_name, max(file_info.size, 10000))
            is_valid, error_message = validate_xml(xml_content.encode('utf-8'))
            if is_valid:
                destination_path = landing_directory
                print(f"Validation successful: {file_name}")
            else:
                destination_path = rejects_directory
                print(f"Validation failed: {file_name}")

                # Write error message to audit table
                audit_data = [(file_name, error_message)]
                audit_df = spark.createDataFrame(audit_data, schema=audit_schema).withColumn("reject_date",current_timestamp())
                audit_df.write.format("delta").mode("append").saveAsTable("invoice_db.xml_reject_audit_table")

                # Remember that this batch needs an alert
                reject_flag = True

            # Move file to destination
            dbutils.fs.mv(source_path, destination_path)
        except Exception as e:
            # Best effort: report the failure and continue with the next file.
            print(f"Error processing file {file_name}: {e}")
    # Send one alert email per batch if any file was rejected
    if reject_flag:
        send_alert_mail(FROM,TO,BODY,SUBJECT,host,port)


In [0]:
# Streaming reader over the staging directory, using the plain-text source.
xml_df = (
    spark.readStream
    .format("text")
    .option("path", source_directory)
    .load()
)

In [0]:
# Hand every micro-batch to process_batch, tracking progress in the checkpoint
# directory, then block this cell until the streaming query terminates.
xml_validation_query = (
    xml_df.writeStream
    .foreachBatch(process_batch)
    .option("checkpointLocation", checkpoint_dir)
    .start()
)
xml_validation_query.awaitTermination()

Validation failed: /FileStore/tables/invoices/staging/rejects14.xml
Email sent successfully!
