### Create a monitoring table (if it does not already exist)

In [None]:
from pyspark.sql.types import *
from delta.tables import *

# Define the Delta table that stores one audit row per ETL load.
# createIfNotExists leaves an already existing table in place, so this cell
# can be re-run. The chain is kept as a bare expression so the notebook still
# displays the DeltaTable returned by execute().
(
    DeltaTable.createIfNotExists(spark)
    .tableName("monitoring_etl")
    .addColumn("Table_name", StringType())          # table that was loaded
    .addColumn("load_timestamp", TimestampType())   # when the load was recorded
    .addColumn("Source_path", StringType())         # path the data was read from
    .addColumn("record_count", IntegerType())       # number of rows loaded
    .addColumn("Status", StringType())              # load outcome
    .addColumn("error_message", StringType())       # exception text, if any
    .addColumn("duration", FloatType())             # elapsed time of the load
    .execute()
)

StatementMeta(, 248d8df2-04ec-42c7-b79e-07314a63c49b, 3, Finished, Available, Finished)

<delta.tables.DeltaTable at 0x7fda1893be90>

### Ingesting ZIP file from GitHub (orders.zip)

In [5]:
from pyspark.sql.types import *
import requests, zipfile, os, math
from datetime import datetime
from pyspark.sql.types import *
from pyspark.sql.functions import * 
import time

### orders.zip download ###

zip_url = "https://github.com/MicrosoftLearning/dp-data/raw/main/orders.zip"
local_zip_path = "/lakehouse/default/Files/bronze/orders.zip"
extracted_path = "/lakehouse/default/Files/bronze/"

# makedirs(exist_ok=True) keeps this cell re-runnable: os.mkdir raises
# FileExistsError once the bronze folder exists and cannot create missing
# parent folders.
os.makedirs(extracted_path, exist_ok=True)

# A timeout stops a stalled connection from hanging the cell forever, and
# raise_for_status() fails right here on an HTTP error instead of saving the
# error page as orders.zip and failing later during extraction.
response = requests.get(zip_url, timeout=60)
response.raise_for_status()

with open(local_zip_path, "wb") as f:
    f.write(response.content)

### orders.zip extraction ###

# Unpack the archive next to the downloaded zip file.
with zipfile.ZipFile(local_zip_path, "r") as zip_ref:
    zip_ref.extractall(extracted_path)

# List the folder so the extracted files are visible in the cell output.
print(os.listdir(extracted_path))

### push csv files to bronze ###

# Explicit schema applied when reading the order CSV files.
# Each (column name, Spark type) pair becomes one StructField with the
# default nullability.
orderSchema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("SalesOrderNumber", StringType()),
        ("SalesOrderLineNumber", IntegerType()),
        ("OrderDate", DateType()),
        ("CustomerName", StringType()),
        ("Email", StringType()),
        ("Item", StringType()),
        ("Quantity", IntegerType()),
        ("UnitPrice", FloatType()),
        ("Tax", FloatType()),
    )
])

# Load the CSV files into the bronze table and capture the outcome
# (row count, status, error text, elapsed seconds) for the monitoring row.
start_time = time.time()
load_path = "Files/bronze/*.csv"
table_name = "orders_bronze"

try:
    df = spark.read.format("csv").option("header", "true").schema(orderSchema).load(load_path)
    # Write to table_name rather than a repeated literal so the table that is
    # written and the table reported in the monitoring row cannot diverge.
    df.write.mode("overwrite").saveAsTable(table_name)

    record_count = df.count()
    load_status = "SUCCESS"
    error_message = None

except Exception as e:
    # Deliberately not re-raised: the failure is recorded in the monitoring
    # row instead. It is also printed so a failed load is visible in the
    # cell output and not only in the monitoring table.
    record_count = 0
    load_status = "FAILURE"
    error_message = str(e)
    print(f"Load of {table_name} from {load_path} failed: {error_message}")

end_time = time.time()
# Elapsed wall-clock seconds, measured around the whole try/except.
duration = end_time - start_time

# Schema of the single audit row describing the load that just ran.
monitoring_schema = (
    StructType()
    .add("load_timestamp", TimestampType())
    .add("Table_name", StringType())
    .add("Source_path", StringType())
    .add("record_count", IntegerType())
    .add("Status", StringType())
    .add("error_message", StringType())
    .add("duration", FloatType())
)

# One tuple, with values in the same order as the schema fields above.
monitoring_data = [
    (
        datetime.now(),
        table_name,
        load_path,
        record_count,
        load_status,
        error_message,
        duration,
    )
]

monitoring_df = spark.createDataFrame(monitoring_data, schema=monitoring_schema)

# Append so that earlier monitoring rows are kept.
monitoring_writer = monitoring_df.write.mode("append")
monitoring_writer.saveAsTable("monitoring_etl")

StatementMeta(, 248d8df2-04ec-42c7-b79e-07314a63c49b, 6, Finished, Available, Finished)

['2019.csv', '2021.csv', '2020.csv', 'orders.zip']
