Config table columns:
1. Process_id
2. Source_bucket_path
3. Source_file_format 
4. Source_file_pattern
5. Sink_delta_table
6. Column_delimeter
7. Preprocessing_steps - JSON string (e.g. unzip, pgp, etc.)
8. Min_number_of_records
9. Max_latency_hours
10. Active
11. checkpoint location

In [0]:
## Option sets for the streaming reader and writer

# Keyword options unpacked into the streaming reader via ``.options(**...)``.
spark_read_options = dict(
    inferSchema=True,
    sep=",",
    header=True,
    multiLine=True,
    maxFilesPerTrigger=1,
    mergeSchema=True,
)

# Sink settings (output mode and storage format). In the visible code these
# are only referenced from commented-out lines.
spark_write_options = dict(OutputMode="append", format="delta")

In [0]:
# Enable Delta schema auto-merge for this Spark session.
# (Plain string literal: the statement has no placeholders, so no f-string is needed.)
spark.sql("SET spark.databricks.delta.schema.autoMerge.enabled = true")

Out[4]: DataFrame[key: string, value: string]

In [0]:
%run /vs_unified_streaming-Test/unified_delta_writer

In [0]:
import os
import pathlib
import fnmatch
import pandas as pd
import boto3
import delta
import pyspark.sql.functions as F
from pyspark.sql.functions import input_file_name
import pyspark.sql.types as T 
from datetime import datetime

class Process(deltawriter):
    """Config-driven streaming ingestion process built on ``deltawriter``.

    An instance is configured from the first active row of the configuration
    table (``config_delta_table``) that matches ``process_id``.
    ``trigger_streams`` builds the streaming read and an un-started
    ``foreachBatch`` writer; ``trigger_deltawriter`` is the per-batch callback
    that runs the bronze/silver/gold steps inherited from ``deltawriter``.

    Note that the ``deltawriter`` initialiser is NOT called from ``__init__``;
    it is invoked once per micro-batch inside ``trigger_deltawriter``.
    """

    def __init__(self, process_id, dq_table):
        """Load the active configuration row for ``process_id``.

        Args:
            process_id: Identifier compared, as a string, against the
                ``process_id`` column of the configuration table.
            dq_table: Stored on the instance and forwarded to the
                ``deltawriter`` initialiser for every micro-batch.

        Raises:
            IndexError: If the configuration table has no row with
                ``active == 'y'`` for ``process_id``.
        """
        # Column expressions are used instead of an interpolated filter string
        # so that a process_id containing quotes cannot break the predicate.
        active_rows = (
            spark.sql(f"select * from {config_delta_table}")
            .filter(F.col("active") == "y")
            .filter(F.col("process_id") == str(process_id))
            .collect()
        )
        if not active_rows:
            raise IndexError(
                f"no active configuration found for process_id {process_id!r} "
                f"in {config_delta_table}"
            )
        config_list = active_rows[0]

        # Validation steps applied per batch and the method name each maps to.
        # 'WITHOUT_SPECIAL_CHARS' and 'NUMBER_FORMAT' are mapped but not enabled.
        self.validation_seq = ['UPPER', 'CONVERT_DTTM_FORMAT']
        self.method_map = {
            'UPPER': 'to_upper',
            'WITHOUT_SPECIAL_CHARS': 'to_without_special_chars',
            'NUMBER_FORMAT': 'to_number_format',
            'CONVERT_DTTM_FORMAT': 'to_convert_dttm_format',
        }

        # Properties copied from the configuration row.
        self.process_id = process_id
        self.source_bucket_path = config_list.source_bucket_path
        self.source_file_format = config_list.source_file_format
        self.source_file_pattern = config_list.source_file_pattern
        self.sink_delta_table = config_list.sink_delta_table
        # NOTE(review): column_delimeter is stored but the reader still uses the
        # hard-coded "sep" from spark_read_options - confirm which one is intended.
        self.column_delimeter = config_list.column_delimeter
        self.groupbykey = config_list.groupbykey
        self.count_col = config_list.count_col
        # Per the config-table notes: JSON string of steps (unzip, pgp, etc.).
        self.preprocessing_steps = config_list.preprocessing_steps
        self.min_number_of_records = config_list.min_number_of_records
        self.max_latency_hours = config_list.max_latency_hours
        self.checkpoint = config_list.checkpoint
        # gen_schema() reads source_bucket_path / source_file_pattern, so it must
        # run after those attributes are assigned.
        self.input_schema = self.gen_schema()
        self.dq_table = dq_table

    def gen_schema(self):
        """Infer the input schema with a batch read of the source files.

        Prints ``source_file_pattern``, then loads
        ``source_bucket_path + source_file_pattern`` as CSV with a header row
        and schema inference enabled.

        Returns:
            The ``schema`` of the resulting DataFrame.
        """
        print(self.source_file_pattern)
        # NOTE(review): the format is fixed to 'csv' here, whereas the stream in
        # trigger_streams uses self.source_file_format - confirm this is intended.
        sample_df = spark.read.load(
            f"{self.source_bucket_path}{self.source_file_pattern}",
            header='true',
            format='csv',
            inferSchema='true',
        )
        return sample_df.schema

    def trigger_deltawriter(self, batchDf, batchID):
        """Process one micro-batch; used as the ``foreachBatch`` callback.

        Re-initialises the ``deltawriter`` base with the batch, then runs the
        bronze steps followed by the silver checks and the silver/gold merges.

        Args:
            batchDf: DataFrame holding the rows of the current micro-batch.
            batchID: Identifier of the current micro-batch.
        """
        super().__init__(
            batchDf,
            batchID,
            self.process_id,
            self.sink_delta_table,
            self.dq_table,
            self.validation_seq,
            self.method_map,
            self.groupbykey,
            self.count_col,
        )
        # self.batchdf is presumably set by the deltawriter initialiser above.
        self.batchdf.persist()
        try:
            # Bronze
            self.append_logging_delta()
            self.writeDeltaBronze()
            self.create_register_delta("silver")

            # Silver / gold
            df = self.pk_check()
            df = self.null_check(df)
            df = self.modify_execute_timestamp(df)
            df = self.apply_dq_checks(df)
            self.deduplicate_merge_silver(df)
            self.deduplicate_merge_gold(df)
        finally:
            # Release the cached batch even when one of the steps above fails,
            # so a failing batch does not leave data pinned in the cache.
            self.batchdf.unpersist()

    def trigger_streams(self):
        """Build the streaming source and the (un-started) batch writer.

        The source reads ``source_bucket_path + source_file_pattern`` using the
        configured format, the inferred schema and ``spark_read_options``, and
        appends the metadata columns ``_filename``, ``_unique_id``,
        ``_execute_timestamp`` and ``_record_type``.

        Returns:
            A ``(query, input_stream_df)`` tuple. ``query`` is the configured
            stream writer; it is not started here, so the caller must start it.
        """
        input_stream_df = (
            spark
            .readStream
            .format(self.source_file_format)
            .schema(self.input_schema)
            .options(**spark_read_options)
            .load(self.source_bucket_path + self.source_file_pattern)
            .select(
                '*',
                input_file_name().alias("_filename"),
                F.expr("uuid()").alias("_unique_id"),
                # current_timestamp() is evaluated when each batch executes; a
                # Python datetime.now() literal would be frozen at the moment
                # the stream is defined and repeated for every later batch.
                F.current_timestamp().alias("_execute_timestamp"),
                F.lit("bronze").cast(T.StringType()).alias("_record_type"),
            )
        )

        # Checkpoint path: <checkpoint>/bronze/<process_id>
        checkpoint_path = os.path.join(self.checkpoint, "bronze", self.process_id)
        query = (
            input_stream_df
            .writeStream
            .foreachBatch(self.trigger_deltawriter)
            .queryName(self.process_id)
            .option("checkpointLocation", checkpoint_path)
        )
        return (query, input_stream_df)

