In [0]:
# Turn on Delta schema auto-merge for this Spark session.
# It was added to resolve a data type issue seen with the webform data.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled", "True")

In [0]:
import pyspark.sql
from pyspark.sql.functions import udf,split, explode
import pyspark.sql.functions as F
import pyspark.sql.types
from pyspark.sql.functions import col, when, lit, coalesce, count 
from pyspark.sql.window import Window
import sys
from typing import Dict, List, Any
import pandas
import json
import datetime

In [0]:
%run Shared/functions/Functions_delta

In [0]:
# Notebook widgets: path of the JSON parameter file, plus the input and output partition values.
input_file = dbutils.widgets.get("input_file")
input_partition = dbutils.widgets.get("input_partition")

# A blank (empty or whitespace-only) out_partition widget falls back to the input partition.
requested_out_partition = dbutils.widgets.get("out_partition")
if requested_out_partition.strip():
  out_partition = requested_out_partition
else:
  out_partition = input_partition

# Read the parameter file and parse its content as JSON.
with open(input_file, 'r') as param_file:
  data = param_file.read()
jsonObject = json.loads(data)

# Job parameters are read from the "stage" object of the parameter file.
inputs: Dict[str, Any] = jsonObject["stage"]


def _required_input(key):
  """Return inputs[key]; if the lookup fails, print "<key> is mandatory" and re-raise the error."""
  try:
    return inputs[key]
  except Exception:
    print(key + " is mandatory")
    raise


def _optional_input(key, default):
  """Return inputs[key], or `default` when the lookup fails (e.g. the key is absent)."""
  try:
    return inputs[key]
  except Exception:
    return default


def _optional_text_input(key, default):
  """Return inputs[key], or `default` when it is absent, empty/whitespace-only, or cannot be stripped."""
  try:
    value = inputs[key]
    return value if value.strip() else default
  except Exception:
    return default


# Mandatory parameters: the notebook stops here when one of them is missing.
tableName = _required_input("tableName")  # table name
tableAppend = _optional_input("tableAppend", 'N')
adls = _required_input("adls")  # adls path eg. dlenterpriseanalytics.dfs.core.windows.net

# Optional parameters with their defaults.
adlsfolder = _optional_input("adlsfolder", "")  # The adls folder in raw container
inferSchema = _optional_input("inferSchema", 'False')  # forwarded as the readers' inferSchema setting
delimiter = _optional_input("delimiter", ",")  # Delimiter of the file
file_type = _optional_input("file_type", "CSV")  # CSV, JSON, Excel, etc.
tsFormat = _optional_input("tsFormat", "yyyy-MM-dd HH:mm:ss.S")  # timestamp format
dateFormat = _optional_input("dateFormat", "yyyy-MM-dd")  # date format
filename = _optional_input("filename", "False")  # True if a column with the source file name is needed

# True when the input CSV file contains records spread over multiple lines.
# Use escape_char as \" together with multiline True in such a scenario.
multiline = _optional_text_input("multiline", "False")

# These two options are read as a pair: if EITHER key is missing, BOTH fall back to "N".
#   createPartitionformName == "Y" -> the output partition is derived from the file name (Raven files).
#   moveLatestfile == "Y"          -> the raw file is moved to the archive path once processing is done.
try:
  createPartitionformName = inputs["createPartitionformName"]
  moveLatestfile = inputs["moveLatestfile"]
except Exception:
  createPartitionformName = "N"
  moveLatestfile = "N"

# Character options: "N" is used as the default when the option is absent or blank.
replace_char = _optional_text_input("replace_char", "N")
quote_char = _optional_text_input("quote_char", "N")
escape_char = _optional_text_input("escape_char", "N")

# Cell reference appended to the quoted sheet name when reading a specific Excel sheet.
cell_ref = _optional_input('cell_ref', "!A1")
  
# Build the raw (source) location and the stage (Delta target) location.
# Excel workbooks are addressed through the dbfs:/mnt/raw mount; every other supported
# file type is addressed through the abfss "raw" container.
# NOTE(review): some of the paths below put "/" between adlsfolder and tableName and some
# do not - confirm the expected adlsfolder format (with or without a trailing slash).
if file_type.upper() not in ['MONGOJSON','CSV','EXCEL','JSON','MONGOJSONFORMCONFIG']:
  print("Invalid file type")
  # A bare `raise` outside an except block has nothing to re-raise, so raise explicitly.
  raise ValueError("Invalid file type: " + str(file_type))

if file_type.upper()=='EXCEL':
  excel_name = inputs["excel_name"]
  sheetname = inputs["sheetname"] #if only one sheet then sheetname = []
  if len(input_partition) != 0:
    # Partitioned layout: <adlsfolder><tableName>/<input_partition>/<excel_name>
    raw = 'dbfs:/mnt/raw/' +adlsfolder+tableName+"/"+input_partition + "/"+  excel_name
  elif tableName == excel_name.split(".")[0]:
    # Workbook named after the table: it sits directly in the adls folder as <tableName>.xlsx
    raw = 'dbfs:/mnt/raw/' +adlsfolder+ "/"+tableName+".xlsx"
  else:
    # Otherwise the workbook sits in a folder named after the table.
    raw = 'dbfs:/mnt/raw/' +adlsfolder+ "/"+tableName+"/"+  excel_name
else:
  raw = "abfss://raw@"+adls+ "/" +adlsfolder+ "/"+tableName+"/"+input_partition

stage = "abfss://deltastage@"+adls+ "/" +adlsfolder+tableName
##This code and option is specially added to process Raven files and should not be used for any other source
if createPartitionformName=="Y":
  # `strptime` is a method of the datetime.datetime class, not of the `datetime` module that the
  # notebook imports. Importing the class under a private alias keeps this block working no
  # matter what the global name `datetime` is bound to.
  from datetime import datetime as _datetime_cls

  mount_req = "dbfs:/mnt/"+tableName
  lists = dbutils.fs.ls(mount_req)
  # Only the first entry of the listing is processed.
  path=lists[0].path
  fileName=path[path.rfind('/')+1:]
  # The timestamp is the text between the last "_" and ".json", minus its last two characters.
  dateStr=path[path.rfind('_')+1:path.rfind('.json')-2]
  fileDatetime=_datetime_cls.strptime(dateStr, '%Y-%m-%dT%H:%M:%S.%f')
  # Output partition is derived from the file timestamp (values are not zero-padded).
  out_partition="year="+str(fileDatetime.year)+"/month="+str(fileDatetime.month)+"/date="+str(fileDatetime.day)+"/hour="+str(fileDatetime.hour)
  # The archive target is built from the original raw path before raw is re-pointed at the mounted file.
  archive=raw+"archive/"+fileName
  raw=mount_req+"/"+fileName

In [0]:
# Stage the raw input as a Delta table; the branch is selected by file_type (case-insensitive).
# readcsv, pre_stage, pre_stage_formconfig and add_partition are not defined in this notebook;
# presumably they come from the %run of Shared/functions/Functions_delta - confirm there.

def _excel_reader():
  """Return a new spark-excel reader with the header and inferSchema options already applied."""
  return (spark.read.format("com.crealytics.spark.excel")
          .option("header", "true")
          .option("inferSchema", inferSchema))


def _overwrite_stage_table(frame, partitions, overwrite_schema):
  """Overwrite the Delta table deltastage.<tableName without spaces> stored at the current `stage` path.

  frame            -- DataFrame to write
  partitions       -- partition columns; an empty list writes an unpartitioned table
  overwrite_schema -- when True the overwriteSchema=true writer option is added
  """
  writer = frame.write.format("delta").mode("overwrite")
  if len(partitions) != 0:
    writer = writer.partitionBy(partitions)
  writer = writer.option("path", stage)
  if overwrite_schema:
    writer = writer.option("overwriteSchema", "true")
  writer.saveAsTable("deltastage." + tableName.replace(" ", ""))


source_kind = file_type.upper()

if source_kind == 'CSV':
  # CSV: schema inference, delimiter, quote/escape characters and multiline are all driven by the parameters.
  df_incoming = readcsv(inferSchema, delimiter, quote_char, escape_char, multiline, raw)
  df_stage = pre_stage(df_incoming, tableAppend, tableName=tableName, dateFormat=dateFormat,
                       tsFormat=tsFormat, filename=filename)

elif source_kind == 'EXCEL':
  # Excel: one workbook may carry several sheets, each staged as its own table.
  # Sheets are expected to be tabular and to conform to the schema.
  if len(sheetname) == 0:
    # No sheet list given: read the workbook without a dataAddress and stage it under tableName.
    df_incoming = _excel_reader().load(raw)
    df_stage = pre_stage(df_incoming, tableAppend, tableName=tableName, dateFormat=dateFormat,
                         tsFormat=tsFormat)
  else:
    for sheet in sheetname:
      # Each sheet gets its own stage folder <tableName>/<sheet name without spaces>.
      stage = "abfss://deltastage@" + adls + "/" + adlsfolder + "/" + tableName + "/" + sheet.replace(" ", "")
      # dataAddress is the quoted sheet name followed by the cell reference.
      ref = f"'{sheet}'" + cell_ref
      df_incoming = _excel_reader().option('dataAddress', ref).load(raw)
      # The table name passed on is <tableName>_<sheet>.
      df_stage = pre_stage(df_incoming, tableAppend, tableName=tableName + "_" + sheet,
                           dateFormat=dateFormat, tsFormat=tsFormat)

elif source_kind == 'MONGOJSON':
  # Mongo data for Neutrinos (form_data): the JSON is written straight to the stage table.
  df_incoming = spark.read.json(raw)
  if len(out_partition) != 0:
    df_incoming, stg_partition = add_partition(out_partition, df_incoming)
    # Use the incoming schema instead of the existing one: the form_data schema changes
    # whenever a new form field value is created in the webform.
    schema = df_incoming.schema
  else:
    stg_partition = []
  _overwrite_stage_table(df_incoming, stg_partition, overwrite_schema=True)

elif source_kind == 'MONGOJSONFORMCONFIG':
  # Only for formconfiguration (explodes the array column); the input is read with the CSV reader.
  df_incoming = readcsv(inferSchema, delimiter, quote_char, escape_char, multiline, raw)
  df_stage = pre_stage_formconfig(df_incoming, tableAppend, tableName=tableName, dateFormat=dateFormat,
                                  tsFormat=tsFormat, filename=filename)

elif source_kind == 'JSON':
  # Added only for the Raven JSON type (to be generalised later); do not use it for other sources.
  # Each element of the "trips" array becomes one row, with its fields as columns.
  df_incoming = spark.read.json(raw).select(explode("trips").alias("trips")).select("trips.*")
  if len(out_partition) != 0:
    df_incoming, stg_partition = add_partition(out_partition, df_incoming)
  else:
    stg_partition = []

  stage = "abfss://deltastage@" + adls + "/" + adlsfolder + tableName
  _overwrite_stage_table(df_incoming, stg_partition, overwrite_schema=False)

  spark.sql("refresh table deltastage.`" + tableName.replace(" ", "") + "`")
  
# Move the processed raw file to its archive location when requested.
if moveLatestfile=="Y":
  # `archive` is only assigned while deriving the partition from the file name
  # (createPartitionformName == "Y"); without it there is no target path, so fail with a
  # clear message instead of an undefined-name error.
  if createPartitionformName != "Y":
    raise ValueError('moveLatestfile="Y" requires createPartitionformName="Y": no archive path was computed')
  dbutils.fs.mv(raw,archive)
