In [0]:
import json
import requests
from io import BytesIO
import datetime
from pyspark.sql.types import StructField, IntegerType, StringType, StructType, DoubleType, DateType, DecimalType, BooleanType, TimestampType

In [0]:
# Secret scope that every lookup below reads from (KeyVault variables):
secret_scope = "dbscope"

# Storage secrets: account name, SAS token and container name.
storage_account_name = dbutils.secrets.get(scope=secret_scope, key="storageAccountName")
sas_token = dbutils.secrets.get(scope=secret_scope, key="sastoken")
container_name = dbutils.secrets.get(scope=secret_scope, key="containerName")

# SQL secrets: server, the metadata and totesys database names, and the login.
server_name = dbutils.secrets.get(scope=secret_scope, key="serverName")
metadata_db = dbutils.secrets.get(scope=secret_scope, key="metadataDatabaseName")
totesys_db = dbutils.secrets.get(scope=secret_scope, key="totesysDatabaseName")
db_user = dbutils.secrets.get(scope=secret_scope, key="sqlUser")
db_password = dbutils.secrets.get(scope=secret_scope, key="sqlPassword")

In [0]:
# Spark configuration: use the fixed SAS token from the secret scope to
# authenticate against the storage account's dfs endpoint.
spark.conf.set(
    f"fs.azure.account.auth.type.{storage_account_name}.dfs.core.windows.net",
    "SAS",
)
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{storage_account_name}.dfs.core.windows.net",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider",
)
spark.conf.set(
    f"fs.azure.sas.fixed.token.{storage_account_name}.dfs.core.windows.net",
    sas_token,
)

In [0]:
# Root folder of each data lake layer:
BRONZE = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/framework-j1/BRONZE"
)
SILVER = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/framework-j1/SILVER"
)
GOLD = (
    f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net/framework-j1/GOLD"
)

# Year/month/day path fragment. It is computed once, when this cell runs,
# from the local clock of the machine executing the notebook.
date_path = datetime.datetime.now().strftime("%Y/%m/%d")

In [0]:
# Maps a metadata 'dataType' string to the Spark SQL type used when casting
# dataframe columns.
type_dict = {
    'int': IntegerType(),
    'string': StringType(),
    # DecimalType() without arguments is decimal(10, 0).
    'decimal': DecimalType(),
    'boolean': BooleanType(),
    # NOTE(review): 'datetime' maps to DateType, the same as 'date', so a cast
    # drops the time-of-day - confirm this is intended.
    'datetime': DateType(),
    'date': DateType(),
    'timestamp': TimestampType(),
    'decimal(10,2)': DecimalType(10, 2),
}

In [0]:
def query_entityNames(query):
  """
  Run `query` against the metadata database and return the value of the
  'entityNames' column from the first row of the result.

  Args:
    query: SQL text passed to the reader's "query" option.

  Returns:
    The 'entityNames' field of the first collected row.
  """
  # Connection settings, applied to the reader in this order:
  connection_options = (
    ("host", server_name),
    ("port", "1433"),
    ("user", db_user),
    ("password", db_password),
    ("database", metadata_db),
    ("query", query),
  )

  reader = spark.read.format("sqlserver")
  for option_name, option_value in connection_options:
    reader = reader.option(option_name, option_value)

  rows = reader.load().collect()
  return rows[0]['entityNames']

def query_sourceEntity(query):
  """
  Run `query` against the metadata database and return every row.

  Args:
    query: SQL text passed to the reader's "query" option.

  Returns:
    The list produced by collecting the loaded dataframe.
  """
  settings = {
    "host": server_name,
    "port": "1433",
    "user": db_user,
    "password": db_password,
    "database": metadata_db,
    "query": query,
  }

  reader = spark.read.format("sqlserver")
  for key in settings:
    reader = reader.option(key, settings[key])

  return reader.load().collect()

def query_totesys(query):
  """
  Run `query` against the totesys database.

  Args:
    query: SQL text passed to the reader's "query" option.

  Returns:
    The dataframe returned by the reader's load() (it is not collected).
  """
  reader = spark.read.format("sqlserver")
  reader = reader.option("host", server_name)
  reader = reader.option("port", "1433")
  reader = reader.option("user", db_user)
  reader = reader.option("password", db_password)
  reader = reader.option("database", totesys_db)
  reader = reader.option("query", query)
  return reader.load()

def entity_bronze(e):
  """
  Return the bronze location recorded for an entity.

  Selects `bronzeLocation` from the `sourceEntity` table of the metadata
  database for the row whose `entityName` equals `e`.

  Args:
    e: Entity name to look up.

  Returns:
    The 'bronzeLocation' value of the first matching row.

  Raises:
    IndexError: If the query returns no row for `e`.
  """
  # Double any single quote so a name such as O'Brien stays inside the SQL
  # string literal instead of terminating it.
  entity_literal = f"{e}".replace("'", "''")
  q = f"""
  SELECT bronzeLocation FROM sourceEntity
  WHERE entityName = '{entity_literal}'
  """
  rows = (spark.read
    .format("sqlserver")
    .option("host", server_name)
    .option("port", "1433")
    .option("user", db_user)
    .option("password", db_password)
    .option("database", metadata_db)
    .option("query", q)
    .load()
  ).collect()

  # Same exception type as indexing an empty list, but naming the entity.
  if not rows:
    raise IndexError(f"No sourceEntity row found for entityName {e!r}")
  return rows[0]['bronzeLocation']

def entity_silver(e):
  """
  Return the silver location recorded for an entity.

  Selects `silverLocation` from the `sourceEntity` table of the metadata
  database for the row whose `entityName` equals `e`.

  Args:
    e: Entity name to look up.

  Returns:
    The 'silverLocation' value of the first matching row.

  Raises:
    IndexError: If the query returns no row for `e`.
  """
  # Double any single quote so a name such as O'Brien stays inside the SQL
  # string literal instead of terminating it.
  entity_literal = f"{e}".replace("'", "''")
  q = f"""
  SELECT silverLocation FROM sourceEntity
  WHERE entityName = '{entity_literal}'
  """
  rows = (spark.read
    .format("sqlserver")
    .option("host", server_name)
    .option("port", "1433")
    .option("user", db_user)
    .option("password", db_password)
    .option("database", metadata_db)
    .option("query", q)
    .load()
  ).collect()

  # Same exception type as indexing an empty list, but naming the entity.
  if not rows:
    raise IndexError(f"No sourceEntity row found for entityName {e!r}")
  return rows[0]['silverLocation']

In [0]:
def drop_unwanted_columns_transformation(df):
    """
    Drop every column that the column metadata marks as not required.

    Reads the global `columns`: an iterable of entries, each providing a
    'columnName' and a 'required' value. Each entry whose 'required' value
    is falsy has its column dropped from the dataframe.

    Args: dataframe
    Returns: dataframe
    """
    # Collect the names to remove first, then drop them one at a time:
    unwanted = []
    for entry in columns:
        if entry['required']:
            continue
        unwanted.append(entry['columnName'])

    result = df
    for name in unwanted:
        result = result.drop(name)
    return result

def cast_data_types_transformation(df):
    """
    Cast dataframe columns to the types named in the column metadata.

    Reads the global `columns` (entries with 'columnName' and 'dataType')
    and the global `type_dict`, which maps each 'dataType' string to the
    type passed to cast(). Entries whose column is not in the incoming
    dataframe are skipped.

    Args: dataframe
    Returns: dataframe
    """
    # Column names are taken once, from the dataframe as it was passed in:
    present = df.columns

    for spec in columns:
        name = spec['columnName']
        if name not in present:
            continue
        source_column = df[name]
        target_type = type_dict[spec['dataType']]
        df = df.withColumn(name, source_column.cast(target_type))

    return df

def drop_duplicates_transformation(df):
    """
    Drop rows that are duplicates on every non-primary-key column.

    Reads the global `columns` (entries with 'columnName' and
    'primary_key'). The comparison subset is every metadata column that is
    present in the dataframe and whose 'primary_key' value is falsy.

    Arg: dataframe
    Returns: dataframe. The input is returned unchanged when the subset is
        empty.
    """
    # Determine current dataframe columns:
    df_columns = df.columns
    print('Dataframe columns; ', df_columns)

    # Columns to compare: in the dataframe and not a primary key.
    duplicate_columns = [
        x['columnName']
        for x in columns
        if x['columnName'] in df_columns and not x['primary_key']
    ]

    # With nothing to compare on there is no basis for calling two rows
    # duplicates. Skip the call rather than pass an empty subset, which Spark
    # does not treat as "leave the rows alone" (depending on the version it
    # can group all rows together or compare on every column).
    if not duplicate_columns:
        return df

    return df.dropDuplicates(duplicate_columns)

In [0]:
def create_struct(obj):
    """
    Build a schema (StructType) from column metadata.

    Args:
        obj: Iterable of entries, each providing a 'columnName' and a
            'dataType' whose value is a key of the module-level `type_dict`.

    Returns:
        A StructType holding one nullable StructField per entry, in the
        order the entries were given.

    Raises:
        KeyError: If an entry lacks one of the two keys, or its 'dataType'
            is not a key of `type_dict`.
    """
    # Resolve types through the module-level `type_dict` rather than a private
    # copy, so every dataType the cast step accepts - including 'datetime'
    # and 'decimal(10,2)' - is accepted here too and the two cannot drift.
    return StructType([
        StructField(value['columnName'], type_dict[value['dataType']], True)
        for value in obj
    ])