In [0]:
 from pyspark.sql.functions import *
 from pyspark.sql.types import *

In [0]:
dbutils.widgets.text("incremental_flag",'0')

# CREATE FLAG PARAMETER

In [0]:
incremental_flag = dbutils.widgets.get("incremental_flag")


# CREATING DIMENSION MODEL


### Fetch Relative Columns

In [0]:
# Selecting only the relevant columns from the silver table.
df_src = spark.sql('''select DISTINCT(Date_ID) as Date_ID FROM parquet.`abfss://silver@sumitdatalake.dfs.core.windows.net/carsales`
        ''')

df_src.display()

Date_ID
DT00029
DT00140
DT00192
DT00444
DT00475
DT00947
DT00976
DT01028
DT01099
DT00657


### dim_date Sink - Initial and Incremental (Just Bring the Schema if table NOT EXISTS)


In [0]:
# This is for initial load and incremental loads
if spark.catalog.tableExists('cars_catalog.gold.dim_date') :
    df_sink = spark.sql('''
    SELECT dim_date_key, Date_ID FROM cars_catalog.gold.dim_date
    ''')

else :    
    df_sink = spark.sql('''
    SELECT 1 AS dim_date_key, Date_ID FROM parquet.`abfss://silver@sumitdatalake.dfs.core.windows.net/carsales` WHERE 1 = 0
    ''')


### Filtering new records and old records 

In [0]:
df_filter = df_src.join(df_sink, df_src['Date_ID'] == df_sink['Date_ID'], 'left').select(df_src['Date_ID'], df_sink['dim_date_key'])

In [0]:
df_filter.display()

Date_ID,dim_date_key
DT00029,
DT00140,
DT00192,
DT00444,
DT00475,
DT00947,
DT00976,
DT01028,
DT01099,
DT00657,


### df_filter_old


In [0]:
df_filter_old = df_filter.filter(col('dim_date_key').isNotNull())

In [0]:
df_filter_old.display()

Date_ID,dim_date_key


### df_filter_new


In [0]:
df_filter_new = df_filter.filter(col('dim_date_key').isNull()).select(df_src['Date_ID'])

In [0]:
df_filter_new.display()

Date_ID
DT00029
DT00140
DT00192
DT00444
DT00475
DT00947
DT00976
DT01028
DT01099
DT00657


## Create Surrogate Key

**Fetch the max surrogate key from existing table**

In [0]:
if(incremental_flag == '0'):
    max_value = 1 
else:
    max_value_df = spark.sql("select max(dim_date_key) from cars_catalog.gold.dim_date")
    max_value = max_value_df.collect()[0][0]+1


**Create Surrogate key column and ADD the max surrogate key**

In [0]:
df_filter_new = df_filter_new.withColumn('dim_date_key',max_value+monotonically_increasing_id())

In [0]:
df_filter_new.display()

Date_ID,dim_date_key
DT00029,1
DT00140,2
DT00192,3
DT00444,4
DT00475,5
DT00947,6
DT00976,7
DT01028,8
DT01099,9
DT00657,10


### Create Final DF - df_filter_old + df_filter_new

In [0]:
df_final = df_filter_old.union(df_filter_new)
df_final.display()

Date_ID,dim_date_key
DT00029,1
DT00140,2
DT00192,3
DT00444,4
DT00475,5
DT00947,6
DT00976,7
DT01028,8
DT01099,9
DT00657,10


## SCD TYPE -1(UPSERT)

In [0]:
from delta.tables import DeltaTable

In [0]:
#Incremental Run
if spark.catalog.tableExists('cars_catalog.gold.dim_date'):
    delta_tbl = DeltaTable.forPath(spark,"abfss://gold@sumitdatalake.dfs.core.windows.net/dim_date")

    delta_tbl.alias("trg").merge(df_final.alias("src"), "trg.dim_date_key = src.dim_date_key")\
        .whenMatchedUpdateAll()\
        .whenNotMatchedInsertAll()\
        .execute()


#Initial RUN
else:
    df_final.write.format('delta')\
    .mode('overwrite')\
    .option("path","abfss://gold@sumitdatalake.dfs.core.windows.net/dim_date")\
    .saveAsTable('cars_catalog.gold.dim_date')

In [0]:
%sql
select * from cars_catalog.gold.dim_date

Date_ID,dim_date_key
DT00029,1
DT00140,2
DT00192,3
DT00444,4
DT00475,5
DT00947,6
DT00976,7
DT01028,8
DT01099,9
DT00657,10


In [0]:
%sql
select * from cars_catalog.gold.factsales

Revenue,Units_Sold,RevPerUnit,dim_branch_key,dim_dealer_key,dim_date_key,dim_model_key
13363978,2,26727956,418,6,825,155
17376468,3,52129404,1557,197,825,252
9664767,3,28994301,1058,104,752,199
5525304,3,16575912,789,95,752,183
12971088,3,38913264,497,231,882,106
7321228,1,7321228,1804,41,988,41
11379294,2,22758588,734,177,988,107
11611234,2,23222468,1211,182,1043,110
19979446,2,39958892,116,204,1043,185
14181510,3,42544530,116,160,826,238
