In [0]:
from pyspark.sql import functions as F
from delta.tables import DeltaTable

In [0]:
%run /Workspace/Users/himabindut9715@gmail.com/databricks_project/1_setup/utilities

In [0]:
print(bronze_schema,silver_schema,gold_schema)

bronze silver gold


In [0]:
dbutils.widgets.text("catalog","fmcg","Catalog")
dbutils.widgets.text("data_source","customers","Data Source")

In [0]:
catalog= dbutils.widgets.get("catalog")
data_source=dbutils.widgets.get("data_source")

base_path=f's3://sports-bar-hb/{data_source}/*.csv'
print(base_path)

s3://sports-bar-hb/customers/*.csv


In [0]:
df=(spark.read.format("csv")
            .option("header",True)
            .option("inferSchema",True)
            .load(base_path)
            .withColumn("read_timestamp",F.current_timestamp())
            .select("*","_metadata.file_name","_metadata.file_size")
)
display(df.limit(10))


customer_id,customer_name,city,read_timestamp,file_name,file_size
789201,FitFuel Market,Bengaluru,2025-12-16T16:38:02.086Z,customers.csv,1404
789202,FitFuel Market,Hyderabad,2025-12-16T16:38:02.086Z,customers.csv,1404
789203,FitFuel Market,New Delhi,2025-12-16T16:38:02.086Z,customers.csv,1404
789301,Athlete's Choice Store,Bengaluru,2025-12-16T16:38:02.086Z,customers.csv,1404
789303,Athlete's Choice Store,New Delhi,2025-12-16T16:38:02.086Z,customers.csv,1404
789101,Endurance Foods,Bengalore,2025-12-16T16:38:02.086Z,customers.csv,1404
789102,Endurance Foods,Hyderabad,2025-12-16T16:38:02.086Z,customers.csv,1404
789103,Endurance Foods,New Delhi,2025-12-16T16:38:02.086Z,customers.csv,1404
789121,HydroBoost Nutrition,Hyderabad,2025-12-16T16:38:02.086Z,customers.csv,1404
789122,HydroBoost Nutrition,New Delhi,2025-12-16T16:38:02.086Z,customers.csv,1404


In [0]:
df.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = false)
 |-- file_name: string (nullable = false)
 |-- file_size: long (nullable = false)



In [0]:
df.write\
    .format("delta")\
    .option("delta.enableChangeDataFeed","true") \
    .mode("overwrite") \
    .saveAsTable(f"{catalog}.{bronze_schema}.{data_source}")

### Silver Processing

In [0]:
df_bronze=spark.sql(f"select * from {catalog}.{bronze_schema}.{data_source};")
df_bronze.show(10)

+-----------+--------------------+---------+--------------------+-------------+---------+
|customer_id|       customer_name|     city|      read_timestamp|    file_name|file_size|
+-----------+--------------------+---------+--------------------+-------------+---------+
|     789201|      FitFuel Market|Bengaluru|2025-12-16 16:38:...|customers.csv|     1404|
|     789202|      FitFuel Market|Hyderabad|2025-12-16 16:38:...|customers.csv|     1404|
|     789203|      FitFuel Market|New Delhi|2025-12-16 16:38:...|customers.csv|     1404|
|     789301|Athlete's Choice ...|Bengaluru|2025-12-16 16:38:...|customers.csv|     1404|
|     789303|Athlete's Choice ...|New Delhi|2025-12-16 16:38:...|customers.csv|     1404|
|     789101|     Endurance Foods|Bengalore|2025-12-16 16:38:...|customers.csv|     1404|
|     789102|     Endurance Foods|Hyderabad|2025-12-16 16:38:...|customers.csv|     1404|
|     789103|     Endurance Foods|New Delhi|2025-12-16 16:38:...|customers.csv|     1404|
|     7891

In [0]:
df_bronze.printSchema()

root
 |-- customer_id: integer (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = true)
 |-- file_name: string (nullable = true)
 |-- file_size: long (nullable = true)



In [0]:
df_duplicates=df_bronze.groupby("customer_id").count().filter(F.col("count")>1)
display(df_duplicates)

customer_id,count
789321,2
789503,2
789522,2
789603,2


In [0]:
print('Rows before duplicates dropped: ',df_bronze.count())
df_silver=df_bronze.dropDuplicates(["customer_id"])
print('Rows after duplicates dropped: ',df_silver.count())

Rows before duplicates dropped:  39
Rows after duplicates dropped:  35


In [0]:
display(
    df_silver.filter(F.col("customer_name")!=F.trim(F.col("customer_name")))
)

customer_id,customer_name,city,read_timestamp,file_name,file_size
789121,HydroBoost Nutrition,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404
789401,SprintX nutrition,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404
789420,ZenAthlete foods,,2025-12-16T16:38:12.881Z,customers.csv,1404
789421,ZenAthlete Foods,Hyderbad,2025-12-16T16:38:12.881Z,customers.csv,1404
789521,PrimeFuel Nutrition,,2025-12-16T16:38:12.881Z,customers.csv,1404
789702,StaminaX Store,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404


In [0]:
df_silver=df_silver.withColumn("customer_name",F.trim(F.col("customer_name")))
display(df_silver)

customer_id,customer_name,city,read_timestamp,file_name,file_size
789201,FitFuel Market,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404
789202,FitFuel Market,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404
789203,FitFuel Market,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404
789301,Athlete's Choice Store,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404
789303,Athlete's Choice Store,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404
789101,Endurance Foods,Bengalore,2025-12-16T16:38:12.881Z,customers.csv,1404
789102,Endurance Foods,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404
789103,Endurance Foods,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404
789121,HydroBoost Nutrition,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404
789122,HydroBoost Nutrition,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404


In [0]:
df_silver.select('city').distinct().show()

+----------+
|      city|
+----------+
| Bengaluru|
| Hyderabad|
| New Delhi|
| Bengalore|
|Hyderabadd|
|      NULL|
|  Hyderbad|
| NewDelhee|
|  NewDelhi|
|Bengaluruu|
|  NewDheli|
+----------+



In [0]:
# typos=correct_names

city_mappings={
    'Bengaluruu':'Bengaluru',
    'Bengalore':'Bengaluru',
    
    'Hyderabadd':'Hyderbad',
    'Hyderbad':'Hyderabad',
    
    'NewDelhi':'New Delhi',
    'NewDelhee':'New Delhi',
    'NewDeli':'New Delhi'
    }

allowed = ['Bengaluru','Hyderabad','New Delhi']

df_silver=(
    df_silver
    .replace(city_mappings,subset=["city"])
    .withColumn("city",
           F.when(F.col("city").isNull(),None)
           .when(F.col("city").isin(allowed),F.col("city"))
           .otherwise(F.lit(None)))
        )
       

df_silver.select('city').distinct().show()




+---------+
|     city|
+---------+
|Bengaluru|
|Hyderabad|
|New Delhi|
|     NULL|
+---------+



In [0]:
df_silver.select('customer_name').distinct().show()

+--------------------+
|       customer_name|
+--------------------+
|      FitFuel Market|
|Athlete's Choice ...|
|     Endurance Foods|
|HydroBoost Nutrition|
|MacroBite Superfoods|
|MacroBite superfoods|
|      PowerSnack Hub|
|      PowerSnack hub|
|   SprintX nutrition|
|   SprintX Nutrition|
|    ZenAthlete foods|
|    ZenAthlete Foods|
|Peak performance ...|
|Peak Performance ...|
| PrimeFuel Nutrition|
|       Recovery Lane|
|      StaminaX Store|
|EliteAthlete Nutr...|
|      GamePlan Foods|
|   Champion's choice|
+--------------------+
only showing top 20 rows


In [0]:
df_silver=df_silver.withColumn(
    "customer_name",
    F.when(F.col("customer_name").isNull(),None)
    .otherwise(F.initcap("customer_name"))
)
df_silver.select("customer_name").distinct().show()

+--------------------+
|       customer_name|
+--------------------+
|      Fitfuel Market|
|Athlete's Choice ...|
|     Endurance Foods|
|Hydroboost Nutrition|
|Macrobite Superfoods|
|      Powersnack Hub|
|   Sprintx Nutrition|
|    Zenathlete Foods|
|Peak Performance ...|
| Primefuel Nutrition|
|       Recovery Lane|
|      Staminax Store|
|Eliteathlete Nutr...|
|      Gameplan Foods|
|   Champion's Choice|
+--------------------+



In [0]:
df_silver.filter(F.col("city").isNull()).show(truncate=False)

+-----------+--------------------+----+--------------------------+-------------+---------+
|customer_id|customer_name       |city|read_timestamp            |file_name    |file_size|
+-----------+--------------------+----+--------------------------+-------------+---------+
|789221     |Macrobite Superfoods|NULL|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789403     |Sprintx Nutrition   |NULL|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789420     |Zenathlete Foods    |NULL|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789521     |Primefuel Nutrition |NULL|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789522     |Primefuel Nutrition |NULL|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789603     |Recovery Lane       |NULL|2025-12-16 16:38:12.881387|customers.csv|1404     |
+-----------+--------------------+----+--------------------------+-------------+---------+



In [0]:
null_customer_names=['Sprintx Nutrition','Zenathlete Foods','Primefuel Nutrition','Recovery Lane']
df_silver.filter(F.col("customer_name").isin(null_customer_names)).show(truncate=False)

+-----------+-------------------+---------+--------------------------+-------------+---------+
|customer_id|customer_name      |city     |read_timestamp            |file_name    |file_size|
+-----------+-------------------+---------+--------------------------+-------------+---------+
|789401     |Sprintx Nutrition  |Bengaluru|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789402     |Sprintx Nutrition  |Hyderabad|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789403     |Sprintx Nutrition  |NULL     |2025-12-16 16:38:12.881387|customers.csv|1404     |
|789420     |Zenathlete Foods   |NULL     |2025-12-16 16:38:12.881387|customers.csv|1404     |
|789421     |Zenathlete Foods   |Hyderabad|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789422     |Zenathlete Foods   |New Delhi|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789520     |Primefuel Nutrition|Bengaluru|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789521     |Primefuel Nutrition|NULL     |2025-12

In [0]:
customer_city_fix={
    #Sprintix Nutrition
    789403: "New Delhi",

    #Zenathlete Foods
    789420: "Bengaluru",

    #Primefuel Nutrition
    789521: "Hyderabad",

    789522: "New Delhi",

    #Recovery Lane
    789603: "Hyderabad",

    789221: "Hyderabad"

}

df_fix=spark.createDataFrame(
    [(k,v) for k,v in customer_city_fix.items()],
    ["customer_id","fixed_city"]
)

display(df_fix)

customer_id,fixed_city
789403,New Delhi
789420,Bengaluru
789521,Hyderabad
789522,New Delhi
789603,Hyderabad
789221,Hyderabad


In [0]:
df_silver=(
    df_silver.join(df_fix,on="customer_id",how="left")
    .withColumn("city",
                F.coalesce(F.col("city"),F.col("Fixed_city"))
                )
    .drop("fixed_city")

)
display(df_silver)

customer_id,customer_name,city,read_timestamp,file_name,file_size
789503,Peak Performance Store,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404
789420,Zenathlete Foods,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404
789703,Staminax Store,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404
789621,Eliteathlete Nutrition,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404
789101,Endurance Foods,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404
789220,Macrobite Superfoods,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404
789720,Gameplan Foods,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404
789601,Recovery Lane,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404
789122,Hydroboost Nutrition,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404
789402,Sprintx Nutrition,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404


In [0]:
null_customer_names=['Sprintx Nutrition','Zenathlete Foods','Primefuel Nutrition','Recovery Lane']
df_silver.filter(F.col("customer_name").isin(null_customer_names)).show(truncate=False)

+-----------+-------------------+---------+--------------------------+-------------+---------+
|customer_id|customer_name      |city     |read_timestamp            |file_name    |file_size|
+-----------+-------------------+---------+--------------------------+-------------+---------+
|789420     |Zenathlete Foods   |Bengaluru|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789601     |Recovery Lane      |Bengaluru|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789402     |Sprintx Nutrition  |Hyderabad|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789603     |Recovery Lane      |Hyderabad|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789401     |Sprintx Nutrition  |Bengaluru|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789421     |Zenathlete Foods   |Hyderabad|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789520     |Primefuel Nutrition|Bengaluru|2025-12-16 16:38:12.881387|customers.csv|1404     |
|789522     |Primefuel Nutrition|New Delhi|2025-12

In [0]:
df_silver=df_silver.withColumn("customer_id",F.col("customer_id").cast("string"))
print(df_silver.printSchema())

root
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- read_timestamp: timestamp (nullable = true)
 |-- file_name: string (nullable = true)
 |-- file_size: long (nullable = true)

None


In [0]:
df_silver=(
           df_silver
#Build final customer column: "customer-name-city" or "customer-name-unknown"
            .withColumn(
                "customer",
                F.concat_ws("_","customer_name",F.coalesce(F.col("city"),F.lit("unknown")))
            )
            # Static attributes alignes with parent data model
            .withColumn("market",F.lit("India"))
            .withColumn("platform",F.lit("Sports Bar"))
            .withColumn("channel",F.lit("Acquisition"))
)
df_silver.display()

customer_id,customer_name,city,read_timestamp,file_name,file_size,customer,market,platform,channel
789503,Peak Performance Store,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404,Peak Performance Store_New Delhi,India,Sports Bar,Acquisition
789420,Zenathlete Foods,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404,Zenathlete Foods_Bengaluru,India,Sports Bar,Acquisition
789703,Staminax Store,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404,Staminax Store_New Delhi,India,Sports Bar,Acquisition
789621,Eliteathlete Nutrition,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404,Eliteathlete Nutrition_Hyderabad,India,Sports Bar,Acquisition
789101,Endurance Foods,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404,Endurance Foods_Bengaluru,India,Sports Bar,Acquisition
789220,Macrobite Superfoods,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404,Macrobite Superfoods_Bengaluru,India,Sports Bar,Acquisition
789720,Gameplan Foods,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404,Gameplan Foods_Bengaluru,India,Sports Bar,Acquisition
789601,Recovery Lane,Bengaluru,2025-12-16T16:38:12.881Z,customers.csv,1404,Recovery Lane_Bengaluru,India,Sports Bar,Acquisition
789122,Hydroboost Nutrition,New Delhi,2025-12-16T16:38:12.881Z,customers.csv,1404,Hydroboost Nutrition_New Delhi,India,Sports Bar,Acquisition
789402,Sprintx Nutrition,Hyderabad,2025-12-16T16:38:12.881Z,customers.csv,1404,Sprintx Nutrition_Hyderabad,India,Sports Bar,Acquisition


In [0]:
df_silver.write\
    .format("delta")\
    .option("delta.enableChangeDataFeed","true") \
    .option("mergeSchema","true") \
    .mode("overwrite") \
    .saveAsTable(f"{catalog}.{silver_schema}.{data_source}")

### Gold Processing

In [0]:
df_silver=spark.sql(f"SELECT * FROM {catalog}.{silver_schema}.{data_source};")

# take req cols only
# customers_id,customer_name,city,read_timestamp,file_name,file_size,customer,market,platform,channel"
df_gold=df_silver.select("customer_id","customer_name","city","customer","market","platform","channel")
df_gold.display()

customer_id,customer_name,city,customer,market,platform,channel
789503,Peak Performance Store,New Delhi,Peak Performance Store_New Delhi,India,Sports Bar,Acquisition
789420,Zenathlete Foods,Bengaluru,Zenathlete Foods_Bengaluru,India,Sports Bar,Acquisition
789703,Staminax Store,New Delhi,Staminax Store_New Delhi,India,Sports Bar,Acquisition
789621,Eliteathlete Nutrition,Hyderabad,Eliteathlete Nutrition_Hyderabad,India,Sports Bar,Acquisition
789101,Endurance Foods,Bengaluru,Endurance Foods_Bengaluru,India,Sports Bar,Acquisition
789220,Macrobite Superfoods,Bengaluru,Macrobite Superfoods_Bengaluru,India,Sports Bar,Acquisition
789720,Gameplan Foods,Bengaluru,Gameplan Foods_Bengaluru,India,Sports Bar,Acquisition
789601,Recovery Lane,Bengaluru,Recovery Lane_Bengaluru,India,Sports Bar,Acquisition
789122,Hydroboost Nutrition,New Delhi,Hydroboost Nutrition_New Delhi,India,Sports Bar,Acquisition
789402,Sprintx Nutrition,Hyderabad,Sprintx Nutrition_Hyderabad,India,Sports Bar,Acquisition


In [0]:
df_gold.write \
    .format("delta") \
    .option("delta.enableChangeDataFeed","true") \
    .mode("overwrite") \
    .saveAsTable(f"{catalog}.{gold_schema}.sb_dim_{data_source}")

In [0]:
from delta.tables import DeltaTable
from pyspark.sql import functions as F

delta_table=DeltaTable.forName(spark,"fmcg.gold.dim_customers")
df_child_customers=spark.table("fmcg.gold.sb_dim_customers").select(
                        F.col("customer_id").alias("customer_code"),
                                "customer",
                                "market",
                                "platform",
                                "channel"

)



In [0]:
delta_table.alias("target").merge(
    source=df_child_customers.alias("source"),
    condition="target.customer_code=source.customer_code"
).whenMatchedUpdateAll().whenNotMatchedInsertAll().execute()

DataFrame[num_affected_rows: bigint, num_updated_rows: bigint, num_deleted_rows: bigint, num_inserted_rows: bigint]