In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, year, month, dayofmonth, to_date, to_timestamp
import datetime

In [3]:
# Obtain the Spark session for this notebook. `getOrCreate()` reuses an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("FacebookAdsDimAdsetSilver")
    .getOrCreate()
)

In [4]:
# Storage settings: the account and container that hold the data, plus the
# Bronze (input) and Silver (output) folders for `dim_adset`.
storage_account_name = "learningstorage1093"
container_name = "learning"

# Both layers sit under the same abfss:// container root, so build it once.
lake_root = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
bronze_path = f"{lake_root}/bronze/facebook_ads/dim_adset/historical/"
silver_path = f"{lake_root}/silver/facebook_ads/dim_adset/"


In [5]:
# Read every JSON file under the Bronze folder into a DataFrame.
# No explicit schema is supplied, so Spark infers one from the data.
df_bronze = spark.read.format("json").load(bronze_path)

In [6]:
# Print a heading followed by the Bronze DataFrame's schema, so the column
# names and types can be checked before the Silver projection selects from them.
print("Data Schema")
df_bronze.printSchema()

In [8]:
# Pattern shared by the three adset time columns: date, literal 'T', time of
# day, then a zone offset (`X`).
# NOTE(review): assumes the Bronze strings actually follow this layout; values
# that do not match are expected to come back as NULL rather than fail - confirm.
adset_time_format = "yyyy-MM-dd'T'HH:mm:ssX"

# Silver projection of `dim_adset`, in output column order.
df_silver = df_bronze.select(
    # Descriptive columns passed through unchanged (`adset_id` is the primary key).
    *[col(name) for name in ("adset_id", "adset_name", "adset_status")],
    # Budget as a 32-bit FLOAT.
    # NOTE(review): float may lose precision on large budget values - confirm
    # this type is what downstream consumers expect.
    col("adset_daily_budget").cast("float").alias("adset_daily_budget"),
    # adset_<stage>_time (string) -> adset_<stage>_timestamp (timestamp).
    *[
        to_timestamp(col(f"adset_{stage}_time"), adset_time_format).alias(f"adset_{stage}_timestamp")
        for stage in ("created", "start", "end")
    ],
    # Load metadata carried over from Bronze.
    col("load_date").cast("timestamp").alias("load_date"),
    col("source"),
)

In [9]:
# Derive the partition column: the calendar year of `adset_created_timestamp`.
# `withColumn` replaces an existing `year` column, so re-running this cell
# does not create a duplicate.
df_silver = df_silver.withColumn("year", year("adset_created_timestamp"))

In [10]:
# Print the Silver DataFrame's schema to check the resulting column names and types.
df_silver.printSchema()

In [12]:
# Print a heading and render the Silver DataFrame for a visual check.
# NOTE(review): `display` is not imported in this notebook - presumably the
# hosting notebook runtime provides it; confirm before running elsewhere.
print("Sample Data:")
display(df_silver)

In [13]:
# Persist the Silver DataFrame as Parquet, one folder per `year` value.
# Mode "overwrite" replaces data already present at `silver_path`.
print("🚀 Saving `dim_adset` to Silver Layer...")
(
    df_silver.write
    .partitionBy("year")
    .mode("overwrite")
    .parquet(silver_path)
)
print(f"✅ Successfully saved `dim_adset` to {silver_path}")