## Initial notebook to clean and transform bronze layer data into a refined silver layer with feature engineering

In [0]:
import os

# Pin the storage bucket for this session (this overwrites any BUCKET_NAME
# already present in the environment), then read it back so later cells can
# build their S3 paths from `bucket_name`.
os.environ.update({'BUCKET_NAME': 'databricks-745bwkyiddeq9fthttjahg-cloud-storage-bucket'})
bucket_name = os.environ.get('BUCKET_NAME')
print(bucket_name)


In [0]:
from pyspark.sql.functions import col, current_timestamp, to_date, when

# Location of the bronze-layer daily activity data, read as a Delta table.
bronze_path = f"s3://{bucket_name}/ohio-prod/3903799048317088/mnt/bronze/daily_activity"
df_bronze = spark.read.load(bronze_path, format="delta")

# Parse the raw ActivityDate values using the "M/d/yyyy" pattern.
activity_date = to_date(col("ActivityDate"), "M/d/yyyy")

# Label each row by step count:
#   >= 10000        -> "Very Active"
#   5000 .. < 10000 -> "Moderately Active"
#   anything else   -> "Lightly Active"
steps = col("TotalSteps")
activity_level = (
    when(steps >= 10000, "Very Active")
    .when((steps >= 5000) & (steps < 10000), "Moderately Active")
    .otherwise("Lightly Active")
)

# Key fields are required: rows with a null in any of these are dropped.
# The null filter runs after the date parse, so it sees the parsed ActivityDate.
required_columns = ["Id", "ActivityDate", "TotalSteps", "Calories"]

# At most one row is kept per (Id, ActivityDate) pair.
dedupe_key = ["Id", "ActivityDate"]

df_silver = (
    df_bronze
    .withColumn("ActivityDate", activity_date)
    .withColumn("activity_level", activity_level)
    .withColumn("ingestion_timestamp", current_timestamp())
    .na.drop(subset=required_columns)
    .dropDuplicates(dedupe_key)
)

# Destination of the cleaned silver-layer table.
silver_path = f"s3://{bucket_name}/ohio-prod/3903799048317088/mnt/silver/daily_activity_clean"

# Overwrite the Delta data at silver_path, allowing the stored schema to be
# replaced by the schema of df_silver.
df_silver.write.save(
    silver_path,
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

print("Silver Delta Table successfully created!")

### CREATE DATABASE & TABLE

In [0]:
%sql
-- Create the fitness_silver schema if it does not exist yet, with its managed
-- storage rooted at the silver mount path.
CREATE SCHEMA IF NOT EXISTS fitness_silver
  MANAGED LOCATION 's3://databricks-745bwkyiddeq9fthttjahg-cloud-storage-bucket/ohio-prod/3903799048317088/mnt/silver/';

In [0]:
%sql
-- Register the Delta data written to the silver path as a table in
-- fitness_silver. The format is stated explicitly (USING DELTA) so the
-- statement does not depend on the runtime's default data source.
-- No column list is given; the schema is expected to come from the Delta
-- data already stored at this location.
CREATE TABLE IF NOT EXISTS fitness_silver.daily_activity_clean
USING DELTA
LOCATION 's3://databricks-745bwkyiddeq9fthttjahg-cloud-storage-bucket/ohio-prod/3903799048317088/mnt/silver/daily_activity_clean/';

In [0]:
%sql
-- Show the columns and the extended table metadata of the registered silver table.
DESCRIBE TABLE EXTENDED fitness_silver.daily_activity_clean;

In [0]:
%sql
-- Preview the cleaned silver table (all columns, no filter).
SELECT
  *
FROM
  fitness_silver.daily_activity_clean;

In [0]:
%sql
-- List the Delta history of the silver table (e.g. to confirm the overwrite
-- performed by the write cell was recorded).
DESCRIBE HISTORY fitness_silver.daily_activity_clean;

In [0]:
# List the objects stored under silver_path and render the listing in the
# notebook, as a quick check that the write produced files there.
silver_files = dbutils.fs.ls(silver_path)
display(silver_files)