In [0]:
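# One-time environment setup, kept commented out. WARNING: if re-enabled, this
# drops the raw/bronze/silver/gold schemas (cascade) before recreating them.
# %sql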

# create catalog if not exists data_modeling;
# use catalog data_modeling;

# drop schema if exists raw cascade;
# drop schema if exists bronze cascade;
# drop schema if exists silver cascade;
# drop schema if exists gold cascade;

# create schema raw;
# create schema bronze;
# create schema silver;
# create schema gold;

# create volume data_modeling.raw.landing_zone;

In [0]:
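# One-time setup, kept commented out: creates the per-entity subdirectories
# inside the landing-zone volume (requires the volume to exist already).
# # Reference the volume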
# landing_zone_path = "/Volumes/data_modeling/raw/landing_zone"

# # Create subdirectories
# dbutils.fs.mkdirs(f"{landing_zone_path}/cust")
# dbutils.fs.mkdirs(f"{landing_zone_path}/product")
# dbutils.fs.mkdirs(f"{landing_zone_path}/loc")
# dbutils.fs.mkdirs(f"{landing_zone_path}/sales")

# # List contents
# dbutils.fs.ls(landing_zone_path)

In [0]:
%sql
-- Make data_modeling the current catalog for this session; presumably this is
-- what lets the two-part table name used later (bronze.raw_sales_data) resolve
-- inside it -- confirm against the workspace's default catalog.
use catalog data_modeling;

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, DoubleType
from datetime import datetime
from pyspark.sql.functions import col, current_timestamp, lit, trim
from pyspark.sql import DataFrame
import re


In [0]:
# Run date formatted as YYYYMMDD; it selects which dated sales file gets ingested.
# `datetime` is already imported in the imports cell, so it is not re-imported here.
# NOTE(review): datetime.now() is the cluster's local time -- confirm it matches the
# time zone used when the landing-zone files are named.
date_var = f"{datetime.now():%Y%m%d}"
print(date_var)

In [0]:
# Load the sales CSV for `date_var` from the landing-zone volume. The first row
# supplies the column names and Spark infers the column types from the data.
# NOTE(review): the commented-out setup cell creates a `sales/` subdirectory in the
# landing zone, but this path reads from the volume root -- confirm where files land.
sales_df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferSchema', 'true')
    .load(f'/Volumes/data_modeling/raw/landing_zone/sales_{date_var}.csv')
)

sales_df.display()

In [0]:
def format_columns_for_delta(df: DataFrame, naming_convention: str = "snake_case") -> DataFrame:
    """Rename every column of ``df`` to a Delta-compliant name in one convention.

    Runs of whitespace/hyphens become a single underscore and the characters
    ``, ; { } ( ) =`` are removed before the casing rule is applied. Only the
    column names change; column order and data are left as they are.

    Args:
        df: DataFrame whose columns should be renamed.
        naming_convention: ``"snake_case"`` (default), ``"camelCase"``,
            ``"PascalCase"`` or ``"lowercase"``. Any other value falls back to
            ``"snake_case"``.

    Returns:
        ``df`` itself when no name needs to change, otherwise a new DataFrame
        with the same columns, in the same order, under the converted names.

    Raises:
        ValueError: If a column name converts to an empty string, or if two
            columns convert to names that differ at most by letter case.
    """

    def strip_invalid(name: str) -> str:
        # Collapse whitespace/hyphen runs to "_" first, then drop the remaining
        # characters that Delta does not accept in column names.
        name = re.sub(r'[\s\-]+', '_', name)
        return re.sub(r'[,;{}\(\)\n\t=]', '', name)

    def to_lowercase(name: str) -> str:
        return strip_invalid(name).lower().strip('_')

    def to_snake_case(name: str) -> str:
        # Put "_" at every lower->UPPER boundary so "unitPrice" -> "unit_price".
        name = re.sub(r'([a-z])([A-Z])', r'\1_\2', strip_invalid(name))
        return name.lower().strip('_')

    def to_camel_case(name: str) -> str:
        head, *tail = to_snake_case(name).split('_')
        return head + ''.join(word.capitalize() for word in tail)

    def to_pascal_case(name: str) -> str:
        return ''.join(word.capitalize() for word in to_snake_case(name).split('_'))

    converters = {
        "snake_case": to_snake_case,
        "camelCase": to_camel_case,
        "PascalCase": to_pascal_case,
        "lowercase": to_lowercase,
    }
    # Unknown conventions keep the historical fallback to snake_case.
    converter = converters.get(naming_convention, to_snake_case)

    old_names = list(df.columns)
    new_names = [converter(name) for name in old_names]

    # A name made up solely of stripped characters would become "", which is
    # not a usable column name; fail here rather than at the table write.
    emptied = [old for old, new in zip(old_names, new_names) if not new]
    if emptied:
        raise ValueError(
            f"Column name(s) {emptied!r} are empty after conversion to {naming_convention!r}"
        )

    # Delta stores schemas case-insensitively, so names that differ only by
    # case clash just like exact duplicates do.
    sources_by_target = {}
    for old, new in zip(old_names, new_names):
        sources_by_target.setdefault(new.lower(), []).append(old)
    clashes = [olds for olds in sources_by_target.values() if len(olds) > 1]
    if clashes:
        raise ValueError(
            f"Columns collide after conversion to {naming_convention!r}: {clashes!r}"
        )

    if new_names == old_names:
        return df

    # One positional rename instead of a withColumnRenamed call per column:
    # renaming by position cannot accidentally touch a different column that
    # happens to carry the same name mid-way through the loop.
    return df.toDF(*new_names)

# Normalise the CSV headers to PascalCase before the frame is written out,
# then show the result so the new column names can be checked by eye.
sales_df = format_columns_for_delta(df=sales_df, naming_convention="PascalCase")

sales_df.display()

In [0]:
# Persist the renamed sales frame as the Delta table bronze.raw_sales_data.
# Every run replaces the table contents, and overwriteSchema lets the stored
# schema be replaced along with the data.
(
    sales_df.write
    .format('delta')
    .mode('overwrite')
    .option('overwriteSchema', 'true')
    .saveAsTable('bronze.raw_sales_data')
)