In [0]:
%sql
-- Set-up against the placeholder catalog `catalog_name`: create the catalog,
-- an `ecommerce` schema inside it, and a volume inside that schema.
-- IF NOT EXISTS keeps every statement safe to re-run.
CREATE CATALOG IF NOT EXISTS catalog_name;
CREATE SCHEMA IF NOT EXISTS catalog_name.ecommerce;
-- Volume name spelled in lowercase, matching how it is written in /Volumes/... paths.
CREATE VOLUME IF NOT EXISTS catalog_name.ecommerce.lakehouse_volumes;


In [0]:
%sql
-- Destructive: removes the `catalog_name` catalog if it exists.
-- CASCADE also drops every schema, volume and table under the catalog.
DROP CATALOG IF EXISTS catalog_name CASCADE;

**Create the catalog `main`, the schema `ecommerce`, and the managed volume `lakehouse_volumes`**

In [0]:
%sql
-- Create the catalog, schema and volume that hold every dataset in this notebook.
-- IF NOT EXISTS keeps the cell safe to re-run.
CREATE CATALOG IF NOT EXISTS main;
CREATE SCHEMA IF NOT EXISTS main.ecommerce;
-- Lowercase name so the DDL matches the /Volumes/main/ecommerce/lakehouse_volumes/... paths.
CREATE VOLUME IF NOT EXISTS main.ecommerce.lakehouse_volumes;

**Create the raw, bronze, silver and gold dataset directories under the managed volume**

In [0]:
# Create one directory per data layer (raw, bronze, silver, gold) in the volume.
for layer_dir in ("raw_dataset", "bronze_dataset", "silver_dataset", "gold_dataset"):
    dbutils.fs.mkdirs(f"/Volumes/main/ecommerce/lakehouse_volumes/{layer_dir}")

**Manually upload all CSV files into the `raw_dataset` directory**

### Task 1 – Raw Data Ingestion (Initial Load)
Objective:  Standardize raw data storage and prepare it for downstream processing.
- Read raw CSV files from Raw_DS (the `raw_dataset` directory)
- Convert each dataset into Parquet format
- Store Parquet files inside the bronze folder
- Create one folder per dataset
- Do NOT apply any transformations or business logic

**Reading and writing customer data**

In [0]:
# Load the raw customers CSV, treating its first line as the header row.
df_raw_cust = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/Volumes/main/ecommerce/lakehouse_volumes/raw_dataset/olist_customers_dataset.csv")
)

In [0]:
# Inspect the column names and types Spark assigned to the customers data.
df_raw_cust.printSchema()

In [0]:
# Persist the customers data as Parquet in its own bronze folder,
# replacing whatever a previous run left there.
(
    df_raw_cust.write
    .format("parquet")
    .mode("overwrite")
    .save("/Volumes/main/ecommerce/lakehouse_volumes/bronze_dataset/customer/")
)

In [0]:

# Show how many rows sit in each Spark partition of the customers DataFrame.
from pyspark.sql.functions import spark_partition_id

(
    df_raw_cust
    .withColumn("PartitionId", spark_partition_id())
    .groupBy("PartitionId")
    .count()
    .show()
)

**Reading and writing orders_items data**

In [0]:
# Load the raw order-items CSV, treating its first line as the header row.
df_raw_orders_items = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/Volumes/main/ecommerce/lakehouse_volumes/raw_dataset/olist_order_items_dataset.csv")
)

In [0]:
from pyspark.sql.functions import col, to_timestamp

# Disabled cast of shipping_limit_date to a timestamp. Presumably left off because
# Task 1 forbids transformations at this stage - confirm before re-enabling.
# df_raw_orders_items = df_raw_orders_items.withColumn("shipping_limit_date", to_timestamp(col("shipping_limit_date"), "yyyy-MM-dd HH:mm:ss"))

In [0]:
# Persist the order-items data as Parquet in its own bronze folder,
# replacing whatever a previous run left there.
(
    df_raw_orders_items.write
    .format("parquet")
    .mode("overwrite")
    .save("/Volumes/main/ecommerce/lakehouse_volumes/bronze_dataset/orders_items/")
)

In [0]:
# Inspect the column names and types Spark assigned to the order-items data.
df_raw_orders_items.printSchema()

In [0]:
# Show how many rows sit in each Spark partition of the order-items DataFrame.
from pyspark.sql.functions import spark_partition_id

(
    df_raw_orders_items
    .withColumn("PartitionId", spark_partition_id())
    .groupBy("PartitionId")
    .count()
    .show()
)

**Reading and writing order_payments data**

In [0]:
# Load the raw order-payments CSV, treating its first line as the header row.
df_raw_order_payment = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/Volumes/main/ecommerce/lakehouse_volumes/raw_dataset/olist_order_payments_dataset.csv")
)

In [0]:
# Inspect the column names and types Spark assigned to the order-payments data.
df_raw_order_payment.printSchema()

In [0]:
# Persist the order-payments data as Parquet in its own bronze folder,
# replacing whatever a previous run left there.
(
    df_raw_order_payment.write
    .format("parquet")
    .mode("overwrite")
    .save("/Volumes/main/ecommerce/lakehouse_volumes/bronze_dataset/order_payments/")
)

**Reading and writing orders data**

In [0]:
# Load the raw orders CSV (first line is the header row), then print its schema.
df_raw_orders = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/Volumes/main/ecommerce/lakehouse_volumes/raw_dataset/olist_orders_dataset.csv")
)
df_raw_orders.printSchema()

In [0]:
# Show how many rows sit in each Spark partition of the orders DataFrame.
(
    df_raw_orders
    .withColumn("PartitionID", spark_partition_id())
    .groupBy("PartitionID")
    .count()
    .show()
)

In [0]:
# Persist the orders data as Parquet in its own bronze folder,
# replacing whatever a previous run left there.
(
    df_raw_orders.write
    .format("parquet")
    .mode("overwrite")
    .save("/Volumes/main/ecommerce/lakehouse_volumes/bronze_dataset/orders/")
)

**Reading and writing products data**

In [0]:
# Load the raw products CSV (first line is the header row), then print its schema.
df_raw_products = (
    spark.read
    .format("csv")
    .option("header", True)
    .load("/Volumes/main/ecommerce/lakehouse_volumes/raw_dataset/olist_products_dataset.csv")
)
df_raw_products.printSchema()

In [0]:
# Show how many rows sit in each Spark partition of the products DataFrame.
(
    df_raw_products
    .withColumn("PartitionId", spark_partition_id())
    .groupBy("PartitionId")
    .count()
    .show()
)

In [0]:
# Persist the products data as Parquet in its own bronze folder,
# replacing whatever a previous run left there.
(
    df_raw_products.write
    .format("parquet")
    .mode("overwrite")
    .save("/Volumes/main/ecommerce/lakehouse_volumes/bronze_dataset/products/")
)

In [0]:
# List the contents of the bronze directory to confirm one folder per dataset.
display(dbutils.fs.ls("/Volumes/main/ecommerce/lakehouse_volumes/bronze_dataset/"))

In [0]:
# Number of rows in the customers DataFrame read from the raw CSV.
df_raw_cust.count()