In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim,col
import os
from pathlib import Path

In [2]:
# Start the Spark session used by the cells below, or reuse the one that is
# already running in this kernel.
spark = (
    SparkSession.builder
    .appName("Read CSV files")
    .getOrCreate()
)

In [14]:
# Build the dataset directory as a Path inside this cell. `dataset_path` is
# otherwise only assigned by a later cell (as a plain str, which has no
# .glob()), so on a fresh kernel this cell would fail before listing anything.
dataset_path = Path(
    r"C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse"
) / "datasets"

# List every CSV directly inside the dataset directory and show its file name.
csv_files = list(dataset_path.glob("*.csv"))
for file in csv_files:
    print(file.name)

olist_customers_dataset.csv
olist_geolocation_dataset.csv
olist_orders_dataset.csv
olist_order_items_dataset.csv
olist_order_payments_dataset.csv
olist_order_reviews_dataset.csv
olist_products_dataset.csv
olist_sellers_dataset.csv
product_category_name_translation.csv


In [18]:
# Project root. Defaults to the original author's checkout; set the
# RETAIL_DW_BASE_PATH environment variable to run the notebook against a
# different location without editing this cell.
base_path = os.environ.get(
    "RETAIL_DW_BASE_PATH",
    r"C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse",
)
dataset_path = os.path.join(base_path, "datasets")  # directory holding the input CSVs
staging_path = os.path.join(base_path, "warehouse", "staging")  # root for Parquet output


# Every entry of dataset_path whose name ends in ".csv" (bare file names, not
# full paths). os.listdir raises if the directory does not exist.
csv_files = [f for f in os.listdir(dataset_path) if f.endswith(".csv")]
print("Found CSV files:", csv_files)

def trim_string_columns(df):
    """Return ``df`` with surrounding whitespace trimmed from every string column.

    Columns whose schema type is not ``string`` are passed through untouched.
    Column names and column order are preserved.

    Args:
        df: Spark DataFrame to clean.

    Returns:
        A new DataFrame built from a single projection over ``df``.
    """
    projected = []
    for field in df.schema.fields:
        # Backtick-quote the name so a header containing dots or spaces is
        # resolved as one column instead of being parsed as a nested-field path.
        quoted_name = "`" + field.name.replace("`", "``") + "`"
        if field.dataType.simpleString() == "string":
            projected.append(trim(col(quoted_name)).alias(field.name))
        else:
            projected.append(col(quoted_name))
    # One select instead of a withColumn call per column: every withColumn adds
    # another projection, so the plan grows with the number of string columns.
    return df.select(*projected)

# Loop through all CSVs: read each one, normalise it, and write it to the
# staging area as Parquet. Processing is best-effort per file, so failures are
# collected here and reported in the final summary instead of being hidden.
failed_files = []
for file_name in csv_files:
    try:
        # Derive output folder name (e.g., olist_orders_dataset.csv → orders)
        table_name = file_name.replace("olist_", "").replace("_dataset", "").replace(".csv", "")
        table_path = os.path.join(staging_path, table_name)

        print(f"Processing {file_name} → {table_name}")

        # Read CSV (first row is the header; column types are inferred)
        df = spark.read.csv(
            os.path.join(dataset_path, file_name),
            header=True,
            inferSchema=True
        )

        # Clean column names in a single pass; toDF renames positionally, which
        # avoids stacking one withColumnRenamed projection per column.
        df = df.toDF(*[name.strip().lower() for name in df.columns])

        # Trim string columns
        df = trim_string_columns(df)

        # Drop duplicates
        df = df.dropDuplicates()

        # Write to Parquet (single file for readability)
        df.coalesce(1).write.mode("overwrite").parquet(table_path)

        print(f"Written to {table_path}")

    except Exception as e:
        # Keep going with the remaining files, but remember this one failed.
        failed_files.append(file_name)
        print(f"Error processing {file_name}: {e}")

# Only claim success when every file was actually written.
if failed_files:
    print(f"\n Staging layer incomplete: {len(failed_files)} of {len(csv_files)} file(s) failed: {failed_files}")
else:
    print("\n Staging layer successfully created!")


📂 Found CSV files: ['olist_customers_dataset.csv', 'olist_geolocation_dataset.csv', 'olist_orders_dataset.csv', 'olist_order_items_dataset.csv', 'olist_order_payments_dataset.csv', 'olist_order_reviews_dataset.csv', 'olist_products_dataset.csv', 'olist_sellers_dataset.csv', 'product_category_name_translation.csv']
▶️ Processing olist_customers_dataset.csv → customers
✅ Written to C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse\staging\customers
▶️ Processing olist_geolocation_dataset.csv → geolocation
✅ Written to C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse\staging\geolocation
▶️ Processing olist_orders_dataset.csv → orders
✅ Written to C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse\staging\orders
▶️ Processing olist_order_items_dataset.csv → order_items
✅ Written to C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail