In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import trim,col
import os
from pathlib import Path

In [3]:
# Obtain the notebook's Spark session; getOrCreate() reuses a session that is
# already active in this kernel instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("Read CSV files")
    .getOrCreate()
)

In [None]:
# csv_files = list(dataset_path.glob("*.csv"))
# for file in csv_files:
#     print(file.name)

In [5]:
# Project root. Defaults to the original local checkout; set the
# RETAIL_DW_BASE_PATH environment variable to run the notebook from another
# machine or directory without editing this cell.
base_path = os.environ.get(
    "RETAIL_DW_BASE_PATH",
    r"C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse",
)
dataset_path = os.path.join(base_path, "datasets")  # raw CSV inputs
staging_path = os.path.join(base_path, "warehouse", "staging")  # Parquet outputs

# os.listdir returns entries in arbitrary order, so sort the names to get a
# reproducible processing order (and reproducible log output) across runs.
csv_files = sorted(f for f in os.listdir(dataset_path) if f.endswith(".csv"))
print("Found CSV files:", csv_files)

def trim_string_columns(df):
    """Return ``df`` with surrounding whitespace removed from every string column.

    Columns whose Spark type is not ``string`` are passed through unchanged.
    Column names and column order are preserved.

    Args:
        df: A Spark DataFrame.

    Returns:
        A DataFrame with the same columns as ``df`` in which every ``string``
        column has been wrapped in ``trim``. When ``df`` has no string
        columns it is returned as-is.
    """
    # df.dtypes yields (column_name, simple_type_string) pairs.
    string_cols = {name for name, dtype in df.dtypes if dtype == "string"}
    if not string_cols:
        return df

    def _column(name):
        # Backtick-quote the name so a header containing "." is treated as a
        # single column name rather than as struct-field access.
        return col("`" + name.replace("`", "``") + "`")

    # Build one projection for all columns. Chaining withColumn once per
    # column adds a projection each time and can produce very large plans.
    projection = [
        (trim(_column(name)) if name in string_cols else _column(name)).alias(name)
        for name in df.columns
    ]
    return df.select(projection)

# Stage every discovered CSV as Parquet:
#   read -> normalise column names -> trim strings -> drop duplicates -> write.
# A failure in one file is reported and recorded, and the remaining files are
# still processed; the final summary reflects what actually happened.
failed_files = []

for file_name in csv_files:
    try:
        # Derive output folder name (e.g., olist_orders_dataset.csv → orders)
        table_name = file_name.replace("olist_", "").replace("_dataset", "").replace(".csv", "")
        table_path = os.path.join(staging_path, table_name)

        print(f"Processing {file_name} → {table_name}")

        # Read CSV. quote and escape are both '"' so doubled quotes inside a
        # quoted field are read as literal quotes; multiLine allows quoted
        # fields to span line breaks.
        df = spark.read.csv(
            os.path.join(dataset_path, file_name),
            header=True,
            inferSchema=True,
            quote='"',
            escape='"',
            multiLine=True,
        )

        # Clean column names in a single rename instead of one
        # withColumnRenamed call (and one extra projection) per column.
        df = df.toDF(*[name.strip().lower() for name in df.columns])

        # Trim string columns
        df = trim_string_columns(df)

        # Drop fully identical rows
        df = df.dropDuplicates()

        # Write to Parquet (single file for readability); overwrite makes the
        # cell safe to re-run.
        df.coalesce(1).write.mode("overwrite").parquet(table_path)

        print(f"Written to {table_path}")

    except Exception as e:
        # Best-effort: keep going with the other files, but remember the
        # failure so the summary below does not claim success.
        failed_files.append(file_name)
        print(f"Error processing {file_name}: {e}")

if failed_files:
    print(
        f"\n Staging layer incomplete: {len(failed_files)} of {len(csv_files)} "
        f"file(s) failed: {failed_files}"
    )
elif not csv_files:
    print(f"\n No CSV files found in {dataset_path}; nothing was staged.")
else:
    print("\n Staging layer successfully created!")


Found CSV files: ['olist_order_reviews_dataset.csv']
Processing olist_order_reviews_dataset.csv → order_reviews
Written to C:\Users\vasudha.tanniru\Documents\GitHub\data_projects\retail_data_warehouse\warehouse\staging\order_reviews

 Staging layer successfully created!


In [4]:
# Read the staged order_reviews table back to spot-check the Parquet output.
# The location is derived from staging_path so it always points at the same
# directory the staging step writes to, rather than repeating the absolute path.
staged_df = spark.read.parquet(os.path.join(staging_path, "order_reviews"))
staged_df.show(5)


+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|           review_id|            order_id|review_score|review_comment_title|review_comment_message|review_creation_date|review_answer_timestamp|
+--------------------+--------------------+------------+--------------------+----------------------+--------------------+-----------------------+
|8a310b56e4d05a778...|c5a47daec61dfde19...|           4|           Recomendo|  O pedido chegou a...| 2018-05-08 00:00:00|    2018-05-10 17:19:17|
|f51a5d398f2374cb2...|dcbec508f4fc19506...|           1|                   6|  Eu não tenho notí...| 2018-05-02 00:00:00|    2018-05-02 11:17:26|
|0e2a9cd112e26220a...|668b09e578d6cdac0...|           5|         Excelente!!|  Produto entregue ...| 2018-07-12 00:00:00|    2018-07-13 11:27:33|
|7da6b99eedc285b7a...|df78dad0e4ef0211f...|           5|               ótimo|  O PRODUTO FOI ENT...| 2018-07-05 00:00:00|   