# 1. Import necess lib and create Spark Session

In [0]:
# pyspark
import pyspark

# SparkSession
from pyspark.sql import SparkSession

# function
import pyspark.sql.functions as F
from pyspark.sql.types import *

In [0]:
# Create SparkSession
spark = SparkSession.builder.master("local[*]") \
                            .appName("Walmart_DW") \
                            .getOrCreate()

# Check spark version
print("Spark version: ", spark.version)

Spark version:  3.5.0


## 2. Extract dataset and Transform

In [0]:
# Define file_path, type
file_path = "dbfs:/FileStore/tables/WalmartSalesData.csv"
file_type = "csv"

# Read dataset into dataframe
walmartsales_df = spark.read.format(file_type) \
                            .option("header", True) \
                            .option("inferSchema", True) \
                            .load(file_path)

# Display dataframe
display(walmartsales_df)

Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,2019-01-05,2024-03-17T13:08:00Z,Ewallet,522.83,4.761904762,26.1415,9.1
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,2019-03-08,2024-03-17T10:29:00Z,Cash,76.4,4.761904762,3.82,9.6
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,2019-03-03,2024-03-17T13:23:00Z,Credit card,324.31,4.761904762,16.2155,7.4
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,2019-01-27,2024-03-17T20:33:00Z,Ewallet,465.76,4.761904762,23.288,8.4
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2019-02-08,2024-03-17T10:37:00Z,Ewallet,604.17,4.761904762,30.2085,5.3
699-14-3026,C,Naypyitaw,Normal,Male,Electronic accessories,85.39,7,29.8865,627.6165,2019-03-25,2024-03-17T18:30:00Z,Ewallet,597.73,4.761904762,29.8865,4.1
355-53-5943,A,Yangon,Member,Female,Electronic accessories,68.84,6,20.652,433.692,2019-02-25,2024-03-17T14:36:00Z,Ewallet,413.04,4.761904762,20.652,5.8
315-22-5665,C,Naypyitaw,Normal,Female,Home and lifestyle,73.56,10,36.78,772.38,2019-02-24,2024-03-17T11:38:00Z,Ewallet,735.6,4.761904762,36.78,8.0
665-32-9167,A,Yangon,Member,Female,Health and beauty,36.26,2,3.626,76.146,2019-01-10,2024-03-17T17:15:00Z,Credit card,72.52,4.761904762,3.626,7.2
692-92-5582,B,Mandalay,Member,Female,Food and beverages,54.84,3,8.226,172.746,2019-02-20,2024-03-17T13:27:00Z,Credit card,164.52,4.761904762,8.226,5.9


### 2.1 Rename column
**Column renamed: Customer type, Product line, Unit price, Tax 5%,gross margin percentage, gross income**

In [0]:
# Method 1
walmartsales_df = walmartsales_df.withColumnRenamed("Invoice ID", "Invoice_ID")
walmartsales_df = walmartsales_df.withColumnRenamed("Customer type", "Customer_type")
walmartsales_df = walmartsales_df.withColumnRenamed("Product line", "Product_line")
walmartsales_df = walmartsales_df.withColumnRenamed("Unit price", "Unit_price")
walmartsales_df = walmartsales_df.withColumnRenamed("Tax 5%", "Tax_5%")
walmartsales_df = walmartsales_df.withColumnRenamed("gross margin percentage", "gross_margin_percentage")
walmartsales_df = walmartsales_df.withColumnRenamed("cogs", "Cogs")
walmartsales_df = walmartsales_df.withColumnRenamed("gross income", "gross_income")

In [0]:
# Method 2
walmartsales_df = walmartsales_df.withColumnRenamed("Invoice ID", "Invoice_ID") \
                                 .withColumnRenamed("Customer type", "Customer_type") \
                                 .withColumnRenamed("Product line", "Product_line") \
                                 .withColumnRenamed("Unit price", "Unit_price") \
                                 .withColumnRenamed("Tax 5%", "Tax_5%") \
                                 .withColumnRenamed("gross margin percentage", "gross_margin_percentage") \
                                 .withColumnRenamed("cogs", "Cogs") \
                                 .withColumnRenamed("gross income", "gross_income")

### 2.2. Create new column

In [0]:
# Method 1
walmartsales_df = walmartsales_df.withColumn("Day", F.day(F.col("Date"))) \
                                 .withColumn("Month", F.month(F.col("Date"))) \
                                 .withColumn("Year", F.year(F.col("Date")))

# Display df
display(walmartsales_df)

Invoice_ID,Branch,City,Customer_type,Gender,Product_line,Unit_price,Quantity,Tax_5%,Total,Date,Time,Payment,Cogs,gross_margin_percentage,gross_income,Rating,Day,Month,Year
750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,2019-01-05,2024-03-17T13:08:00Z,Ewallet,522.83,4.761904762,26.1415,9.1,5,1,2019
226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,2019-03-08,2024-03-17T10:29:00Z,Cash,76.4,4.761904762,3.82,9.6,8,3,2019
631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,2019-03-03,2024-03-17T13:23:00Z,Credit card,324.31,4.761904762,16.2155,7.4,3,3,2019
123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,2019-01-27,2024-03-17T20:33:00Z,Ewallet,465.76,4.761904762,23.288,8.4,27,1,2019
373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2019-02-08,2024-03-17T10:37:00Z,Ewallet,604.17,4.761904762,30.2085,5.3,8,2,2019
699-14-3026,C,Naypyitaw,Normal,Male,Electronic accessories,85.39,7,29.8865,627.6165,2019-03-25,2024-03-17T18:30:00Z,Ewallet,597.73,4.761904762,29.8865,4.1,25,3,2019
355-53-5943,A,Yangon,Member,Female,Electronic accessories,68.84,6,20.652,433.692,2019-02-25,2024-03-17T14:36:00Z,Ewallet,413.04,4.761904762,20.652,5.8,25,2,2019
315-22-5665,C,Naypyitaw,Normal,Female,Home and lifestyle,73.56,10,36.78,772.38,2019-02-24,2024-03-17T11:38:00Z,Ewallet,735.6,4.761904762,36.78,8.0,24,2,2019
665-32-9167,A,Yangon,Member,Female,Health and beauty,36.26,2,3.626,76.146,2019-01-10,2024-03-17T17:15:00Z,Credit card,72.52,4.761904762,3.626,7.2,10,1,2019
692-92-5582,B,Mandalay,Member,Female,Food and beverages,54.84,3,8.226,172.746,2019-02-20,2024-03-17T13:27:00Z,Credit card,164.52,4.761904762,8.226,5.9,20,2,2019


## 3. Create Data Warehouse

### 3.1. Create Dim_Customer

In [0]:
# Choose column
col_customer = ["City", "Customer_type", "Gender"]

# Select from base dataframe
dim_customer_df = walmartsales_df.select(col_customer).distinct()

# Create column ID 
dim_customer_df = dim_customer_df.withColumn("Customer_ID", F.monotonically_increasing_id() + 1) \
                                 .select(["Customer_ID", "Customer_type", "Gender", "City"])

# Display df
display(dim_customer_df)

Customer_ID,Customer_type,Gender,City
1,Member,Female,Mandalay
2,Normal,Female,Naypyitaw
3,Normal,Female,Yangon
4,Member,Male,Mandalay
5,Normal,Male,Naypyitaw
6,Normal,Male,Mandalay
7,Normal,Female,Mandalay
8,Member,Male,Naypyitaw
9,Normal,Male,Yangon
10,Member,Female,Yangon


### 3.2. Create Dim_Product

In [0]:
# Choose column
col_product = ["Product_line"]

# Select from OG dataframe
dim_product_df = walmartsales_df.select(col_product).distinct()

# Create new column ID
dim_product_df = dim_product_df.withColumn("Product_ID", F.monotonically_increasing_id() + 1) \
                               .select(["Product_ID", "Product_line"])

# Display df
display(dim_product_df)

Product_ID,Product_line
1,Home and lifestyle
2,Fashion accessories
3,Health and beauty
4,Electronic accessories
5,Food and beverages
6,Sports and travel


### 3.3. Create Dim_DateTime

In [0]:
# Choose column for dim_time
col_datetime = ["Date", "Time", "Day", "Month", "Year"]

# Select from OG df
dim_datetime_df = walmartsales_df.select(col_datetime).distinct()

# Create new column ID
dim_datetime_df = dim_datetime_df.withColumn("DateTime_ID", F.monotonically_increasing_id() + 1) \
                             .select(["DateTime_ID", "Date", "Day", "Month", "Year", "Time"])

# Display df
display(dim_datetime_df)

DateTime_ID,Date,Day,Month,Year,Time
1,2019-02-14,14,2,2019,2024-03-17T14:35:00Z
2,2019-03-28,28,3,2019,2024-03-17T16:44:00Z
3,2019-01-30,30,1,2019,2024-03-17T20:23:00Z
4,2019-01-20,20,1,2019,2024-03-17T15:55:00Z
5,2019-02-02,2,2,2019,2024-03-17T18:50:00Z
6,2019-01-25,25,1,2019,2024-03-17T10:11:00Z
7,2019-01-23,23,1,2019,2024-03-17T11:22:00Z
8,2019-03-04,4,3,2019,2024-03-17T12:44:00Z
9,2019-02-08,8,2,2019,2024-03-17T11:39:00Z
10,2019-03-12,12,3,2019,2024-03-17T20:36:00Z


### 3.4. Create Fact tables

In [0]:
walmartsales_df.count()

1000

In [0]:
fact_walmartsales = walmartsales_df.join(dim_customer_df, on = col_customer, how = "inner") \
                                   .join(dim_product_df, on = col_product, how = "inner") \
                                   .join(dim_datetime_df, on = col_datetime, how = "inner") \
                                   .select(["Invoice_ID", "Customer_ID", "Product_ID", "DateTime_ID", 
                                            "Branch", "Unit_price", "Quantity", "Tax_5%", "Total",
                                            "Payment", "Cogs", "gross_margin_percentage", "gross_income", 
                                            "Rating"])

In [0]:
fact_walmartsales.count()

1000

### 3.5 Create Database and Tables

In [0]:
%sql
CREATE DATABASE WalmartSales_DW;
USE WalmartSales_DW;

In [0]:
dim_customer_df.write.mode("overwrite").saveAsTable("WalmartSales_DW.dim_customer")
dim_product_df.write.mode("overwrite").saveAsTable("WalmartSales_DW.dim_product")
dim_datetime_df.write.mode("overwrite").saveAsTable("WalmartSales_DW.dim_datetime")
fact_walmartsales.write.mode("overwrite").saveAsTable("WalmartSales_DW.fact_walmartsales")