In [0]:
# Storage account, container and DBFS mount point for the staged CSV extract.
storage_account_name = "retailanalyticsstaging"
container_name = "staging-data"
mount_point = "/mnt/staging"

# Fetch the account key from a Databricks secret scope rather than embedding it
# in the notebook, so the credential never ends up in source control or output.
# NOTE(review): the scope and key names are placeholders — create them, or
# substitute the names used in your workspace, before running this cell.
storage_account_key = dbutils.secrets.get(
    scope="retail-analytics",
    key="retailanalyticsstaging-account-key",
)

# dbutils.fs.mount fails when the mount point is already in use, which breaks
# "Run All" on a cluster where this cell ran before. Mount only when absent.
if not any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source=f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/",
        mount_point=mount_point,
        extra_configs={
            f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_account_key
        },
    )

# List the mounted directory to verify the mount is usable.
display(dbutils.fs.ls(mount_point))

path,name,size,modificationTime
dbfs:/mnt/staging/part-merged.csv,part-merged.csv,10574531,1719657496000


In [0]:
# Load the staged CSV from the mount using the CSV reader's default options.
df = spark.read.format("csv").load("/mnt/staging/part-merged.csv")

In [None]:
# Render the raw DataFrame in the notebook's table viewer.
display(df)

In [0]:
%sql
-- Create the target database in the Hive metastore catalog; no-op if it already exists.
CREATE DATABASE IF NOT EXISTS
    olistdatabase;

In [0]:
%sql
-- CSV-backed table holding the typed order records.
-- Timestamps are stored as text in 'dd-MM-yyyy HH:mm' form; files carry no header row.
CREATE TABLE IF NOT EXISTS olistdatabase.olisttable (
    -- order identity and status
    id                       STRING,
    order_status             STRING,
    -- monetary values and quantity
    order_product_value      DOUBLE,
    order_freight_charge     DOUBLE,
    order_product_qty        INT,
    -- customer location
    customer_city            STRING,
    customer_state           STRING,
    customer_zipcode_prefix  INT,
    -- product listing attributes
    product_name_len         INT,
    product_description_len  INT,
    product_photos_qty       INT,
    product_review_score     INT,
    -- order lifecycle timestamps
    order_placed_at          TIMESTAMP,
    order_approved_at        TIMESTAMP,
    order_delivered_at       TIMESTAMP
)
USING CSV
OPTIONS (
    header = 'false',
    timestampFormat = 'dd-MM-yyyy HH:mm'
);

In [0]:
%sql
-- Load the staged CSV into olistdatabase.olisttable, replacing any existing rows.
-- The file is read positionally (_c0 .. _c14) and each column is cast to the
-- type declared for the matching table column.
INSERT OVERWRITE olistdatabase.olisttable
SELECT
    _c0 AS id,
    _c1 AS order_status,
    CAST(_c2 AS DOUBLE) AS order_product_value,
    CAST(_c3 AS DOUBLE) AS order_freight_charge,
    -- The table declares order_product_qty as INT. Parse via DOUBLE so values
    -- written with a decimal point still convert, then narrow explicitly to INT
    -- instead of relying on an implicit conversion at insert time.
    CAST(CAST(_c4 AS DOUBLE) AS INT) AS order_product_qty,
    _c5 AS customer_city,
    _c6 AS customer_state,
    CAST(_c7 AS INT) AS customer_zipcode_prefix,
    CAST(_c8 AS INT) AS product_name_len,
    CAST(_c9 AS INT) AS product_description_len,
    CAST(_c10 AS INT) AS product_photos_qty,
    CAST(_c11 AS INT) AS product_review_score,
    -- Timestamps arrive as 'dd-MM-yyyy HH:mm' strings.
    to_timestamp(_c12, 'dd-MM-yyyy HH:mm') AS order_placed_at,
    to_timestamp(_c13, 'dd-MM-yyyy HH:mm') AS order_approved_at,
    to_timestamp(_c14, 'dd-MM-yyyy HH:mm') AS order_delivered_at
FROM csv.`/mnt/staging/part-merged.csv`;

In [0]:
%sql
-- Preview five rows of the loaded table.
SELECT *
FROM olistdatabase.olisttable
LIMIT 5

id,order_status,order_product_value,order_freight_charge,order_product_qty,customer_city,customer_state,customer_zipcode_prefix,product_name_len,product_description_len,product_photos_qty,product_review_score,order_placed_at,order_approved_at,order_delivered_at
1,delivered,79.0,17.8,1,Luziania,GO,728,50,201,2,5,2017-10-02T10:56:00Z,2017-10-02T11:07:00Z,2017-10-10T21:25:00Z
2,delivered,119.9,27.16,1,Joinville,SC,892,50,511,3,5,2018-07-24T20:41:00Z,2018-07-26T03:24:00Z,2018-08-07T15:27:00Z
3,delivered,519.99,41.69,1,Serra,ES,291,48,1156,2,1,2018-08-08T08:38:00Z,2018-08-08T08:55:00Z,2018-08-17T18:06:00Z
4,delivered,29.5,17.92,1,RIO DE JANEIRO,RJ,222,21,207,2,4,2017-11-18T19:28:00Z,2017-11-18T19:45:00Z,2017-12-02T00:28:00Z
5,delivered,26.77,23.11,1,Sao Paulo,SP,40,41,451,1,5,2018-02-13T21:18:00Z,2018-02-13T22:20:00Z,2018-02-16T18:17:00Z


In [None]:
## Step15: Load the Hive table into a Spark DataFrame, then show its rows and schema

df = spark.read.table("olistdatabase.olisttable")
display(df)
df.printSchema()

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from functools import reduce

In [None]:
### Step16: Derive the per-order metrics used by the daily aggregations

df = (
    df
    # Sales amount of the order line: unit value times quantity.
    .withColumn("order_sales", F.col("order_product_value") * F.col("order_product_qty"))
    # Elapsed time from placement to approval and from placement to delivery.
    .withColumn("order_approval_time", F.col("order_approved_at") - F.col("order_placed_at"))
    .withColumn("order_delivery_time", F.col("order_delivered_at") - F.col("order_placed_at"))
    # Calendar date of placement, and that date truncated to its week.
    .withColumn("order_placed_date", F.to_date(F.col("order_placed_at")))
    .withColumn("order_placed_weekdate", F.trunc(F.col("order_placed_at"), "week"))
)
display(df)

In [0]:
# Step17: Build the three groupings (per day, per city per day, per state per day)
# that every daily insight in the following steps aggregates over.

grp_date = df.groupBy("order_placed_date")
grp_city_date = df.groupBy("customer_city", "order_placed_date")
grp_state_date = df.groupBy("customer_state", "order_placed_date")

In [0]:
## Step18: Total sales (sum of order_sales), sorted ascending by the grouping keys

# Per day
grp_date_total_sales = (
    grp_date
    .agg(F.sum("order_sales").alias("total_sales"))
    .orderBy("order_placed_date")
)

# Per city per day
grp_city_date_total_sales = (
    grp_city_date
    .agg(F.sum("order_sales").alias("total_sales"))
    .orderBy("customer_city", "order_placed_date")
)

# Per state per day
grp_state_date_total_sales = (
    grp_state_date
    .agg(F.sum("order_sales").alias("total_sales"))
    .orderBy("customer_state", "order_placed_date")
)

In [0]:
## Step19: Total freight charge (sum of order_freight_charge)
#   i.   per day
#   ii.  per city per day
#   iii. per state per day
# Each result is sorted ascending by its grouping keys.

# Per day
grp_date_total_freight_charge = (
    grp_date
    .agg(F.sum("order_freight_charge").alias("total_freight_charge"))
    .orderBy("order_placed_date")
)

# Per city per day
grp_city_date_total_freight_charge = (
    grp_city_date
    .agg(F.sum("order_freight_charge").alias("total_freight_charge"))
    .orderBy("customer_city", "order_placed_date")
)

# Per state per day
grp_state_date_total_freight_charge = (
    grp_state_date
    .agg(F.sum("order_freight_charge").alias("total_freight_charge"))
    .orderBy("customer_state", "order_placed_date")
)

In [0]:
## Step20: Total order count (count of non-null id values)
#   i.   per day
#   ii.  per city per day
#   iii. per state per day
# Each result is sorted ascending by its grouping keys.

# Per day
grp_date_total_order_count = (
    grp_date
    .agg(F.count("id").alias("total_order_count"))
    .orderBy("order_placed_date")
)

# Per city per day
grp_city_date_total_order_count = (
    grp_city_date
    .agg(F.count("id").alias("total_order_count"))
    .orderBy("customer_city", "order_placed_date")
)

# Per state per day
grp_state_date_total_order_count = (
    grp_state_date
    .agg(F.count("id").alias("total_order_count"))
    .orderBy("customer_state", "order_placed_date")
)

In [0]:
# Step21: Average freight charge (mean of order_freight_charge),
# sorted ascending by the grouping keys.

# Per day
grp_date_avg_freight_charge = (
    grp_date
    .agg(F.avg("order_freight_charge").alias("avg_freight_charge"))
    .orderBy("order_placed_date")
)

# Per city per day
# NOTE(review): the variable is spelled "_vg_" (not "_avg_"); the name is kept
# as-is because the insights join cell refers to it by this spelling.
grp_city_date_vg_freight_charge = (
    grp_city_date
    .agg(F.avg("order_freight_charge").alias("avg_freight_charge"))
    .orderBy("customer_city", "order_placed_date")
)

# Per state per day (same "_vg_" spelling, kept for the same reason)
grp_state_date_vg_freight_charge = (
    grp_state_date
    .agg(F.avg("order_freight_charge").alias("avg_freight_charge"))
    .orderBy("customer_state", "order_placed_date")
)



In [0]:
# Step22: Average review score (mean of product_review_score),
# sorted ascending by the grouping keys.

# Per day
grp_date_avg_review_score = (
    grp_date
    .agg(F.avg("product_review_score").alias("avg_review_score"))
    .orderBy("order_placed_date")
)

# Per city per day
grp_city_date_avg_review_score = (
    grp_city_date
    .agg(F.avg("product_review_score").alias("avg_review_score"))
    .orderBy("customer_city", "order_placed_date")
)

# Per state per day
grp_state_date_avg_review_score = (
    grp_state_date
    .agg(F.avg("product_review_score").alias("avg_review_score"))
    .orderBy("customer_state", "order_placed_date")
)

In [0]:
# Step23: Average approval time (mean of order_approval_time),
# sorted ascending by the grouping keys.

# Per day
grp_date_avg_approval_time = (
    grp_date
    .agg(F.avg("order_approval_time").alias("avg_approval_time"))
    .orderBy("order_placed_date")
)

# Per city per day
grp_city_date_avg_approval_time = (
    grp_city_date
    .agg(F.avg("order_approval_time").alias("avg_approval_time"))
    .orderBy("customer_city", "order_placed_date")
)

# Per state per day
grp_state_date_avg_approval_time = (
    grp_state_date
    .agg(F.avg("order_approval_time").alias("avg_approval_time"))
    .orderBy("customer_state", "order_placed_date")
)

In [0]:
# Step24: Average delivery time (mean of order_delivery_time),
# sorted ascending by the grouping keys.

# Per day
grp_date_avg_delivery_time = (
    grp_date
    .agg(F.avg("order_delivery_time").alias("avg_delivery_time"))
    .orderBy("order_placed_date")
)

# Per city per day
grp_city_date_avg_delivery_time = (
    grp_city_date
    .agg(F.avg("order_delivery_time").alias("avg_delivery_time"))
    .orderBy("customer_city", "order_placed_date")
)

# Per state per day
grp_state_date_avg_delivery_time = (
    grp_state_date
    .agg(F.avg("order_delivery_time").alias("avg_delivery_time"))
    .orderBy("customer_state", "order_placed_date")
)

In [0]:

## ---------------------- INSIGHTS TO BE STORED ------------------
# Combine the historical daily metrics into three wide tables:
#   i.   insights per day
#   ii.  insights per day per city
#   iii. insights per day per state
#
# In each case the total-sales frame is the starting (left-most) table and the
# remaining metric frames are left-joined onto it, in order, on the grouping keys.

## Per day
grp_date_insights = reduce(
    lambda joined, metric: joined.join(metric, on=["order_placed_date"], how="left"),
    [
        grp_date_total_freight_charge,
        grp_date_total_order_count,
        grp_date_avg_freight_charge,
        grp_date_avg_review_score,
        grp_date_avg_approval_time,
        grp_date_avg_delivery_time,
    ],
    grp_date_total_sales,
)

## Per city per day
grp_city_date_insights = reduce(
    lambda joined, metric: joined.join(
        metric, on=["customer_city", "order_placed_date"], how="left"
    ),
    [
        grp_city_date_total_freight_charge,
        grp_city_date_total_order_count,
        grp_city_date_vg_freight_charge,
        grp_city_date_avg_review_score,
        grp_city_date_avg_approval_time,
        grp_city_date_avg_delivery_time,
    ],
    grp_city_date_total_sales,
)

## Per state per day
grp_state_date_insights = reduce(
    lambda joined, metric: joined.join(
        metric, on=["customer_state", "order_placed_date"], how="left"
    ),
    [
        grp_state_date_total_freight_charge,
        grp_state_date_total_order_count,
        grp_state_date_vg_freight_charge,
        grp_state_date_avg_review_score,
        grp_state_date_avg_approval_time,
        grp_state_date_avg_delivery_time,
    ],
    grp_state_date_total_sales,
)

In [0]:
# Step27: Persist the three insight tables as CSV (with a header row) in DBFS,
# the Databricks File System associated with the workspace. Existing output at
# each path is overwritten.
# NOTE(review): the per-day table goes to a user-specific shared_uploads folder
# while the other two go to /FileStore/tables — confirm this is intentional.

(
    grp_date_insights.write
    .mode("overwrite")
    .option("header", True)
    .csv("dbfs:/FileStore/shared_uploads/odl_user_1393560@simplilearnss.onmicrosoft.com/grpdateinsights")
)
(
    grp_city_date_insights.write
    .mode("overwrite")
    .option("header", True)
    .csv("dbfs:/FileStore/tables/grpcitydateinsights")
)
(
    grp_state_date_insights.write
    .mode("overwrite")
    .option("header", True)
    .csv("dbfs:/FileStore/tables/grpstatedateinsights")
)

In [0]:
# Step 28: ADLS object storage
# Write the three insight DataFrames as CSV (with a header row) to the storage
# account over the abfss protocol, overwriting any existing output.

# Authenticate with the storage account access key. The key is read from a
# Databricks secret scope instead of being embedded in the notebook, so the
# credential never ends up in source control or cell output.
# NOTE(review): the scope and key names are placeholders — create them, or
# substitute the names used in your workspace, before running this cell.
spark.conf.set(
    "fs.azure.account.key.retailanalyticsstaging.dfs.core.windows.net",
    dbutils.secrets.get(
        scope="retail-analytics",
        key="retailanalyticsstaging-account-key",
    ),
)

# Insights per day
grp_date_insights.write.csv(
    "abfss://staging-data@retailanalyticsstaging.dfs.core.windows.net/insights/grpdateinsights.csv",
    mode="overwrite",
    header=True,
)

# Insights per day per city
grp_city_date_insights.write.csv(
    "abfss://staging-data@retailanalyticsstaging.dfs.core.windows.net/insights/grpcitydateinsights.csv",
    mode="overwrite",
    header=True,
)

# Insights per day per state
grp_state_date_insights.write.csv(
    "abfss://staging-data@retailanalyticsstaging.dfs.core.windows.net/insights/grpstatedateinsights.csv",
    mode="overwrite",
    header=True,
)