In [0]:
# Load the sales-orders table, then preview two rows and print its column schema.
table_name = "databricks_simulated_retail_customer_data.v01.sales_orders"
orders_df = spark.table(table_name)

display(orders_df.limit(2))
orders_df.printSchema()

In [0]:
from pyspark.sql.functions import col, explode, from_json, split

# `ordered_products` is a JSON array serialized as a string. Splitting it on
# commas yields JSON fragments (e.g. '{"curr":"USD"') instead of product names,
# so parse the JSON and explode the array: one output row per ordered product.
# Only the `name` field is needed here, so the schema lists just that field.
product_name_schema = "array<struct<name:string>>"

# Keep every original order column and add `product_name`, as before.
# The generator must sit at the top level of its own column expression,
# hence the temporary `_product` struct column that is dropped afterwards.
exploded_df = (
    orders_df
    .withColumn("_product", explode(from_json(col("ordered_products"), product_name_schema)))
    .withColumn("product_name", col("_product.name"))
    .drop("_product")
)

# Top 10 products by number of order lines they appear in (not by quantity).
popular_products = (
    exploded_df
    .groupBy("product_name")
    .count()
    .orderBy(col("count").desc())
    .limit(10)
)

display(popular_products)

In [0]:
from pyspark.sql.functions import from_json, schema_of_json, explode, col

# Step 1: DDL schema describing the JSON array stored in `ordered_products`
# (hard-coded here, not inferred from sample data).
json_schema = "array<struct<curr:string,id:string,name:string,price:string,promotion_info:string,qty:string,unit:string>>"

# Steps 2-3: parse the JSON string into an array of structs (from_json) and
# explode it so that every ordered product becomes its own row.
exploded_products = orders_df.select(
    explode(from_json(col("ordered_products"), json_schema)).alias("product")
)

# Step 4: keep only the product name and the quantity, cast to an integer.
product_sales_df = exploded_products.select(
    col("product.name").alias("product_name"),
    col("product.qty").cast("int").alias("quantity"),
)

# Step 5: total quantity sold per product, largest total first.
quantity_by_product = product_sales_df.groupBy("product_name").sum("quantity")
top_selling_products = quantity_by_product.orderBy("sum(quantity)", ascending=False)

display(top_selling_products.limit(10))

#### When is the Cyber-shot camera sold the most? I am curious...

In [0]:
from pyspark.sql.functions import from_json, explode, col, hour, from_unixtime

# Step 1: DDL schema describing the JSON array stored in `ordered_products`.
json_schema = "array<struct<curr:string,id:string,name:string,price:string,promotion_info:string,qty:string,unit:string>>"

# Step 2: derive the order hour and explode the products in a single pass.
# `order_datetime` is passed through from_unixtime, cast to a timestamp, and
# reduced to its hour component.
order_hour_expr = hour(from_unixtime(col("order_datetime")).cast("timestamp"))
ordered_product_expr = explode(from_json(col("ordered_products"), json_schema))

trend_df = (
    orders_df
    .select(
        order_hour_expr.alias("order_hour"),
        ordered_product_expr.alias("product"),
    )
    .select(
        "order_hour",
        col("product.name").alias("product_name"),
        col("product.qty").cast("int").alias("quantity"),
    )
)

# Step 3: keep only products whose name contains "Cyber-shot" and total the
# quantity sold for each hour, ordered by hour.
is_cyber_shot = col("product_name").contains("Cyber-shot")
camera_trend = (
    trend_df
    .where(is_cyber_shot)
    .groupBy("order_hour")
    .sum("quantity")
    .orderBy("order_hour")
)

display(camera_trend)

Databricks visualization. Run in Databricks to view.