In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import lit

In [0]:
# List the contents of the e-commerce data volume.
display(dbutils.fs.ls("/Volumes/workspace/ecommerce/ecommerce_data"))

path,name,size,modificationTime
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv,2019-Nov.csv,9006762395,1767962633000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv,2019-Oct.csv,5668612855,1767962553000
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/delta/,delta/,0,1768641313682
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/incremental/,incremental/,0,1768641313682
dbfs:/Volumes/workspace/ecommerce/ecommerce_data/parquet/,parquet/,0,1768641313682


In [0]:
# Load the October events from their Parquet copy and preview the first rows.
oct_parquet_path = "/Volumes/workspace/ecommerce/ecommerce_data/parquet/oct/"
df_oct = spark.read.parquet(oct_parquet_path)
df_oct.limit(5).show()

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-13 06:25:46|      view|   1002544|2053013555631882655|electronics.smart...|   apple| 460.51|518958788|e7e27c5c-1e78-481...|
|2019-10-13 06:25:46|      view|   3700301|2053013565983425517|appliances.enviro...|   vitek| 120.93|557977070|7afc206c-7259-4be...|
|2019-10-13 06:25:46|      view|  49100004|2127425375913902544|                NULL|    NULL|  45.05|514456508|9d6837a5-40df-49d...|
|2019-10-13 06:25:46|      view|   9200409|2053013552913973497|computers.periphe...|defender|  12.56|512530774|df2d048d-c1ae-41b...|
|2019-10-13 06:25:46|      view|   1306558|2053013558920217191|  comp

In [0]:
%sql
-- Daily purchase revenue with a trailing 7-calendar-day moving average.
WITH daily AS (
  -- One row per calendar day that has at least one purchase event.
  SELECT
    DATE(event_time) AS event_date,
    SUM(price) AS rev
  FROM ecommerce.silver.events
  WHERE event_type = 'purchase'
  GROUP BY DATE(event_time)
)
SELECT
  event_date,
  rev,
  -- RANGE (not ROWS) ties the frame to calendar days: with ORDER BY on a DATE
  -- column, "6 PRECEDING" means six days back, so a date with no purchases
  -- (and therefore no row in `daily`) cannot stretch the window past one week.
  -- Dates near the start of the data average over however many days exist.
  AVG(rev) OVER (
    ORDER BY event_date
    RANGE BETWEEN 6 PRECEDING AND CURRENT ROW
  ) AS ma7
FROM daily
ORDER BY event_date;

event_date,rev,ma7
2019-10-01,6275579.060000005,6275579.060000005
2019-10-02,6213628.530000009,6244603.795000007
2019-10-03,6233782.980000013,6240996.856666676
2019-10-04,8623058.190000024,6836512.1900000125
2019-10-05,7341094.460000007,6937428.644000012
2019-10-06,6736895.74000001,6904006.493333344
2019-10-07,6347448.09,6824498.150000011
2019-10-08,6819701.260000012,6902229.892857155
2019-10-09,6855193.409999998,6993882.018571437
2019-10-10,6665413.209999978,7055543.480000004


In [0]:
%sql
-- View-to-purchase conversion rate per category, highest rate first.
WITH category_counts AS (
  -- Count view and purchase events once per category.
  SELECT
    category_code,
    COUNT_IF(event_type = 'view') AS views,
    COUNT_IF(event_type = 'purchase') AS purchases
  FROM ecommerce.silver.events
  GROUP BY category_code
)
SELECT
  category_code,
  views,
  purchases,
  -- NULLIF maps a zero view count to NULL, so categories without views get a
  -- NULL rate instead of a division by zero.
  ROUND(purchases * 100.0 / NULLIF(views, 0), 2) AS conversion_rate
FROM category_counts
ORDER BY conversion_rate DESC;


category_code,views,purchases,conversion_rate
electronics.smartphone,10617170,337978,3.18
kids.fmcg.diapers,24201,768,3.17
electronics.audio.headphone,1018303,30501,3.0
appliances.iron,157611,3652,2.32
appliances.kitchen.microwave,164925,3708,2.25
medicine.tools.tonometer,13970,310,2.22
appliances.personal.scales,62603,1331,2.13
electronics.video.tv,1055799,21561,2.04
appliances.environment.water_heater,138762,2774,2.0
appliances.ironing_board,34263,664,1.94


In [0]:
%sql
-- Customer tiers by number of purchase events, with headcount and average
-- total spend per tier.
WITH purchases_per_user AS (
  -- Purchase count and total spend for every user with at least one purchase.
  SELECT
    user_id,
    COUNT(*) AS purchase_count,
    SUM(price) AS total_spent
  FROM ecommerce.silver.events
  WHERE event_type = 'purchase'
  GROUP BY user_id
),
tiered AS (
  -- Thresholds are checked top-down: 10+ purchases is VIP, 5-9 is Loyal,
  -- everything else is Regular.
  SELECT
    CASE
      WHEN purchase_count >= 10 THEN 'VIP'
      WHEN purchase_count >= 5 THEN 'Loyal'
      ELSE 'Regular'
    END AS tier,
    total_spent
  FROM purchases_per_user
)
SELECT
  tier,
  COUNT(*) AS customers,
  ROUND(AVG(total_spent), 2) AS avg_ltv
FROM tiered
GROUP BY tier
ORDER BY avg_ltv DESC;


tier,customers,avg_ltv
VIP,7781,7230.81
Loyal,19114,2109.34
Regular,320223,416.43
