In [0]:
#Load the Dataset
from pyspark.sql import functions as F
from pyspark.sql.window import Window

path = "/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv"
df = spark.read.csv(path, header = True, inferSchema = True)
df.printSchema() #print the structure of the dataset

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)



In [0]:
#sample data
df.show(5)

+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand|  price|  user_id|        user_session|
+-------------------+----------+----------+-------------------+--------------------+--------+-------+---------+--------------------+
|2019-10-01 00:00:00|      view|  44600062|2103807459595387724|                NULL|shiseido|  35.79|541312140|72d76fde-8bb3-4e0...|
|2019-10-01 00:00:00|      view|   3900821|2053013552326770905|appliances.enviro...|    aqua|   33.2|554748717|9333dfbd-b87a-470...|
|2019-10-01 00:00:01|      view|  17200506|2053013559792632471|furniture.living_...|    NULL|  543.1|519107250|566511c2-e2e3-422...|
|2019-10-01 00:00:01|      view|   1307067|2053013558920217191|  computers.notebook|  lenovo| 251.74|550050854|7c90fc70-0e80-459...|
|2019-10-01 00:00:04|      view|   1004237|2053013555631882655|electr

In [0]:
df.count()    #size check

42448764

In [0]:
df.columns()

['event_time',
 'event_type',
 'product_id',
 'category_id',
 'category_code',
 'brand',
 'price',
 'user_id',
 'user_session']

In [0]:
#Count NULLS per column
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(),1)).alias(c)
    for c in df.columns
])
null_counts.show()

+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|event_time|event_type|product_id|category_id|category_code|  brand|price|user_id|user_session|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+
|         0|         0|         0|          0|     13515609|6113008|    0|      0|           2|
+----------+----------+----------+-----------+-------------+-------+-----+-------+------------+



In [0]:
#Sumamry Statistics for numerical columns
df.describe().show()

+-------+----------+--------------------+--------------------+-------------------+--------+-----------------+-------------------+--------------------+
|summary|event_type|          product_id|         category_id|      category_code|   brand|            price|            user_id|        user_session|
+-------+----------+--------------------+--------------------+-------------------+--------+-----------------+-------------------+--------------------+
|  count|  42448764|            42448764|            42448764|           28933155|36335756|         42448764|           42448764|            42448762|
|   mean|      NULL|1.0549932375842676E7|2.057404237936260...|               NULL|     NaN|290.3236606848809|5.335371475081686E8|                NULL|
| stddev|      NULL|1.1881906970608113E7|1.843926466140415E16|               NULL|     NaN|358.2691553394021|1.852373817465414E7|                NULL|
|    min|      cart|             1000978| 2053013552226107603|    accessories.bag|  a-case|   

In [0]:
#Summary Statistics for Numeric Column
from pyspark.sql.types import NumericType
numeric_cols = [c[0] for c in df.dtypes if c[1] in ['int', 'bigint', 'float','double']]
print(numeric_cols)

['product_id', 'category_id', 'price', 'user_id']


In [0]:
#Categorical = String columns
categorical_cols = [c[0] for c in df.dtypes if c[1] == 'string']
print(categorical_cols)

['event_type', 'category_code', 'brand', 'user_session']


In [0]:
#Unique counts / cardinality for categorical columns
for col in categorical_cols:
    unique_categorical_count = df.select(col).distinct().count()
    print(f"Unique values for {col}: {unique_categorical_count}")


Unique values for event_type: 3
Unique values for category_code: 127
Unique values for brand: 3446
Unique values for user_session: 9244422


In [0]:
#Top 5 most frequent values for categorical columns
for col in categorical_cols:
    top_5 = df.groupBy(col).count().orderBy(F.desc("count")).show(5)

+----------+--------+
|event_type|   count|
+----------+--------+
|      view|40779399|
|      cart|  926516|
|  purchase|  742849|
+----------+--------+

+--------------------+--------+
|       category_code|   count|
+--------------------+--------+
|                NULL|13515609|
|electronics.smart...|11507231|
|  electronics.clocks| 1311033|
|  computers.notebook| 1137623|
|electronics.video.tv| 1113750|
+--------------------+--------+
only showing top 5 rows
+-------+-------+
|  brand|  count|
+-------+-------+
|   NULL|6113008|
|samsung|5282775|
|  apple|4122554|
| xiaomi|3083763|
| huawei|1111205|
+-------+-------+
only showing top 5 rows
+--------------------+-----+
|        user_session|count|
+--------------------+-----+
|fb075266-182d-4c1...| 1159|
|cfb90a35-9575-495...| 1137|
|2183f046-46f1-4ff...|  584|
|b2101293-44c1-481...|  564|
|e9d2b8ad-3e69-47f...|  425|
+--------------------+-----+
only showing top 5 rows


In [0]:
#JOINS:

#Find user who viewed the product before purchasing it

#Step 1: Filter Views
views = (
    df
    .filter(F.col("event_type") == "view")
    .select("user_id","product_id", "event_time")
    )
#Step 2: Filter Purchases
purchases = (
    df
    .filter(F.col("event_type") == "purchase")
    .select("user_id", "product_id", "event_time")
)
#Step 3: Joins view amd purchases, keep only view that happened before purchase
views_before_purchase = (
    purchases
    .join(
        views,
        on = ["user_id", "product_id"],
        how = "inner"
    )
    .filter(views.event_time < purchases.event_time)
)

#Show first 5 results
views_before_purchase.show(5)


+---------+----------+-------------------+-------------------+
|  user_id|product_id|         event_time|         event_time|
+---------+----------+-------------------+-------------------+
|538336842|   3600898|2019-10-04 16:04:44|2019-10-01 03:05:56|
|538336842|   3600898|2019-10-04 16:04:44|2019-10-01 03:06:13|
|538336842|   3600898|2019-10-04 16:04:44|2019-10-01 03:06:19|
|546892715|  15100008|2019-10-01 05:35:30|2019-10-01 05:32:50|
|546892715|  15100008|2019-10-01 05:35:30|2019-10-01 05:33:38|
+---------+----------+-------------------+-------------------+
only showing top 5 rows


In [0]:
#Windows Functions: Running Totals / Rankings

#Running total of events / user

#Define the widnow
user_window = Window.partitionBy("user_id").orderBy("event_time")

#Add a running total column
df_running = df.withColumn(
    "Cumulative Events",
    F.count("*").over(user_window)
)
df_running.show(5)


+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+-----------------+
|         event_time|event_type|product_id|        category_id|       category_code|   brand| price|  user_id|        user_session|Cumulative Events|
+-------------------+----------+----------+-------------------+--------------------+--------+------+---------+--------------------+-----------------+
|2019-10-09 10:30:19|      view|  17301541|2053013553853497655|                NULL|    NULL|162.17|205053188|e1eadbc6-aef5-4cf...|                1|
|2019-10-09 10:30:44|      view|  17301541|2053013553853497655|                NULL|    NULL|162.17|205053188|e1eadbc6-aef5-4cf...|                2|
|2019-10-07 06:23:01|      view|  16200119|2053013556344914381|   kids.fmcg.diapers|   moony| 18.47|222907508|cb653adc-46a2-4d9...|                1|
|2019-10-07 06:26:23|      view|  16200162|2053013556344914381|   kids.fmcg.diapers|   moony| 18.47|

In [0]:
#Running top products by revenue

# Filter only purchases
purchases = df.filter(F.col("event_type") == "purchase")

# Calculate revenue per product
revenue = (
    purchases.filter(F.col("category_code").isNotNull())
    .groupBy("product_id", "category_code")
    .agg(F.sum("price").alias("revenue"))
)

# Define ranking window
rank_window = Window.partitionBy("category_code").orderBy(F.desc("revenue"))

# Add rank
top_products = revenue.withColumn("rank", F.rank().over(rank_window))

top_products.show(10)


+----------+---------------+------------------+----+
|product_id|  category_code|           revenue|rank|
+----------+---------------+------------------+----+
|  28401058|accessories.bag|1505.8500000000001|   1|
|  21000006|accessories.bag|           1194.28|   2|
|  28400775|accessories.bag| 861.2799999999999|   3|
|  28401252|accessories.bag|            686.26|   4|
|  18300846|accessories.bag|             615.2|   5|
|  18300155|accessories.bag| 605.7699999999998|   6|
|  28401176|accessories.bag|            602.34|   7|
|  28401045|accessories.bag| 571.4399999999999|   8|
|  18300713|accessories.bag|            571.26|   9|
|  28400759|accessories.bag|            547.82|  10|
+----------+---------------+------------------+----+
only showing top 10 rows


In [0]:
#Conversion Rate by category:
conversion = (
    df.groupBy("category_code", "event_type").count()
    .groupBy("category_code")
    .pivot("event_type")
    .sum("count")
    .withColumn("conversion_rate", F.col("purchase") / F.col("view") * 100)
)

conversion.show(5)


+--------------------+----+--------+------+------------------+
|       category_code|cart|purchase|  view|   conversion_rate|
+--------------------+----+--------+------+------------------+
|auto.accessories....|NULL|      46| 12305|0.3738317757009346|
|furniture.living_...|NULL|    1084|215471|0.5030839416905292|
| stationery.cartrige| 106|     134|  7380|1.8157181571815717|
|       sport.bicycle| 693|     838|128759|0.6508282916145668|
|        apparel.sock|   7|      21|  2621|0.8012209080503624|
+--------------------+----+--------+------+------------------+
only showing top 5 rows
