#### Step 0: Prepare Combined Dataset

In [0]:
from pyspark.sql import functions as F

# October events come straight from the Silver table.
df_oct = spark.table("silver_events_oct")

# November events: read the Silver table and derive ingestion_date from
# the ingestion timestamp so the column is present on both sides of the union.
df_nov = (
    spark.table("silver_df_nov_realworld")
    .withColumn("ingestion_date", F.to_date("ingestion_ts"))
)

# Stack both months, matching columns by name; a column that exists on only
# one side is filled with nulls on the other (allowMissingColumns=True).
events = df_oct.unionByName(df_nov, allowMissingColumns=True)

# Sanity check: combined schema plus a small sample of rows.
events.printSchema()
events.show(5)

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- category_code: string (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: double (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- ingestion_ts: timestamp (nullable = true)
 |-- ingestion_date: date (nullable = true)
 |-- event_date: date (nullable = true)
 |-- price_tier: string (nullable = true)

+-------------------+----------+----------+-------------------+--------------------+-------+------+---------+--------------------+--------------------+--------------+----------+----------+
|         event_time|event_type|product_id|        category_id|       category_code|  brand| price|  user_id|        user_session|        ingestion_ts|ingestion_date|event_date|price_tier|
+-------------------+----------+----------+-------------------+------

#### Step 1: Calculate Statistical Summaries (Descriptive Stats)

In [0]:
# Summary statistics for price: count, mean, stddev, min, max.
events.describe("price").show()

# Approximate quartiles of price. The last argument (0.01) is the relative
# error tolerated by approxQuantile.
quartile_probs = [0.25, 0.5, 0.75]
quantiles = events.approxQuantile("price", quartile_probs, 0.01)
print(f"25%: {quantiles[0]}, Median: {quantiles[1]}, 75%: {quantiles[2]}")

+-------+-----------------+
|summary|            price|
+-------+-----------------+
|  count|        109585110|
|   mean|292.1580414624276|
| stddev|356.8470302075258|
|    min|              0.0|
|    max|          2574.07|
+-------+-----------------+

25%: 67.87, Median: 164.48, 75%: 360.09


#### Step 2: Hypothesis Testing (Weekday vs Weekend)

Null Hypothesis (H₀):
Conversion rate is the same on weekdays and weekends.

Alternative Hypothesis (H₁):
Conversion rate differs between weekdays and weekends.

In [0]:
# 1. Flag weekend events.
#    Spark's dayofweek() numbers days 1 = Sunday ... 7 = Saturday, so {1, 7}
#    selects Sunday and Saturday.
import math

from pyspark.sql.window import Window

events = events.withColumn(
    "is_weekend",
    F.dayofweek("event_date").isin([1, 7]),
)

# 2. Raw event counts per (is_weekend, event_type) to see views vs purchases.
events.groupBy("is_weekend", "event_type") \
      .count() \
      .show()

# 3. Conversion rate (purchases per 100 views) for weekend vs weekday.
#    The raw counts are kept in their own frame so the significance test below
#    can reuse them; `conversion` keeps its original two-column shape.
purchase_flag = F.when(F.col("event_type") == "purchase", 1).otherwise(0)
view_flag = F.when(F.col("event_type") == "view", 1).otherwise(0)

conversion_counts = events.groupBy("is_weekend").agg(
    F.sum(purchase_flag).alias("purchases"),
    F.sum(view_flag).alias("views"),
)

conversion = conversion_counts.select(
    "is_weekend",
    (F.col("purchases") / F.col("views") * 100).alias("conversion_rate"),
)

conversion.show()


# 4. Significance test: two-sided, pooled two-proportion z-test.
def two_proportion_z_test(successes_a, trials_a, successes_b, trials_b):
    """Two-sided pooled z-test for the difference between two proportions.

    Args:
        successes_a: number of successes in group A (here: purchases).
        trials_a: number of trials in group A (here: views).
        successes_b: number of successes in group B.
        trials_b: number of trials in group B.

    Returns:
        (z_stat, p_value), where z_stat is positive when group A's rate is
        higher and p_value is the two-sided normal-approximation p-value.

    Raises:
        ValueError: if a group has no trials, or if the pooled rate is not
            strictly between 0 and 1 (the standard error is undefined).
    """
    if trials_a <= 0 or trials_b <= 0:
        raise ValueError("both groups need at least one trial")
    rate_a = successes_a / trials_a
    rate_b = successes_b / trials_b
    pooled = (successes_a + successes_b) / (trials_a + trials_b)
    variance = pooled * (1 - pooled) * (1 / trials_a + 1 / trials_b)
    if variance <= 0:
        raise ValueError("pooled rate must lie strictly between 0 and 1")
    z_stat = (rate_a - rate_b) / math.sqrt(variance)
    # Two-sided p-value: 2 * (1 - Phi(|z|)) == erfc(|z| / sqrt(2)).
    p_value = math.erfc(abs(z_stat) / math.sqrt(2))
    return z_stat, p_value


ALPHA = 0.05  # significance level used for the decision below

# Only two rows (weekend / weekday) are expected, so collecting is cheap.
counts_by_group = {
    row["is_weekend"]: (row["purchases"], row["views"])
    for row in conversion_counts.collect()
}

if True in counts_by_group and False in counts_by_group:
    weekend_purchases, weekend_views = counts_by_group[True]
    weekday_purchases, weekday_views = counts_by_group[False]
    z_stat, p_value = two_proportion_z_test(
        weekend_purchases, weekend_views,
        weekday_purchases, weekday_views,
    )
    # NOTE(review): the test treats each view as an independent trial; events
    # from the same user/session are correlated, so the p-value is optimistic.
    # With very large samples p_value can underflow to 0.0.
    print(f"z = {z_stat:.2f}, two-sided p-value = {p_value:.3g}")
    if p_value < ALPHA:
        print(f"Reject H0 at alpha = {ALPHA}: conversion rate differs between weekdays and weekends.")
    else:
        print(f"Fail to reject H0 at alpha = {ALPHA}.")
else:
    print("Cannot run the z-test: weekend or weekday group is missing from the data.")


+----------+----------+--------+
|is_weekend|event_type|   count|
+----------+----------+--------+
|     false|  purchase| 1046615|
|     false|      view|70100259|
|      true|      view|34023656|
|     false|      cart| 2371140|
|      true|      cart| 1430365|
|      true|  purchase|  613075|
+----------+----------+--------+

+----------+------------------+
|is_weekend|   conversion_rate|
+----------+------------------+
|      true|1.8019080606740203|
|     false|1.4930258674222587|
+----------+------------------+



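**Result:** The observed conversion rate is higher on weekends (≈1.80%) than on weekdays (≈1.49%).
With tens of millions of views in each group, a gap of this size is far larger than sampling noise, so we reject the null hypothesis. The significance should be confirmed with a formal test such as a two-proportion z-test.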

#### Step 3: Identify Correlations


In [0]:
# Step 3: Identify Correlations
from pyspark.sql import functions as F

# Product-level metrics: average price, purchase count, view count, and
# conversion rate (purchases per 100 views). Products with zero views get a
# null conversion_rate rather than a division by zero.
df_product = events.groupBy("product_id").agg(
  F.avg("price").alias("avg_price"),
  F.sum(F.when(F.col("event_type") == "purchase", 1).otherwise(0)).alias("purchases"),
  F.sum(F.when(F.col("event_type") == "view", 1).otherwise(0)).alias("views")
).withColumn(
  "conversion_rate",
  F.when(F.col("views") > 0,
         (F.col("purchases") / F.col("views")) * 100
         ).otherwise(None)
)

df_product.show()

# Correlate only products where both metrics are defined. conversion_rate is
# null for never-viewed products, and how stat.corr treats nulls is not
# guaranteed (NOTE(review): some Spark versions appear to read them as 0.0 -
# confirm for this runtime). Dropping them makes the result well defined.
df_product_rated = df_product.na.drop(subset=["avg_price", "conversion_rate"])

# Pearson correlation between average price and conversion rate.
corr = df_product_rated.stat.corr("avg_price", "conversion_rate")
print(f"Correlation between avg_price and conversion_rate: {corr}")

+----------+------------------+---------+------+-------------------+
|product_id|         avg_price|purchases| views|    conversion_rate|
+----------+------------------+---------+------+-------------------+
|  23301548|30.630000000000003|        3|   345| 0.8695652173913043|
|   1005159|209.88282712913957|     3663|164322| 2.2291598203527223|
|   6902812| 81.21988623435722|        8|   855| 0.9356725146198831|
|  11100315| 7.700000000000004|       24|  2987| 0.8034817542684968|
|   4801567| 84.74563245823393|        2|   414| 0.4830917874396135|
|   7004004|128.67985849056606|       23|  1646| 1.3973268529769136|
|   5701087| 52.81682803682785|      100|  6591| 1.5172204521316948|
|  17300014|43.774654002713746|       26|  1388| 1.8731988472622478|
|  27300009|182.53178861788618|        7|  1081| 0.6475485661424607|
|  26404407| 97.18602779108772|       25|  2052| 1.2183235867446394|
|   4100258| 375.9613807531383|       31|  1838| 1.6866158868335146|
|   8500290| 264.4663780120483|   


The correlation between average price and conversion rate is weakly negative, 
indicating that price alone has little linear association with purchase probability at the product level. 
Conversion is likely driven more by factors such as brand, category, and user intent, although those factors are not tested in this notebook.

#### Step 4: Feature Engineering for ML

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Step 4.1: Remove invalid prices.
# Keeping only price > 0 also guarantees log(price) below is defined; rows
# with a null price are dropped as well because the comparison is not true.
events_fe = events.filter(F.col("price") > 0)

# Step 4.2: Log price (natural logarithm).
events_fe = events_fe.withColumn("price_log", F.log(F.col("price")))

# Step 4.3: Time-based features.
# dayofweek() is 1 = Sunday ... 7 = Saturday; is_weekend is stored as 0/1 here,
# replacing any existing is_weekend column on the frame.
events_fe = events_fe.withColumn("hour", F.hour("event_time"))
events_fe = events_fe.withColumn("day_of_week", F.dayofweek("event_date"))
events_fe = events_fe.withColumn("is_weekend", F.dayofweek("event_date").isin([1,7]).cast("int"))

# Step 4.4: User behavioral feature - seconds elapsed since the user's first event.
# ignorenulls=True matters: ascending order places null event_time rows first,
# so a plain first() would return null and blank out this feature for every
# event of that user. With it, only rows whose own event_time is null stay null.
user_window = Window.partitionBy("user_id").orderBy("event_time")
first_event_time = F.first(F.col("event_time"), ignorenulls=True).over(user_window)
events_fe = events_fe.withColumn(
    "time_since_first_event",
    F.col("event_time").cast("long") - first_event_time.cast("long")
)

# Step 4.5: Binary label - 1 for purchase events, 0 for everything else.
events_fe = events_fe.withColumn("label", F.when(F.col("event_type") == "purchase", 1).otherwise(0))

# Step 4.6: Select final ML features.
ml_features = events_fe.select(
    "price_log", "hour", "day_of_week", "is_weekend", "time_since_first_event", "label"
)

ml_features.show(5)
ml_features.printSchema()


+------------------+----+-----------+----------+----------------------+-----+
|         price_log|hour|day_of_week|is_weekend|time_since_first_event|label|
+------------------+----+-----------+----------+----------------------+-----+
| 4.194943760778217|   7|          6|         0|                     0|    0|
| 4.191773707553646|  14|          5|         0|               1146401|    0|
|2.9734866646066713|   5|          3|         0|                     0|    0|
| 5.072482697422336|   8|          5|         0|                     0|    0|
| 5.498192290922892|  15|          2|         0|                     0|    0|
+------------------+----+-----------+----------+----------------------+-----+
only showing top 5 rows
root
 |-- price_log: double (nullable = true)
 |-- hour: integer (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- is_weekend: integer (nullable = true)
 |-- time_since_first_event: long (nullable = true)
 |-- label: integer (nullable = false)

