### TASK 1 — Load Gold Feature Table and Inspect

In [0]:
# Load the Gold feature table from Day 2
import pyspark.sql.functions as F
from pyspark.sql.functions import col, when, count

CATALOG = 'ecommerce'
df = spark.table(f'{CATALOG}.gold.user_features')

print(f'Total users: {df.count():,}')
df.printSchema()
display(df.limit(5))




Total users: 5,721,593
root
 |-- user_id: long (nullable = true)
 |-- total_events: long (nullable = true)
 |-- total_sessions: long (nullable = true)
 |-- total_views: long (nullable = true)
 |-- total_cart_adds: long (nullable = true)
 |-- total_purchases: long (nullable = true)
 |-- unique_products_purchased: long (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- avg_purchase_price: double (nullable = true)
 |-- avg_price_viewed: double (nullable = true)
 |-- last_seen_at: timestamp (nullable = true)
 |-- favourite_brand: string (nullable = true)
 |-- top_viewed_category: string (nullable = true)
 |-- purchase_rate: double (nullable = true)



user_id,total_events,total_sessions,total_views,total_cart_adds,total_purchases,unique_products_purchased,total_revenue,avg_purchase_price,avg_price_viewed,last_seen_at,favourite_brand,top_viewed_category,purchase_rate
515335787,126,23,70,34,22,10,13070.5,594.1136363636364,601.761,2019-11-29T09:17:37.000Z,apple,electronics.smartphone,0.1746
547853730,37,5,32,3,2,2,368.96,184.48,185.268125,2019-11-17T13:21:50.000Z,lucente,electronics.smartphone,0.0541
551298810,32,5,21,9,2,2,1564.49,782.245,638.7880952380953,2019-11-17T13:11:04.000Z,samsung,electronics.smartphone,0.0625
533765562,405,56,381,23,1,1,32.66,32.66,317.1911023622048,2019-11-29T14:32:06.000Z,xiaomi,electronics.smartphone,0.0025
515560005,196,30,190,5,1,1,411.83,411.83,313.47852631578945,2019-11-30T11:51:49.000Z,samsung,electronics.clocks,0.0051


In [0]:
# Profile the feature table before any transformation
print('=== FEATURE TABLE SUMMARY ===')
df.describe().show(truncate=False)

# Check nulls across all columns
print('=== NULL COUNTS ===')
null_counts = df.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df.columns
])
null_counts.show(truncate=False)

=== FEATURE TABLE SUMMARY ===
+-------+--------------------+------------------+-----------------+------------------+------------------+-------------------+-------------------------+------------------+------------------+-----------------+---------------+-------------------+--------------------+
|summary|user_id             |total_events      |total_sessions   |total_views       |total_cart_adds   |total_purchases    |unique_products_purchased|total_revenue     |avg_purchase_price|avg_price_viewed |favourite_brand|top_viewed_category|purchase_rate       |
+-------+--------------------+------------------+-----------------+------------------+------------------+-------------------+-------------------------+------------------+------------------+-----------------+---------------+-------------------+--------------------+
|count  |5721593             |5721593           |5721593          |5721593           |5721593           |5721593            |5721593                  |862847            |86284

### TASK 2 — Create Binary Purchase Label (Target)
**`What We Are Doing`**

Create will_purchase = 1 if the user has total_purchases > 0, else 0. Then check the class distribution to understand how imbalanced the dataset is before we do anything about it.



In [0]:
# Create binary target label
df_labelled = df.withColumn(
    'will_purchase',
    when(col('total_purchases') > 0, 1).otherwise(0)
)
# Verify the label was created correctly
display(df_labelled.select(
    'user_id', 'total_purchases', 'total_revenue', 'will_purchase'
).limit(10))


user_id,total_purchases,total_revenue,will_purchase
515335787,22,13070.5,1
547853730,2,368.96,1
551298810,2,1564.49,1
533765562,1,32.66,1
515560005,1,411.83,1
520743527,14,4316.15,1
519036764,1,62.52,1
515660395,1,411.82,1
564043130,2,311.58,1
561091791,1,246.6,1


In [0]:
# Check class distribution (this reveals the imbalance)
print('=== CLASS DISTRIBUTION ===')
class_dist = df_labelled.groupBy('will_purchase') \
    .count() \
    .orderBy('will_purchase')
class_dist.show()

# Calculate imbalance ratio
total      = df_labelled.count()
purchasers = df_labelled.filter(col('will_purchase') == 1).count()
non_purchasers = total - purchasers

print(f'Total users:       {total:,}')
print(f'Purchasers (1):    {purchasers:,}  ({round(purchasers/total*100,2)}%)')
print(f'Non-purchasers (0):{non_purchasers:,}  ({round(non_purchasers/total*100,2)}%)')
print(f'Imbalance ratio:   {round(non_purchasers/purchasers,1)}:1')

=== CLASS DISTRIBUTION ===
+-------------+-------+
|will_purchase|  count|
+-------------+-------+
|            0|4858746|
|            1| 862847|
+-------------+-------+

Total users:       5,721,593
Purchasers (1):    862,847  (15.08%)
Non-purchasers (0):4,858,746  (84.92%)
Imbalance ratio:   5.6:1


`What to look for in the output:`
> If purchasers are < 30% of total users, you have a class imbalance problem.  
If purchasers are < 10%, the imbalance is severe and class weights or SMOTE are essential.  
Ecommerce datasets typically show 5-20% purchase rate — expect significant imbalance


### TASK 3 — Feature Selection and Remove Label Leakage
`What We Are Doing`
- Remove leaky features (those that encode the label), ID columns, and timestamp columns. Keep only features that would be available at prediction time — i.e. before the user makes a purchase decision.


In [0]:
# Define which features to keep and which to drop

# DROP: leaky features (encode the label), IDs, timestamps
DROP_COLS = [
    'user_id',                    # identifier — no predictive signal
    'total_purchases',            # directly encodes the label — LEAKY
    'total_revenue',              # zero for non-purchasers — LEAKY
    'avg_purchase_price',         # null for non-purchasers — LEAKY
    'purchase_rate',              # = purchases / total_events — LEAKY
    'unique_products_purchased',  # only non-zero for purchasers — LEAKY
    'last_seen_at',               # timestamp — not a model feature
    '_feature_date',              # pipeline metadata — not a model feature
]

# KEEP: observable pre-purchase behavioural signals
FEATURE_COLS = [
    'total_events',        # how active is this user overall?
    'total_sessions',      # how many sessions did they have?
    'total_views',         # how many products did they view?
    'total_cart_adds',     # did they add items to cart without buying?
    'avg_price_viewed',    # what price range are they browsing?
    'favourite_brand',     # brand affinity signal
    'top_viewed_category', # category preference signal
]

LABEL_COL = 'will_purchase'

print('Features to use for training:')
for f in FEATURE_COLS:
    print(f'  + {f}')
print(f'Label: {LABEL_COL}')


Features to use for training:
  + total_events
  + total_sessions
  + total_views
  + total_cart_adds
  + avg_price_viewed
  + favourite_brand
  + top_viewed_category
Label: will_purchase


In [0]:
# Build the clean ML dataset (keep only features + label)
df_ml = df_labelled.select(FEATURE_COLS + [LABEL_COL])

print(f'ML dataset shape: {df_ml.count():,} rows x {len(df_ml.columns)} columns')
df_ml.printSchema()
display(df_ml.limit(10))


ML dataset shape: 5,721,593 rows x 8 columns
root
 |-- total_events: long (nullable = true)
 |-- total_sessions: long (nullable = true)
 |-- total_views: long (nullable = true)
 |-- total_cart_adds: long (nullable = true)
 |-- avg_price_viewed: double (nullable = true)
 |-- favourite_brand: string (nullable = true)
 |-- top_viewed_category: string (nullable = true)
 |-- will_purchase: integer (nullable = false)



total_events,total_sessions,total_views,total_cart_adds,avg_price_viewed,favourite_brand,top_viewed_category,will_purchase
126,23,70,34,601.761,apple,electronics.smartphone,1
37,5,32,3,185.268125,lucente,electronics.smartphone,1
32,5,21,9,638.7880952380953,samsung,electronics.smartphone,1
405,56,381,23,317.1911023622048,xiaomi,electronics.smartphone,1
196,30,190,5,313.47852631578945,samsung,electronics.clocks,1
199,29,147,38,442.3252380952382,indesit,electronics.video.tv,1
86,12,82,3,65.90817073170733,omron,electronics.smartphone,1
270,19,263,6,442.6183650190114,bosch,appliances.kitchen.oven,1
118,16,110,6,196.39763636363637,vega,electronics.smartphone,1
205,11,203,1,308.1197536945813,elenberg,auto.accessories.player,1


### TASK 4 — Handle Nulls in Feature Columns
favourite_brand and top_viewed_category are nullable — some users never purchased from a specific brand or never viewed a categorised product. ML models cannot accept null values. We must handle them before training.



In [0]:

#  Check nulls in feature columns only
print('=== NULLS IN FEATURE COLUMNS ===')
null_check = df_ml.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in FEATURE_COLS
])
null_check.show(truncate=False)

=== NULLS IN FEATURE COLUMNS ===
+------------+--------------+-----------+---------------+----------------+---------------+-------------------+
|total_events|total_sessions|total_views|total_cart_adds|avg_price_viewed|favourite_brand|top_viewed_category|
+------------+--------------+-----------+---------------+----------------+---------------+-------------------+
|0           |0             |0          |0              |524             |4931659        |1019954            |
+------------+--------------+-----------+---------------+----------------+---------------+-------------------+



In [0]:
# Fill nulls with sensible defaults
# Numerical nulls -> 0 (user has no activity in that dimension)
# String nulls -> 'unknown' (placeholder — will be encoded later)

df_ml_clean = df_ml.fillna(0,subset=['total_events', 'total_sessions',
            'total_views', 'total_cart_adds', 'avg_price_viewed']) \
    .fillna('unknown', subset=['favourite_brand', 'top_viewed_category'])

# Confirm no nulls remain
print('=== NULLS AFTER FILLING ===')
df_ml_clean.select([
    F.count(F.when(F.col(c).isNull(), c)).alias(c)
    for c in df_ml_clean.columns
]).show(truncate=False)


=== NULLS AFTER FILLING ===
+------------+--------------+-----------+---------------+----------------+---------------+-------------------+-------------+
|total_events|total_sessions|total_views|total_cart_adds|avg_price_viewed|favourite_brand|top_viewed_category|will_purchase|
+------------+--------------+-----------+---------------+----------------+---------------+-------------------+-------------+
|0           |0             |0          |0              |0               |0              |0                  |0            |
+------------+--------------+-----------+---------------+----------------+---------------+-------------------+-------------+



### TASK 5 — Handle Class Imbalance with Class Weights
`What We Are Doing`
> Instead of modifying the data (oversampling/undersampling), we compute class weights and add them as a column. When the model sees a purchaser row, the higher weight tells it: 'this row is more important — getting it wrong costs more.' This forces the model to pay attention to the minority class.


In [0]:
# Compute class weights
# Formula: weight = total_rows / (num_classes * class_count)
# This is the standard sklearn-style balanced class weight formula

total      = df_ml_clean.count()
n_classes  = 2
count_0    = df_ml_clean.filter(col('will_purchase') == 0).count()
count_1    = df_ml_clean.filter(col('will_purchase') == 1).count()

weight_0   = round(total / (n_classes * count_0), 4)
weight_1   = round(total / (n_classes * count_1), 4)

print(f'Class 0 (non-purchaser) weight: {weight_0}')
print(f'Class 1 (purchaser) weight:     {weight_1}')
print(f'Weight ratio (1:0): {round(weight_1/weight_0, 1)}x')
print('Purchaser rows will be weighted higher to compensate for imbalance')


Class 0 (non-purchaser) weight: 0.5888
Class 1 (purchaser) weight:     3.3155
Weight ratio (1:0): 5.6x
Purchaser rows will be weighted higher to compensate for imbalance


In [0]:
# Add class_weight column to the dataset
df_weighted = df_ml_clean.withColumn(
    'class_weight',
    when(col('will_purchase') == 1, weight_1).otherwise(weight_0)
)

# Verify weights were applied correctly
df_weighted.groupBy('will_purchase', 'class_weight') \
    .count() \
    .orderBy('will_purchase') \
    .show()


+-------------+------------+-------+
|will_purchase|class_weight|  count|
+-------------+------------+-------+
|            0|      0.5888|4858746|
|            1|      3.3155| 862847|
+-------------+------------+-------+



### TASK 6 — Split Train / Test Sets
`What We Are Doing`
> Split the dataset into 80% training and 20% test using a stratified approach — preserving the class ratio in both sets. Add a split column so both sets can be saved in one Delta table and queried independently.

In [0]:
# Stratified train/test split using PySpark
# PySpark does not have a built-in stratified split like sklearn
# We split each class separately then union them together

TRAIN_RATIO = 0.8
TEST_RATIO  = 0.2
SEED        = 42   # fixed seed for reproducibility

# Split class 1 (purchasers) 80/20
df_class1 = df_weighted.filter(col('will_purchase') == 1)
train_1, test_1 = df_class1.randomSplit([TRAIN_RATIO, TEST_RATIO], seed=SEED)

# Split class 0 (non-purchasers) 80/20
df_class0 = df_weighted.filter(col('will_purchase') == 0)
train_0, test_0 = df_class0.randomSplit([TRAIN_RATIO, TEST_RATIO], seed=SEED)

# Union both classes back together
df_train = train_1.union(train_0).withColumn('split', F.lit('train'))
df_test  = test_1.union(test_0).withColumn('split', F.lit('test'))

# Combine into one dataset with split column
df_final = df_train.union(df_test)

print(f'Train set: {df_train.count():,} rows')
print(f'Test set:  {df_test.count():,} rows')
print(f'Total:     {df_final.count():,} rows')


Train set: 4,577,964 rows
Test set:  1,143,629 rows
Total:     5,721,593 rows


### TASK 7 — Validate Distribution in Train and Test Sets
`What We Are Doing`
> After splitting, always verify that the class ratio is preserved in both sets. Also check that feature distributions look similar between train and test — a very different distribution signals a bad split or data ordering issue.


In [0]:
# Validate class distribution in train and test
print('=== CLASS DISTRIBUTION VALIDATION ===')

for split_name in ['train', 'test']:
    df_split = df_final.filter(col('split') == split_name)
    total_s  = df_split.count()
    pos      = df_split.filter(col('will_purchase') == 1).count()
    neg      = total_s - pos
    print(f'\n{split_name.upper()} SET:')
    print(f'  Total:            {total_s:,}')
    print(f'  Purchasers (1):   {pos:,}  ({round(pos/total_s*100,2)}%)')
    print(f'  Non-purchasers (0):{neg:,}  ({round(neg/total_s*100,2)}%)')


=== CLASS DISTRIBUTION VALIDATION ===

TRAIN SET:
  Total:            4,577,964
  Purchasers (1):   690,781  (15.09%)
  Non-purchasers (0):3,887,183  (84.91%)

TEST SET:
  Total:            1,143,629
  Purchasers (1):   172,066  (15.05%)
  Non-purchasers (0):971,563  (84.95%)


In [0]:
# Validate feature distributions across train and test
print('=== FEATURE DISTRIBUTION COMPARISON: TRAIN vs TEST ===')

numeric_features = ['total_events', 'total_sessions', 'total_views',
                    'total_cart_adds', 'avg_price_viewed']

for feat in numeric_features:
    train_stats = df_train.agg(
        F.mean(feat).alias('mean'), F.stddev(feat).alias('std')
    ).collect()[0]
    test_stats = df_test.agg(
        F.mean(feat).alias('mean'), F.stddev(feat).alias('std')
    ).collect()[0]
    print(f'{feat}:')
    print(f'  Train: mean={train_stats["mean"]:.2f}  std={train_stats["std"]:.2f}')
    print(f'  Test:  mean={test_stats["mean"]:.2f}  std={test_stats["std"]:.2f}')


=== FEATURE DISTRIBUTION COMPARISON: TRAIN vs TEST ===
total_events:
  Train: mean=22.41  std=58.56
  Test:  mean=22.33  std=56.71
total_sessions:
  Train: mean=4.60  std=16.96
  Test:  mean=4.58  std=12.46
total_views:
  Train: mean=21.23  std=56.61
  Test:  mean=21.16  std=54.69
total_cart_adds:
  Train: mean=0.81  std=2.97
  Test:  mean=0.81  std=2.98
avg_price_viewed:
  Train: mean=306.77  std=308.78
  Test:  mean=307.24  std=308.94


### TASK 8 — Save ML-Ready Dataset as Delta Table
Save the final ML dataset as a Delta table in the Gold layer. This is the input to Day 6 model training. Saving it as Delta means it is versioned, queryable, and auditable — the model can always be retrained from the exact same data snapshot.


In [0]:
# Save the ML-ready dataset
df_final.write \
    .format('delta') \
    .mode('overwrite') \
    .option('overwriteSchema', 'true') \
    .saveAsTable(f'{CATALOG}.gold.ml_dataset')

print(f'ML dataset saved to {CATALOG}.gold.ml_dataset')
spark.sql(f'DESCRIBE DETAIL {CATALOG}.gold.ml_dataset') \
    .select('numFiles', 'sizeInBytes').show()


ML dataset saved to ecommerce.gold.ml_dataset
+--------+-----------+
|numFiles|sizeInBytes|
+--------+-----------+
|       8|   24215104|
+--------+-----------+



In [0]:
# Final verification: query train and test sets from the saved table
df_saved = spark.table(f'{CATALOG}.gold.ml_dataset')

print('=== FINAL ML DATASET SUMMARY ===')
df_saved.groupBy('split', 'will_purchase') \
    .count() \
    .orderBy('split', 'will_purchase') \
    .show()

print(f'Columns: {df_saved.columns}')
print(f'Total rows: {df_saved.count():,}')
display(df_saved.limit(10))


=== FINAL ML DATASET SUMMARY ===
+-----+-------------+-------+
|split|will_purchase|  count|
+-----+-------------+-------+
| test|            0| 970299|
| test|            1| 172082|
|train|            0|3888447|
|train|            1| 690765|
+-----+-------------+-------+

Columns: ['total_events', 'total_sessions', 'total_views', 'total_cart_adds', 'avg_price_viewed', 'favourite_brand', 'top_viewed_category', 'will_purchase', 'class_weight', 'split']
Total rows: 5,721,593


total_events,total_sessions,total_views,total_cart_adds,avg_price_viewed,favourite_brand,top_viewed_category,will_purchase,class_weight,split
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
1,1,0,1,0.0,unknown,unknown,0,0.5888,test
