# ðŸ”§ Data Preprocessing: Criteo Attribution Dataset

This notebook prepares the Criteo data for attribution modeling by:
1. Creating user journey sequences
2. Engineering features (time deltas, position encoding, attention masks)
3. Building train/validation/test splits
4. Saving processed data for downstream models

In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import pickle
from tqdm import tqdm
import warnings
import json
from datetime import datetime
warnings.filterwarnings('ignore')

# Check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"PyTorch: {torch.__version__}")
print(f"Device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Paths
DATA_RAW = Path('../data/raw/criteo')
DATA_PROCESSED = Path('../data/processed')
DATA_PROCESSED.mkdir(exist_ok=True)

PyTorch: 2.6.0+cu124
Device: cuda
GPU: NVIDIA GeForce RTX 4070 Laptop GPU


## 1. Load Full Dataset

The Criteo dataset has ~16M rows. We'll use a configurable sample size.

In [2]:
# Configuration - adjust based on your system resources
# With 32GB RAM, you can handle 8-10M rows comfortably
SAMPLE_ROWS = 8_000_000  # Increased from 5M for better model quality

print("Loading Criteo Attribution Dataset...")
print(f"Sample size: {SAMPLE_ROWS:,} rows")

df = pd.read_csv(
    DATA_RAW / 'criteo_attribution_dataset.tsv.gz', 
    sep='\t', 
    nrows=SAMPLE_ROWS
)

print(f"\nLoaded {len(df):,} impressions")
print(f"Unique users: {df['uid'].nunique():,}")
print(f"Unique campaigns: {df['campaign'].nunique():,}")
print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1e9:.2f} GB")

Loading Criteo Attribution Dataset...
Sample size: 8,000,000 rows

Loaded 8,000,000 impressions
Unique users: 3,759,156
Unique campaigns: 675
Memory usage: 1.41 GB


In [3]:
# Quick data overview
print("\n--- Data Overview ---")
print(f"Columns: {list(df.columns)}")
print(f"\nConversion rate: {df['conversion'].mean():.4%}")
print(f"Click rate: {df['click'].mean():.4%}")
print(f"\nTimestamp range: {df['timestamp'].min()} to {df['timestamp'].max()}")


--- Data Overview ---
Columns: ['timestamp', 'uid', 'campaign', 'conversion', 'conversion_timestamp', 'conversion_id', 'attribution', 'click', 'click_pos', 'click_nb', 'cost', 'cpo', 'time_since_last_click', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9']

Conversion rate: 4.8223%
Click rate: 35.2893%

Timestamp range: 0 to 1256682


## 2. Create User Journey Sequences

We create fixed-length sequences per user with:
- Campaign IDs (encoded)
- Click indicators
- Costs
- Time deltas (globally normalized)
- Position encoding
- Attention masks (for padding)

In [4]:
# Sort by user and timestamp
df = df.sort_values(['uid', 'timestamp'])

# Encode campaigns
campaign_encoder = LabelEncoder()
df['campaign_encoded'] = campaign_encoder.fit_transform(df['campaign'])

print(f"Unique campaigns: {len(campaign_encoder.classes_)}")
print(f"Campaign ID range: 0 to {len(campaign_encoder.classes_) - 1}")

Unique campaigns: 675
Campaign ID range: 0 to 674


In [5]:
# Compute global time delta statistics for normalization
print("Computing global time delta statistics...")
all_time_deltas = []
for uid, group in tqdm(df.groupby('uid'), desc='Computing stats'):
    timestamps = group['timestamp'].values
    if len(timestamps) > 1:
        deltas = np.diff(timestamps)
        all_time_deltas.extend(deltas[deltas > 0].tolist())

TIME_DELTA_MEAN = np.mean(all_time_deltas) if all_time_deltas else 1.0
TIME_DELTA_STD = np.std(all_time_deltas) if all_time_deltas else 1.0
print(f"Time delta mean: {TIME_DELTA_MEAN:.2f}")
print(f"Time delta std: {TIME_DELTA_STD:.2f}")

Computing global time delta statistics...


Computing stats: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3759156/3759156 [00:59<00:00, 63233.63it/s]


Time delta mean: 169292.26
Time delta std: 214311.97


In [6]:
# Configuration
MAX_SEQ_LEN = 20  # Maximum impressions per journey

def create_user_sequences(df, max_len=MAX_SEQ_LEN):
    """Create fixed-length sequences for each user with all required features."""
    sequences = []
    seq_lengths = []  # Track for statistics
    
    for uid, group in tqdm(df.groupby('uid'), desc='Creating sequences'):
        group = group.sort_values('timestamp')
        
        # Get user's journey data (last N impressions)
        campaigns = group['campaign_encoded'].values[-max_len:]
        timestamps = group['timestamp'].values[-max_len:]
        clicks = group['click'].values[-max_len:]
        costs = group['cost'].values[-max_len:]
        
        # User-level targets
        converted = int(group['conversion'].max())
        total_cost = float(group['cost'].sum())
        
        # Original sequence length (before padding)
        seq_len = len(campaigns)
        seq_lengths.append(seq_len)
        
        # Pad sequences if needed
        if seq_len < max_len:
            pad_len = max_len - seq_len
            campaigns = np.pad(campaigns, (pad_len, 0), constant_values=0)  # 0 = padding
            timestamps = np.pad(timestamps, (pad_len, 0), constant_values=0)
            clicks = np.pad(clicks, (pad_len, 0), constant_values=0)
            costs = np.pad(costs, (pad_len, 0), constant_values=0)
        
        # Calculate time deltas (globally normalized)
        time_deltas = np.diff(timestamps, prepend=timestamps[0])
        time_deltas = (time_deltas - TIME_DELTA_MEAN) / (TIME_DELTA_STD + 1e-8)
        time_deltas = np.clip(time_deltas, -10, 10)  # Clip outliers
        
        # Create attention mask (1 = real token, 0 = padding)
        mask = np.zeros(max_len, dtype=np.float32)
        mask[-seq_len:] = 1.0  # Real tokens are at the end after padding
        
        # Position encoding (relative positions)
        positions = np.arange(max_len, dtype=np.int64)
        
        # Campaigns +1 so 0 can be padding index in embeddings
        campaigns = campaigns + 1
        
        sequences.append({
            'uid': uid,
            'campaigns': campaigns.astype(np.int64),
            'clicks': clicks.astype(np.float32),
            'costs': costs.astype(np.float32),
            'time_deltas': time_deltas.astype(np.float32),
            'positions': positions,
            'mask': mask,
            'seq_len': min(seq_len, max_len),
            'converted': converted,
            'total_cost': total_cost
        })
    
    return sequences, seq_lengths

sequences, seq_lengths = create_user_sequences(df)
print(f"\nCreated {len(sequences):,} user sequences")

Creating sequences: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 3759156/3759156 [15:47<00:00, 3968.16it/s]


Created 3,759,156 user sequences





In [7]:
# Sequence length statistics
print("\n--- Sequence Length Distribution ---")
print(f"Min: {min(seq_lengths)}")
print(f"Max: {max(seq_lengths)}")
print(f"Mean: {np.mean(seq_lengths):.2f}")
print(f"Median: {np.median(seq_lengths):.0f}")
print(f"Sequences with length = MAX_SEQ_LEN: {sum(1 for s in seq_lengths if s >= MAX_SEQ_LEN):,}")


--- Sequence Length Distribution ---
Min: 1
Max: 20
Mean: 2.11
Median: 1
Sequences with length = MAX_SEQ_LEN: 9,200


## 3. Train/Validation/Test Split

In [8]:
# Stratified split to maintain conversion rate distribution
labels = [s['converted'] for s in sequences]

# 70% train, 15% val, 15% test
train_seq, temp_seq = train_test_split(
    sequences, 
    test_size=0.3, 
    random_state=42, 
    stratify=labels
)

temp_labels = [s['converted'] for s in temp_seq]
val_seq, test_seq = train_test_split(
    temp_seq, 
    test_size=0.5, 
    random_state=42, 
    stratify=temp_labels
)

print("--- Split Sizes ---")
print(f"Train: {len(train_seq):,} ({len(train_seq)/len(sequences):.1%})")
print(f"Val:   {len(val_seq):,} ({len(val_seq)/len(sequences):.1%})")
print(f"Test:  {len(test_seq):,} ({len(test_seq)/len(sequences):.1%})")

print("\n--- Conversion Rates (should be similar across splits) ---")
print(f"Train: {sum(s['converted'] for s in train_seq) / len(train_seq):.4%}")
print(f"Val:   {sum(s['converted'] for s in val_seq) / len(val_seq):.4%}")
print(f"Test:  {sum(s['converted'] for s in test_seq) / len(test_seq):.4%}")

--- Split Sizes ---
Train: 2,631,409 (70.0%)
Val:   563,873 (15.0%)
Test:  563,874 (15.0%)

--- Conversion Rates (should be similar across splits) ---
Train: 5.0185%
Val:   5.0185%
Test:  5.0187%


## 4. Save Processed Data

In [9]:
# Comprehensive config for reproducibility
config = {
    'n_campaigns': len(campaign_encoder.classes_) + 1,  # +1 for padding token
    'max_seq_len': MAX_SEQ_LEN,
    'sample_rows': SAMPLE_ROWS,
    'n_train': len(train_seq),
    'n_val': len(val_seq),
    'n_test': len(test_seq),
    'conversion_rate': sum(labels) / len(labels),
    'time_delta_mean': float(TIME_DELTA_MEAN),
    'time_delta_std': float(TIME_DELTA_STD),
    'created_at': datetime.now().isoformat(),
    'feature_dim': 4  # campaigns, clicks, costs, time_deltas
}

# Save sequences
processed_data = {
    'train_sequences': train_seq,
    'val_sequences': val_seq,
    'test_sequences': test_seq,
    'config': config
}

with open(DATA_PROCESSED / 'processed_sequences.pkl', 'wb') as f:
    pickle.dump(processed_data, f)

# Save encoders
encoders = {
    'campaign': campaign_encoder
}
with open(DATA_PROCESSED / 'encoders.pkl', 'wb') as f:
    pickle.dump(encoders, f)

# Save config as JSON for easy inspection
with open(DATA_PROCESSED / 'config.json', 'w') as f:
    json.dump(config, f, indent=2)

print(f"\nâœ… Saved to {DATA_PROCESSED}")
print(f"\n--- Config ---")
for k, v in config.items():
    if isinstance(v, float):
        print(f"  {k}: {v:.6f}")
    else:
        print(f"  {k}: {v}")


âœ… Saved to ..\data\processed

--- Config ---
  n_campaigns: 676
  max_seq_len: 20
  sample_rows: 8000000
  n_train: 2631409
  n_val: 563873
  n_test: 563874
  conversion_rate: 0.050185
  time_delta_mean: 169292.264153
  time_delta_std: 214311.971348
  created_at: 2025-12-15T05:05:43.889145
  feature_dim: 4


In [10]:
# Verify saved data
print("\n--- Verifying Saved Data ---")
with open(DATA_PROCESSED / 'processed_sequences.pkl', 'rb') as f:
    loaded = pickle.load(f)

print(f"Train sequences loaded: {len(loaded['train_sequences']):,}")
print(f"Val sequences loaded: {len(loaded['val_sequences']):,}")
print(f"Test sequences loaded: {len(loaded['test_sequences']):,}")

# Sample sequence structure
sample = loaded['train_sequences'][0]
print(f"\nSample sequence keys: {list(sample.keys())}")
print(f"Campaigns shape: {sample['campaigns'].shape}")
print(f"Mask shape: {sample['mask'].shape}")


--- Verifying Saved Data ---
Train sequences loaded: 2,631,409
Val sequences loaded: 563,873
Test sequences loaded: 563,874

Sample sequence keys: ['uid', 'campaigns', 'clicks', 'costs', 'time_deltas', 'positions', 'mask', 'seq_len', 'converted', 'total_cost']
Campaigns shape: (20,)
Mask shape: (20,)


In [11]:
print("\n" + "="*60)
print("âœ… PREPROCESSING COMPLETE!")
print("="*60)
print(f"\nReady for modeling with {len(train_seq):,} training sequences")
print(f"\nFiles created:")
print(f"  - {DATA_PROCESSED / 'processed_sequences.pkl'}")
print(f"  - {DATA_PROCESSED / 'encoders.pkl'}")
print(f"  - {DATA_PROCESSED / 'config.json'}")


âœ… PREPROCESSING COMPLETE!

Ready for modeling with 2,631,409 training sequences

Files created:
  - ..\data\processed\processed_sequences.pkl
  - ..\data\processed\encoders.pkl
  - ..\data\processed\config.json


---
**Next:** [03_rule_based_models.ipynb](03_rule_based_models.ipynb) - Implement baseline attribution models