# Chapter 4: Metric Design and Variance Reduction

This notebook demonstrates four key techniques for metric design and variance reduction:

1. **Proxy Metric Validation**: Using correlation analysis to select the best leading indicator for long-term business goals
2. **Variance Comparison**: Understanding why different metric types (binomial, count, continuous) have dramatically different variances
3. **CUPED**: Implementing one of the most effective variance reduction techniques, which uses pre-experiment data as a covariate
4. **Metric Transformation**: Applying winsorization and log transformation to handle skewed distributions with outliers

## Setup: Install Required Packages

If running for the first time, uncomment the following line to install dependencies:

In [None]:
# Uncomment if packages are not installed
# !pip install pandas numpy scipy matplotlib

## Section 1.2: Validating Proxy Metrics - Correlation Analysis

This example demonstrates how to compute a correlation table to evaluate candidate proxy metrics against a lagging indicator (12-month LTV).

In [None]:
import pandas as pd
import numpy as np

# Simulated historical user data: first-month behaviour plus a 12-month outcome.
# A real analysis would read these columns from the data warehouse instead.
np.random.seed(42)
n_users = 10000

# Keyword order below is also the order of the random draws, so the seeded
# stream (and therefore every column) is reproducible.
user_data = pd.DataFrame(dict(
    user_id=range(n_users),
    # Candidate proxy metrics observed during the first month (simulated)
    weekly_revenue_per_user=np.random.exponential(20, n_users),
    session_success_rate=np.random.beta(2, 8, n_users),
    items_per_cart=np.random.poisson(2.5, n_users),
    number_of_wishlist_adds=np.random.poisson(1.2, n_users),
    number_of_searches_per_week=np.random.poisson(3, n_users),
    time_spent_on_product_pages=np.random.exponential(120, n_users),
    # Lagging indicator, observed 12 months later (baseline noise only so far)
    ltv_12_months=np.random.exponential(150, n_users),
))

# Inject a dependence of LTV on three of the early metrics so that the
# correlation table has a real signal to find.
behavior_signal = (
    0.7 * user_data['weekly_revenue_per_user'] +
    0.5 * user_data['session_success_rate'] * 100 +
    0.3 * user_data['items_per_cart'] * 10
)
user_data['ltv_12_months'] = user_data['ltv_12_months'] + behavior_signal

# Every column except the identifier and the outcome is a candidate proxy.
proxy_metrics = [
    column for column in user_data.columns
    if column not in ('user_id', 'ltv_12_months')
]

# Pearson correlation of each candidate with the lagging indicator.
ltv = user_data['ltv_12_months']
correlation_results = [
    {'Proxy Metric': name, 'Correlation': user_data[name].corr(ltv)}
    for name in proxy_metrics
]

# Strongest candidates first.
results_df = pd.DataFrame(correlation_results).sort_values('Correlation', ascending=False)
print(results_df)

## Section 3.1: The Variance Problem - Comparing Metric Types

This example computes variance for three different metric types (binomial, count, continuous) and demonstrates why revenue metrics require much larger sample sizes than conversion metrics.

In [None]:
import numpy as np

# Simulate 10,000 users with different metric types.
# Seeded here so the cell prints the same numbers when it is run on its own.
np.random.seed(42)
n_users = 10000

# 1. Binomial Metric: Conversion (0 or 1)
conversion_rate = 0.05
conversions = np.random.binomial(1, conversion_rate, n_users)
var_binomial = np.var(conversions, ddof=1)
print(f"Binomial (Conversion) - Variance: {var_binomial:.4f}")
print(f"  Theoretical: p(1-p) = {conversion_rate * (1-conversion_rate):.4f}\n")

# 2. Count Metric: Number of purchases per user
# 95% make 0, 4% make 1, 1% make 2 purchases
purchase_values = [0, 1, 2]
purchase_probs = [0.95, 0.04, 0.01]
purchase_counts = np.random.choice(purchase_values, n_users, p=purchase_probs)
var_count = np.var(purchase_counts, ddof=1)
expected_count = sum(p * v for p, v in zip(purchase_probs, purchase_values))
expected_squared = sum(p * v**2 for p, v in zip(purchase_probs, purchase_values))
print(f"Count (Purchases) - Variance: {var_count:.4f}")
print(f"  Theoretical: E[C²] - E[C]² = {expected_squared - expected_count**2:.4f}\n")

# 3. Continuous Metric: Revenue per user
# Most users spend $0, converters' spend is exponentially distributed with a
# $50 mean, and rare "whales" spend between $500 and $1,000.
revenue = np.zeros(n_users)
converters = np.random.rand(n_users) < conversion_rate
revenue[converters] = np.random.exponential(50, converters.sum())
# Add a few whale users (their revenue replaces any value assigned above)
whale_mask = np.random.rand(n_users) < 0.001
revenue[whale_mask] = np.random.uniform(500, 1000, whale_mask.sum())
var_revenue = np.var(revenue, ddof=1)
print(f"Continuous (Revenue) - Variance: {var_revenue:.2f}")
print(f"  Standard deviation: ${np.sqrt(var_revenue):.2f}")
print(f"  Mean revenue: ${np.mean(revenue):.2f}\n")

# Comparison: required sample size is proportional to variance / MDE^2.
# The three metrics have different units, so their raw variances cannot be
# divided by one another (the ratio would change if revenue were in cents).
# Holding the *relative* MDE constant (MDE = r * mean) makes the requirement
# proportional to variance / mean^2, the squared coefficient of variation,
# which is unit-free and comparable across metrics.
cv2_binomial = var_binomial / np.mean(conversions) ** 2
cv2_count = var_count / np.mean(purchase_counts) ** 2
cv2_revenue = var_revenue / np.mean(revenue) ** 2
print("Relative sample size requirements (holding relative MDE constant):")
print("  Binomial: 1.0x (baseline)")
print(f"  Count: {cv2_count / cv2_binomial:.1f}x")
print(f"  Revenue: {cv2_revenue / cv2_binomial:.1f}x")

## Section 4.3: CUPED Implementation

This example demonstrates a complete implementation of CUPED (Controlled-experiment Using Pre-Experiment Data) for variance reduction in A/B testing.

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

# Sample data: experiment results with a pre-experiment covariate.
# Seeded here so the cell prints the same numbers when it is run on its own.
np.random.seed(42)
n_exp_users = 1000

# CUPED only reduces variance when the covariate predicts the outcome, so
# in-experiment revenue is built from prior revenue plus independent noise.
# With these parameters the population correlation is about 0.7, which
# corresponds to roughly a 50% variance reduction.
revenue_prior = np.random.exponential(50, n_exp_users)
revenue_noise = np.random.exponential(25, n_exp_users)

data = pd.DataFrame({
    'user_id': range(n_exp_users),
    'group': np.random.choice(['control', 'treatment'], n_exp_users),
    'revenue_in_experiment': 0.5 * revenue_prior + revenue_noise,  # Y
    'revenue_30_days_prior': revenue_prior,                        # X (covariate)
})

# Introduce a treatment effect: +5% revenue for treatment group
treatment_mask = data['group'] == 'treatment'
data.loc[treatment_mask, 'revenue_in_experiment'] *= 1.05

# Step 1: Define the outcome metric (Y) and covariate (X)
Y = data['revenue_in_experiment'].values
X = data['revenue_30_days_prior'].values

# Step 2: Calculate covariance and variance (pooled across both groups)
cov_XY = np.cov(X, Y)[0, 1]  # Sample covariance between X and Y (ddof=1)
var_X = np.var(X, ddof=1)     # Sample variance of X (same ddof as np.cov)

# Step 3: Calculate optimal theta
theta = cov_XY / var_X
print(f"Optimal theta: {theta:.4f}")
print(f"Correlation: {np.corrcoef(X, Y)[0, 1]:.4f}")

# Step 4: Compute CUPED-adjusted metric for each user.
# The covariate is centred on its overall mean so the adjusted metric keeps
# the same overall mean as Y; without centring, the adjusted group means are
# shifted by theta * mean(X) and no longer read as revenue.
data['Y_cuped'] = Y - theta * (X - X.mean())

# Step 5: Compare variance reduction
var_original = data['revenue_in_experiment'].var()
var_cuped = data['Y_cuped'].var()
variance_reduction = (1 - var_cuped / var_original) * 100
print(f"\nVariance Reduction: {variance_reduction:.1f}%")

# Step 6: Perform t-test on original and CUPED-adjusted metrics
control_mask = data['group'] == 'control'
control_original = data.loc[control_mask, 'revenue_in_experiment']
treatment_original = data.loc[treatment_mask, 'revenue_in_experiment']

control_cuped = data.loc[control_mask, 'Y_cuped']
treatment_cuped = data.loc[treatment_mask, 'Y_cuped']

# Original t-test
t_stat_orig, p_val_orig = stats.ttest_ind(treatment_original, control_original)
print(f"\n--- Original Metric ---")
print(f"Control Mean: ${control_original.mean():.2f}")
print(f"Treatment Mean: ${treatment_original.mean():.2f}")
print(f"p-value: {p_val_orig:.4f}")

# CUPED-adjusted t-test
t_stat_cuped, p_val_cuped = stats.ttest_ind(treatment_cuped, control_cuped)
print(f"\n--- CUPED-Adjusted Metric ---")
print(f"Control Mean: ${control_cuped.mean():.2f}")
print(f"Treatment Mean: ${treatment_cuped.mean():.2f}")
print(f"p-value: {p_val_cuped:.4f}")

# A ratio of p-values is not a stable sensitivity measure and divides by zero
# when the CUPED p-value underflows. The variance ratio is reported instead:
# it is the factor by which CUPED multiplies the effective sample size.
print(f"\nSensitivity improvement (effective sample size): {var_original / var_cuped:.2f}x")

## Section 5.2: Metric Transformation - Winsorization and Log Transformation

This example demonstrates how winsorization and log transformation can reduce variance in skewed metrics with outliers.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

# Generate skewed revenue data with outliers
np.random.seed(42)
n = 1000
# Segment sizes are derived from n so the mix stays 95% / 4.5% / 0.5%.
n_whales = round(0.005 * n)
n_high = round(0.045 * n)
n_normal = n - n_high - n_whales
# Most users spend little, some spend moderate amounts, a few are whales
revenue = np.concatenate([
    np.random.exponential(20, n_normal),       # 95% normal users
    np.random.exponential(100, n_high),        # 4.5% higher spenders
    np.random.uniform(500, 2000, n_whales),    # 0.5% whales
])
# The segments above are concatenated in order, so the array must be shuffled
# before it is split into A/B groups below; otherwise every high spender and
# whale lands in group B and the comparison is confounded by construction.
np.random.shuffle(revenue)

print("Original Revenue Metric:")
print(f"  Mean: ${revenue.mean():.2f}")
print(f"  Std Dev: ${revenue.std():.2f}")
print(f"  Variance: {revenue.var():.2f}\n")

# Apply Winsorization at 99th percentile (upper tail only; revenue is >= 0)
percentile_99 = np.percentile(revenue, 99)
revenue_winsorized = np.clip(revenue, 0, percentile_99)
print(f"Winsorized Revenue (capped at 99th percentile: ${percentile_99:.2f}):")
print(f"  Mean: ${revenue_winsorized.mean():.2f}")
print(f"  Std Dev: ${revenue_winsorized.std():.2f}")
print(f"  Variance: {revenue_winsorized.var():.2f}")
print(f"  Variance reduction: {(1 - revenue_winsorized.var()/revenue.var())*100:.1f}%\n")

# Apply Log Transformation (log(revenue + 1) to handle zeros)
revenue_log = np.log(revenue + 1)
print(f"Log-Transformed Revenue (log(revenue + 1)):")
print(f"  Mean: {revenue_log.mean():.4f}")
print(f"  Std Dev: {revenue_log.std():.4f}")
print(f"  Variance: {revenue_log.var():.4f}")
# NOTE: the log metric is in log-dollars, so this percentage compares
# variances on different scales; the t-tests below are the fair comparison.
print(f"  Variance reduction: {(1 - revenue_log.var()/revenue.var())*100:.1f}%\n")

# Demonstrate impact on hypothesis testing
# Split into two groups (simulate A/B test with 5% revenue lift in B).
# The array was shuffled above, so this positional split is a random assignment.
half = n // 2
group_a = revenue[:half]
group_b = revenue[half:] * 1.05  # 5% lift

# Test on original metric
t_stat_orig, p_val_orig = stats.ttest_ind(group_b, group_a)
print(f"T-test on Original Revenue: p-value = {p_val_orig:.4f}")

# Test on winsorized metric (both groups share the pooled 99th-percentile cap)
group_a_wins = np.clip(group_a, 0, percentile_99)
group_b_wins = np.clip(group_b, 0, percentile_99)
t_stat_wins, p_val_wins = stats.ttest_ind(group_b_wins, group_a_wins)
print(f"T-test on Winsorized Revenue: p-value = {p_val_wins:.4f}")

# Test on log-transformed metric
group_a_log = np.log(group_a + 1)
group_b_log = np.log(group_b + 1)
t_stat_log, p_val_log = stats.ttest_ind(group_b_log, group_a_log)
print(f"T-test on Log Revenue: p-value = {p_val_log:.4f}")