# A/B Test Analysis Framework

**Author**: Xiaoxiao Wu  
**Purpose**: Reusable A/B testing analysis module with:
- Power analysis & sample size calculation
- Two-sample t-test & chi-square test
- Sequential testing (always-valid p-values)
- Uplift modeling for heterogeneous treatment effects
- Visualization of results

**Context**: Built from experience running large-scale experiments at Uber China (pricing elasticity, dispatching algorithms, user segmentation campaigns).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from statsmodels.stats.power import TTestIndPower, NormalIndPower
from statsmodels.stats.proportion import proportions_ztest
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Power Analysis & Sample Size Calculation

Before running any experiment, we need to determine the minimum sample size to detect a meaningful effect.

In [None]:
def calculate_sample_size(
    baseline_rate: float,
    min_detectable_effect: float,
    alpha: float = 0.05,
    power: float = 0.80,
    test_type: str = 'proportions'
) -> dict:
    """
    Calculate minimum sample size per group for a two-sided A/B test.

    Args:
        baseline_rate: Current conversion rate (e.g., 0.05 for 5%). Must lie
            strictly between 0 and 1 when test_type is 'proportions'.
        min_detectable_effect: For 'proportions', the minimum relative lift to
            detect (e.g., 0.10 for 10% lift). For 'continuous', the value is
            used directly as the standardized effect size (Cohen's d).
            Must be non-zero.
        alpha: Significance level (Type I error rate), strictly in (0, 1)
        power: Statistical power (1 - Type II error rate), strictly in (0, 1)
        test_type: 'proportions' or 'continuous'

    Returns:
        dict with sample size and experiment parameters. The key
        'expected_treatment_rate' is always baseline_rate * (1 + effect); it
        is only meaningful as a rate for test_type='proportions'.

    Raises:
        ValueError: If test_type is not recognised, or if a numeric argument
            is outside the range for which a sample size can be solved.
    """
    # Fail early with a clear message instead of silently treating a typo
    # such as 'proportion' as the continuous case.
    if test_type not in ('proportions', 'continuous'):
        raise ValueError(
            f"test_type must be 'proportions' or 'continuous', got {test_type!r}"
        )
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")
    if not 0 < power < 1:
        raise ValueError(f"power must be in (0, 1), got {power}")
    if min_detectable_effect == 0:
        raise ValueError("min_detectable_effect must be non-zero")

    treatment_rate = baseline_rate * (1 + min_detectable_effect)

    if test_type == 'proportions':
        # arcsin(sqrt(p)) is only defined for p in [0, 1]; outside that range
        # the effect size would be NaN and the solver would fail obscurely.
        if not 0 < baseline_rate < 1:
            raise ValueError(
                f"baseline_rate must be in (0, 1) for proportions, got {baseline_rate}"
            )
        if not 0 <= treatment_rate <= 1:
            raise ValueError(
                "baseline_rate * (1 + min_detectable_effect) must be in [0, 1], "
                f"got {treatment_rate}"
            )
        # Cohen's h for proportions
        h = 2 * np.arcsin(np.sqrt(treatment_rate)) - 2 * np.arcsin(np.sqrt(baseline_rate))
        analysis = NormalIndPower()
        n = analysis.solve_power(effect_size=h, alpha=alpha, power=power, alternative='two-sided')
    else:
        # Cohen's d for continuous metrics: the caller supplies the
        # standardized effect size directly.
        analysis = TTestIndPower()
        n = analysis.solve_power(
            effect_size=min_detectable_effect, alpha=alpha, power=power, alternative='two-sided'
        )

    # Round up: a fractional participant still requires a whole one.
    n_per_group = int(np.ceil(n))

    return {
        'sample_size_per_group': n_per_group,
        'total_sample_size': n_per_group * 2,
        'baseline_rate': baseline_rate,
        'expected_treatment_rate': treatment_rate,
        'min_detectable_effect': min_detectable_effect,
        'alpha': alpha,
        'power': power,
    }

# Example: Current conversion rate is 5%, want to detect 10% relative lift
# (alpha, power and test_type are left at their defaults: 0.05, 0.80, 'proportions').
result = calculate_sample_size(baseline_rate=0.05, min_detectable_effect=0.10)
# Report the per-group and total sample sizes with thousands separators.
print(f"Sample size needed per group: {result['sample_size_per_group']:,}")
print(f"Total participants needed:    {result['total_sample_size']:,}")
# Show the baseline and expected treatment rates as percentages, plus the relative lift.
print(f"Detecting: {result['baseline_rate']*100:.1f}% → {result['expected_treatment_rate']*100:.1f}% ({result['min_detectable_effect']*100:.0f}% lift)")

## 2. A/B Test Analysis — Conversion Rate (Proportions Test)

In [None]:
def analyze_ab_test_proportions(
    control_conversions: int, control_total: int,
    treatment_conversions: int, treatment_total: int,
    alpha: float = 0.05
) -> dict:
    """
    Analyze an A/B test for binary outcomes (conversion rates).
    Uses two-proportions z-test.

    Args:
        control_conversions: Number of conversions in the control group.
        control_total: Number of participants in the control group (> 0).
        treatment_conversions: Number of conversions in the treatment group.
        treatment_total: Number of participants in the treatment group (> 0).
        alpha: Significance level, strictly in (0, 1).

    Returns:
        dict with both conversion rates, the absolute difference
        (treatment - control), the relative lift, the z statistic and
        two-sided p-value, a (1 - alpha) confidence interval for the
        difference under the key 'ci_95', a significance flag and a
        recommendation string. When the control rate is 0 the relative lift
        is reported as +inf for a positive difference and 0.0 otherwise.

    Raises:
        ValueError: If a group is empty, a conversion count is negative or
            exceeds its group size, or alpha is outside (0, 1).
    """
    # Validate up front: an empty group would otherwise divide by zero (or,
    # with NumPy integers, quietly produce NaN/inf rates).
    for label, conversions, total in (
        ('control', control_conversions, control_total),
        ('treatment', treatment_conversions, treatment_total),
    ):
        if total <= 0:
            raise ValueError(f"{label}_total must be positive, got {total}")
        if not 0 <= conversions <= total:
            raise ValueError(
                f"{label}_conversions must be between 0 and {label}_total, "
                f"got {conversions} of {total}"
            )
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")

    # Conversion rates
    p_control = control_conversions / control_total
    p_treatment = treatment_conversions / treatment_total
    diff = p_treatment - p_control
    if p_control != 0:
        lift = diff / p_control
    else:
        # Relative lift is undefined against a zero baseline.
        lift = np.inf if diff > 0 else 0.0

    # Two-proportions z-test
    count = np.array([treatment_conversions, control_conversions])
    nobs = np.array([treatment_total, control_total])
    z_stat, p_value = proportions_ztest(count, nobs, alternative='two-sided')

    # Confidence interval for the difference (unpooled standard error)
    se = np.sqrt(p_control * (1 - p_control) / control_total +
                 p_treatment * (1 - p_treatment) / treatment_total)
    z_crit = stats.norm.ppf(1 - alpha / 2)
    ci_lower = diff - z_crit * se
    ci_upper = diff + z_crit * se

    significant = p_value < alpha

    # The direction of the decision follows the sign of the difference.
    if significant and diff > 0:
        recommendation = 'SHIP IT ✅'
    elif significant and diff < 0:
        recommendation = 'REVERT ❌'
    else:
        recommendation = 'INCONCLUSIVE ⚠️'

    return {
        'control_rate': p_control,
        'treatment_rate': p_treatment,
        'absolute_diff': diff,
        'relative_lift': lift,
        'z_statistic': z_stat,
        'p_value': p_value,
        'ci_95': (ci_lower, ci_upper),
        'significant': significant,
        'recommendation': recommendation
    }

# Example test result: 500/10,000 conversions in control vs 560/10,000 in
# treatment, analyzed at the default alpha of 0.05.
result = analyze_ab_test_proportions(
    control_conversions=500, control_total=10000,
    treatment_conversions=560, treatment_total=10000
)

# Print the headline numbers returned by the analysis.
print(f"Control rate:    {result['control_rate']:.2%}")
print(f"Treatment rate:  {result['treatment_rate']:.2%}")
print(f"Relative lift:   {result['relative_lift']:+.1%}")
print(f"P-value:         {result['p_value']:.4f}")
# ci_95 is a (lower, upper) tuple for the absolute difference in rates.
print(f"95% CI:          [{result['ci_95'][0]:.4f}, {result['ci_95'][1]:.4f}]")
print(f"Significant:     {result['significant']}")
print(f"Recommendation:  {result['recommendation']}")

## 3. A/B Test Analysis — Continuous Metrics (Revenue, Time-on-Trip, etc.)

This section analyzes continuous metrics such as revenue per user, average order value, or time-on-trip, using Welch's t-test by default with an optional bootstrap alternative.

In [None]:
def analyze_ab_test_continuous(
    control_data: np.ndarray,
    treatment_data: np.ndarray,
    alpha: float = 0.05,
    use_bootstrap: bool = False,
    n_bootstrap: int = 10000
) -> dict:
    """
    Analyze A/B test for continuous outcomes.
    Supports parametric (Welch's t-test) and non-parametric (bootstrap) approaches.

    Args:
        control_data: 1-D array-like of control observations (at least 2).
        treatment_data: 1-D array-like of treatment observations (at least 2).
        alpha: Significance level, strictly in (0, 1).
        use_bootstrap: If True, use a percentile bootstrap of the difference
            in means instead of Welch's t-test.
        n_bootstrap: Number of bootstrap resamples (only used when
            use_bootstrap is True; must be >= 1).

    Returns:
        dict with both group means, the absolute difference
        (treatment - control), the relative lift (np.inf when the control
        mean is exactly 0), the two-sided p-value, a (1 - alpha) confidence
        interval for the difference under the key 'ci_95', a significance
        flag, the method name and a recommendation string.

    Raises:
        ValueError: If either sample is not 1-D or has fewer than two
            observations, or if alpha / n_bootstrap are out of range.
    """
    control = np.asarray(control_data)
    treatment = np.asarray(treatment_data)
    for label, sample in (('control_data', control), ('treatment_data', treatment)):
        if sample.ndim != 1 or sample.size < 2:
            raise ValueError(
                f"{label} must be a 1-D sample with at least 2 observations"
            )
    if not 0 < alpha < 1:
        raise ValueError(f"alpha must be in (0, 1), got {alpha}")

    n_c, n_t = control.size, treatment.size
    mean_c = np.mean(control)
    mean_t = np.mean(treatment)
    diff = mean_t - mean_c
    lift = diff / mean_c if mean_c != 0 else np.inf

    if use_bootstrap:
        if n_bootstrap < 1:
            raise ValueError(f"n_bootstrap must be >= 1, got {n_bootstrap}")
        # Bootstrap confidence interval: resample each group with
        # replacement and record the difference in means.
        boot_diffs = np.empty(n_bootstrap)
        for i in range(n_bootstrap):
            boot_c = np.random.choice(control, size=n_c, replace=True)
            boot_t = np.random.choice(treatment, size=n_t, replace=True)
            boot_diffs[i] = np.mean(boot_t) - np.mean(boot_c)

        ci_lower = np.percentile(boot_diffs, 100 * alpha / 2)
        ci_upper = np.percentile(boot_diffs, 100 * (1 - alpha / 2))
        # Two-sided p-value: twice the smaller tail of the bootstrap
        # distribution relative to zero.
        p_value = np.mean(boot_diffs <= 0) * 2
        p_value = min(p_value, 2 - p_value)
        method = 'Bootstrap'
    else:
        # Welch's t-test (unequal variances)
        _, p_value = stats.ttest_ind(treatment, control, equal_var=False)

        # Build the interval from the same ingredients as the test itself:
        # sample variances (ddof=1) and Welch-Satterthwaite degrees of
        # freedom, so the CI excludes 0 exactly when p < alpha.
        var_c = np.var(control, ddof=1) / n_c
        var_t = np.var(treatment, ddof=1) / n_t
        se = np.sqrt(var_c + var_t)
        df_denominator = var_c ** 2 / (n_c - 1) + var_t ** 2 / (n_t - 1)
        if df_denominator > 0:
            df = (var_c + var_t) ** 2 / df_denominator
        else:
            # Both samples are constant; fall back to the pooled df so the
            # critical value stays finite (the interval collapses to diff).
            df = n_c + n_t - 2
        t_crit = stats.t.ppf(1 - alpha / 2, df=df)
        ci_lower = diff - t_crit * se
        ci_upper = diff + t_crit * se
        method = "Welch's t-test"

    significant = p_value < alpha

    # Decide on the sign of the absolute difference, not the relative lift:
    # the lift is +inf for a zero control mean and changes sign when the
    # control mean is negative, either of which would invert the decision.
    if significant and diff > 0:
        recommendation = 'SHIP IT ✅'
    elif significant and diff < 0:
        recommendation = 'REVERT ❌'
    else:
        recommendation = 'INCONCLUSIVE ⚠️'

    return {
        'control_mean': mean_c,
        'treatment_mean': mean_t,
        'absolute_diff': diff,
        'relative_lift': lift,
        'p_value': p_value,
        'ci_95': (ci_lower, ci_upper),
        'significant': significant,
        'method': method,
        'recommendation': recommendation
    }

# Simulate revenue data: 5,000 log-normal draws per group, with the treatment
# group's log-scale mean set slightly higher (3.55 vs 3.5) and the same sigma.
control_rev = np.random.lognormal(mean=3.5, sigma=0.8, size=5000)
treatment_rev = np.random.lognormal(mean=3.55, sigma=0.8, size=5000)

# use_bootstrap is left at its default (False), so the Welch's t-test path runs.
result = analyze_ab_test_continuous(control_rev, treatment_rev)
print(f"Control mean:    ${result['control_mean']:.2f}")
print(f"Treatment mean:  ${result['treatment_mean']:.2f}")
print(f"Relative lift:   {result['relative_lift']:+.1%}")
print(f"P-value:         {result['p_value']:.4f}")
print(f"Method:          {result['method']}")
print(f"Recommendation:  {result['recommendation']}")

## 4. Visualization

In [None]:
def plot_ab_results(result: dict, metric_name: str = 'Conversion Rate',
                    save_path: 'str | None' = 'results/ab_test_results.png'):
    """
    Visualize A/B test results: group values and the effect with its CI.

    Args:
        result: Dict with 'relative_lift', 'absolute_diff', 'ci_95',
            'p_value', 'significant' and 'recommendation', plus either
            'control_rate'/'treatment_rate' or 'control_mean'/'treatment_mean'.
        metric_name: Label used for axis titles.
        save_path: Where to write the PNG. Missing parent directories are
            created. Pass None to skip saving and only show the figure.
    """
    from pathlib import Path

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Plot 1: Bar chart of the two group values
    ax = axes[0]
    groups = ['Control', 'Treatment']
    # Rate-style results are labelled as percentages, mean-style results as
    # currency. Deciding by key rather than by magnitude keeps a mean below 1
    # from being printed as a percentage.
    is_rate = 'control_rate' in result
    means = [result.get('control_rate', result.get('control_mean')),
             result.get('treatment_rate', result.get('treatment_mean'))]
    colors = ['#2196F3', '#4CAF50' if result['relative_lift'] > 0 else '#F44336']

    bars = ax.bar(groups, means, color=colors, alpha=0.8, width=0.5)
    ax.set_ylabel(metric_name)
    ax.set_title(f'{metric_name}: Control vs Treatment')

    for bar, val in zip(bars, means):
        ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001,
                f'{val:.2%}' if is_rate else f'${val:.2f}',
                ha='center', va='bottom', fontweight='bold')

    # Plot 2: Effect size with CI
    ax = axes[1]
    ci = result['ci_95']
    diff = result['absolute_diff']

    # errorbar takes the distances below and above the point, not the bounds.
    ax.errorbar(0, diff, yerr=[[diff - ci[0]], [ci[1] - diff]],
                fmt='o', markersize=10, capsize=8, capthick=2, linewidth=2,
                color='green' if result['significant'] else 'gray')
    ax.axhline(y=0, color='red', linestyle='--', alpha=0.5)
    ax.set_xlim(-1, 1)
    ax.set_xticks([])
    ax.set_ylabel(f'Difference in {metric_name}')
    ax.set_title(f"Effect Size (p={result['p_value']:.4f}) — {result['recommendation']}")

    plt.tight_layout()
    if save_path is not None:
        # savefig does not create directories; make sure the target exists.
        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        plt.savefig(save_path, dpi=150, bbox_inches='tight')
    plt.show()

# plot_ab_results(result, metric_name='Revenue per User')

## 5. Multiple Testing Correction (Bonferroni & FDR)

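When several metrics are tested in the same experiment, adjust the p-values so that the family-wise error rate (Bonferroni, Holm) or the false discovery rate (Benjamini-Hochberg) stays controlled.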

In [None]:
from statsmodels.stats.multitest import multipletests

def correct_multiple_tests(p_values: list, method: str = 'fdr_bh', alpha: float = 0.05):
    """
    Adjust a collection of p-values for multiple comparisons.

    Args:
        p_values: Raw p-values, one per hypothesis tested.
        method: Correction passed through to statsmodels' multipletests,
            e.g. 'bonferroni', 'fdr_bh' (Benjamini-Hochberg) or 'holm'.
        alpha: Error rate passed through to multipletests.

    Returns:
        DataFrame with one row per input p-value and the columns
        'original_p', 'corrected_p', 'significant' and 'method'.
    """
    outcome = multipletests(p_values, alpha=alpha, method=method)
    # Only the first two elements are used: reject flags and adjusted p-values.
    reject_flags = outcome[0]
    adjusted = outcome[1]

    columns = {
        'original_p': p_values,
        'corrected_p': adjusted,
        'significant': reject_flags,
        'method': method,
    }
    return pd.DataFrame(columns)

# Example: testing 5 metrics simultaneously
# p_values and metrics are parallel lists: entry i of each refers to the same metric.
p_values = [0.003, 0.012, 0.048, 0.15, 0.72]
metrics = ['Conversion Rate', 'Revenue/User', 'Bounce Rate', 'Session Duration', 'Page Views']

# Default method is 'fdr_bh' and default alpha is 0.05.
results = correct_multiple_tests(p_values)
# Label each row with its metric name for readable output.
results.index = metrics
print("Multiple Testing Correction (Benjamini-Hochberg FDR):")
print(results)