# Chapter 2: The Statistical Engine of Experimentation

This notebook demonstrates the key statistical tests and calculations for A/B testing:

1. **Confidence Interval Calculation**: Functions to calculate CIs for both proportions and continuous metrics
2. **Sample Size Calculation**: Determining required sample sizes for both metric types  
3. **Z-test for Proportions**: Testing conversion rates and other binary metrics
4. **Welch's t-test**: Testing continuous metrics like revenue without assuming equal variances

## Setup: Install Required Packages

The following packages are required for this notebook. Uncomment and run if needed.

In [None]:
# Install required packages (uncomment if needed).
# %pip installs into the environment of the running kernel, which !pip does not guarantee.
# %pip install numpy scipy statsmodels

## Import Libraries

In [None]:
import numpy as np
from scipy import stats
from statsmodels.stats.proportion import proportions_ztest, proportion_confint
from statsmodels.stats.power import zt_ind_solve_power, tt_ind_solve_power
from statsmodels.stats.proportion import proportion_effectsize

## Section 2.3: Calculating Confidence Intervals for the Difference

This section demonstrates how to calculate confidence intervals for the difference between two groups, for both proportions and continuous metrics.

In [None]:
def calculate_ci_for_proportions(conversions_A, n_A, conversions_B, n_B, confidence=0.95):
    """
    Calculate confidence interval for the difference in proportions (p_B - p_A).

    The interval is the normal-approximation (Wald) interval built from the
    unpooled standard error, i.e. each group's variance is estimated from its
    own observed rate.

    Args:
        conversions_A: Number of conversions in control group
        n_A: Total users in control group
        conversions_B: Number of conversions in treatment group
        n_B: Total users in treatment group
        confidence: Confidence level, strictly between 0 and 1
            (default 0.95 for 95% CI)

    Returns:
        tuple: (difference, lower_bound, upper_bound)

    Raises:
        ValueError: If confidence is not strictly between 0 and 1, if a group
            size is not positive, or if a conversion count is negative or
            larger than its group size.
    """
    # Reject inputs that would otherwise divide by zero or silently produce
    # NaN bounds. np.all / np.any keep the checks valid for array-like inputs.
    confidence_arr = np.asarray(confidence)
    if not np.all((confidence_arr > 0) & (confidence_arr < 1)):
        raise ValueError("confidence must be strictly between 0 and 1")
    for label, conversions, n in (("A", conversions_A, n_A), ("B", conversions_B, n_B)):
        conversions_arr = np.asarray(conversions)
        n_arr = np.asarray(n)
        if np.any(n_arr <= 0):
            raise ValueError(f"n_{label} must be positive")
        if np.any((conversions_arr < 0) | (conversions_arr > n_arr)):
            raise ValueError(f"conversions_{label} must be between 0 and n_{label}")

    p_A = conversions_A / n_A
    p_B = conversions_B / n_B
    diff = p_B - p_A

    # Standard error of the difference (unpooled: the two variances add)
    se_diff = np.sqrt((p_A * (1 - p_A) / n_A) + (p_B * (1 - p_B) / n_B))

    # Two-sided critical value for the confidence level
    alpha = 1 - confidence
    z_critical = stats.norm.ppf(1 - alpha/2)

    # Confidence interval, symmetric around the observed difference
    margin_of_error = z_critical * se_diff
    lower = diff - margin_of_error
    upper = diff + margin_of_error

    return diff, lower, upper

def calculate_ci_for_means(data_A, data_B, confidence=0.95):
    """
    Calculate confidence interval for the difference in means (mean_B - mean_A).
    Uses Welch's approximation for unequal variances.

    Args:
        data_A: Array of values from control group (at least 2 observations)
        data_B: Array of values from treatment group (at least 2 observations)
        confidence: Confidence level, strictly between 0 and 1
            (default 0.95 for 95% CI)

    Returns:
        tuple: (difference, lower_bound, upper_bound). If both samples have
        zero variance the interval has zero width and both bounds equal the
        difference.

    Raises:
        ValueError: If confidence is not strictly between 0 and 1, or if
            either group has fewer than 2 observations (the sample variance
            is undefined in that case).
    """
    if not 0 < confidence < 1:
        raise ValueError("confidence must be strictly between 0 and 1")

    # Work on flat float arrays so the mean, variance and sample size are all
    # computed over the same set of observations.
    sample_A = np.asarray(data_A, dtype=float).ravel()
    sample_B = np.asarray(data_B, dtype=float).ravel()
    n_A = sample_A.size
    n_B = sample_B.size
    if n_A < 2 or n_B < 2:
        raise ValueError("each group needs at least 2 observations")

    diff = sample_B.mean() - sample_A.mean()

    # Squared standard error of each group mean (sample variance / n)
    sq_se_A = sample_A.var(ddof=1) / n_A
    sq_se_B = sample_B.var(ddof=1) / n_B
    se_diff = np.sqrt(sq_se_A + sq_se_B)

    # Both samples constant: the Welch-Satterthwaite ratio would be 0/0, so
    # return the zero-width interval directly instead of NaN bounds.
    if se_diff == 0:
        return diff, diff, diff

    # Degrees of freedom (Welch-Satterthwaite equation)
    df = (sq_se_A + sq_se_B)**2 / (sq_se_A**2/(n_A-1) + sq_se_B**2/(n_B-1))

    # Two-sided critical value from the t-distribution
    alpha = 1 - confidence
    t_critical = stats.t.ppf(1 - alpha/2, df)

    # Confidence interval, symmetric around the observed difference
    margin_of_error = t_critical * se_diff
    lower = diff - margin_of_error
    upper = diff + margin_of_error

    return diff, lower, upper

# Example 1: Proportions (Conversion Rate)
# Binary metric: each user either converts or does not.
print("=== Confidence Interval for Conversion Rate Difference ===")
conversions_A, n_A = 1200, 10000  # Control: conversions, users
conversions_B, n_B = 1350, 10000  # Treatment: conversions, users

diff, lower, upper = calculate_ci_for_proportions(
    conversions_A, n_A, conversions_B, n_B
)

for summary_line in (
    f"Control conversion rate: {conversions_A/n_A:.2%}",
    f"Treatment conversion rate: {conversions_B/n_B:.2%}",
    f"Difference: {diff:.2%}",
    f"95% CI for difference: [{lower:.2%}, {upper:.2%}]",
):
    print(summary_line)

# An interval lying entirely on one side of zero fixes the sign of the effect;
# an interval that straddles zero does not.
if upper < 0:
    verdict = "Interpretation: We are 95% confident the treatment DECREASES conversion."
elif lower > 0:
    verdict = "Interpretation: We are 95% confident the treatment INCREASES conversion."
else:
    verdict = "Interpretation: CI includes zero - effect is not statistically significant."
print(verdict)

# Example 2: Continuous Metrics (Revenue)
# Simulated per-user revenue; the fixed seed makes the draws reproducible.
print("\n=== Confidence Interval for Revenue Difference ===")
np.random.seed(42)
revenue_A = np.random.normal(loc=25.50, scale=45.20, size=5000)  # Control
revenue_B = np.random.normal(loc=28.80, scale=47.10, size=5000)  # Treatment

diff, lower, upper = calculate_ci_for_means(revenue_A, revenue_B)

mean_revenue_A = np.mean(revenue_A)
mean_revenue_B = np.mean(revenue_B)

for summary_line in (
    f"Control mean revenue: ${mean_revenue_A:.2f}",
    f"Treatment mean revenue: ${mean_revenue_B:.2f}",
    f"Difference: ${diff:.2f}",
    f"95% CI for difference: [${lower:.2f}, ${upper:.2f}]",
    f"Relative lift: {diff/mean_revenue_A:.1%}",
):
    print(summary_line)

# Same reading of the interval as for the conversion-rate example.
if upper < 0:
    verdict = "Interpretation: We are 95% confident the treatment DECREASES revenue."
elif lower > 0:
    verdict = "Interpretation: We are 95% confident the treatment INCREASES revenue."
else:
    verdict = "Interpretation: CI includes zero - effect is not statistically significant."
print(verdict)

## Section 3: Sample Size Calculation

This section demonstrates how to calculate required sample sizes for both proportion tests and continuous metric tests using `statsmodels.stats.power`.

In [None]:
# Example 1: Sample size for proportions (conversion rate test)
# Baseline conversion rate: 12%
# Minimum detectable effect: 1.5 percentage points (12% -> 13.5%)
# Significance level: 0.05 (two-sided)
# Power: 0.80

baseline_rate = 0.12
treatment_rate = 0.135
alpha = 0.05
power = 0.80

# Calculate effect size (Cohen's h for proportions).
# proportion_effectsize(prop1, prop2) is signed as prop1 - prop2 on the
# arcsine scale, so the treatment rate goes first: an uplift is then reported
# as a positive h. The required sample size of a two-sided test does not
# depend on the sign.
effect_size = proportion_effectsize(treatment_rate, baseline_rate)

# Calculate required sample size per group
sample_size_prop = zt_ind_solve_power(
    effect_size=effect_size,
    alpha=alpha,
    power=power,
    ratio=1.0,  # Equal group sizes
    alternative='two-sided'
)

# Round the per-group size up to whole users first; with equal groups the
# total is exactly twice that. Rounding 2 * n instead can come out one user
# short of two full groups.
n_per_group_prop = int(np.ceil(sample_size_prop))
n_total_prop = 2 * n_per_group_prop

print("=== Sample Size for Proportions Test ===")
print(f"Baseline rate: {baseline_rate:.1%}")
print(f"Target rate: {treatment_rate:.1%}")
print(f"Minimum detectable effect: {(treatment_rate - baseline_rate):.1%}")
print(f"Effect size (Cohen's h): {effect_size:.4f}")
print(f"Required sample size per group: {n_per_group_prop:,}")
print(f"Total sample size: {n_total_prop:,}\n")

# Example 2: Sample size for continuous metrics (ARPU test)
# Baseline mean: $25
# Minimum detectable effect: $3 (12% lift)
# Pooled standard deviation: $45
# Significance level: 0.05 (two-sided)
# Power: 0.80

baseline_mean = 25.0
mde = 3.0  # Minimum detectable effect
pooled_std = 45.0

# Calculate standardized effect size (Cohen's d): the detectable difference
# expressed in units of the pooled standard deviation
cohens_d = mde / pooled_std

# Calculate required sample size per group
sample_size_cont = tt_ind_solve_power(
    effect_size=cohens_d,
    alpha=alpha,
    power=power,
    ratio=1.0,  # Equal group sizes
    alternative='two-sided'
)

# Same rounding rule as above: whole users per group, then double.
n_per_group_cont = int(np.ceil(sample_size_cont))
n_total_cont = 2 * n_per_group_cont

print("=== Sample Size for Continuous Metrics Test ===")
print(f"Baseline mean: ${baseline_mean:.2f}")
print(f"Minimum detectable effect: ${mde:.2f}")
print(f"Pooled standard deviation: ${pooled_std:.2f}")
print(f"Effect size (Cohen's d): {cohens_d:.4f}")
print(f"Required sample size per group: {n_per_group_cont:,}")
print(f"Total sample size: {n_total_cont:,}")

## Section 4.1: Z-test for Proportions

This example demonstrates how to test the difference in conversion rates between a control and treatment group using a two-sample Z-test for proportions.

In [None]:
# Example: Testing conversion rate between control and treatment
# Control group: 1,200 conversions out of 10,000 users (12% conversion)
# Treatment group: 1,350 conversions out of 10,000 users (13.5% conversion)

conversions = np.array([1200, 1350])  # Number of conversions: [control, treatment]
sample_sizes = np.array([10000, 10000])  # Total users in each group

# Perform two-sided Z-test.
# The statistic is based on (first - second) = (control - treatment), so a
# negative z means the treatment rate is the higher one.
z_stat, p_value = proportions_ztest(conversions, sample_sizes, alternative='two-sided')

print(f"Z-statistic: {z_stat:.4f}")
print(f"P-value: {p_value:.4f}")

# 95% confidence interval for each group's own conversion rate.
# These describe each rate separately; checking whether they overlap is NOT
# equivalent to testing the difference.
ci_control = proportion_confint(conversions[0], sample_sizes[0], alpha=0.05, method='normal')
ci_treatment = proportion_confint(conversions[1], sample_sizes[1], alpha=0.05, method='normal')

print(f"\nControl conversion rate: {conversions[0]/sample_sizes[0]:.4f}")
print(f"95% CI for control: [{ci_control[0]:.4f}, {ci_control[1]:.4f}]")
print(f"\nTreatment conversion rate: {conversions[1]/sample_sizes[1]:.4f}")
print(f"95% CI for treatment: [{ci_treatment[0]:.4f}, {ci_treatment[1]:.4f}]")

# 95% confidence interval for the difference itself (treatment - control),
# using the helper defined in Section 2.3.
rate_diff, rate_diff_lower, rate_diff_upper = calculate_ci_for_proportions(
    conversions[0], sample_sizes[0], conversions[1], sample_sizes[1], confidence=0.95
)
print(f"\nDifference (treatment - control): {rate_diff:.4f}")
print(f"95% CI for difference: [{rate_diff_lower:.4f}, {rate_diff_upper:.4f}]")

# Interpretation
if p_value < 0.05:
    print("\nResult: Statistically significant (p < 0.05)")
    print("We reject the null hypothesis. The treatment has a significant effect.")
else:
    print("\nResult: Not statistically significant (p >= 0.05)")
    print("We fail to reject the null hypothesis.")

## Section 4.2: Welch's t-test for Continuous Metrics

This example demonstrates how to test the difference in average revenue per user (ARPU) between control and treatment groups using Welch's t-test.

In [None]:
# Example: Testing average revenue per user (ARPU) between control and treatment
# Control group: Mean = $25.50, Std = $45.20, n = 5,000
# Treatment group: Mean = $28.80, Std = $47.10, n = 5,000

# Simulated stand-in for real experiment data. The seed is fixed and control is
# drawn before treatment, so the two samples are reproducible.
np.random.seed(42)
control_revenue = np.random.normal(25.50, 45.20, 5000)
treatment_revenue = np.random.normal(28.80, 47.10, 5000)

# Welch's t-test: equal_var=False drops the equal-variance assumption
t_stat, p_value = stats.ttest_ind(
    control_revenue,
    treatment_revenue,
    equal_var=False,
)

# Per-group point estimates and their standard errors
mean_control = control_revenue.mean()
mean_treatment = treatment_revenue.mean()
se_control = stats.sem(control_revenue)
se_treatment = stats.sem(treatment_revenue)

# 95% t-based confidence interval for each group's own mean (n - 1 degrees of freedom)
ci_control = stats.t.interval(
    0.95, control_revenue.size - 1, loc=mean_control, scale=se_control
)
ci_treatment = stats.t.interval(
    0.95, treatment_revenue.size - 1, loc=mean_treatment, scale=se_treatment
)

for report_line in (
    f"Control group mean ARPU: ${mean_control:.2f}",
    f"95% CI: [${ci_control[0]:.2f}, ${ci_control[1]:.2f}]",
    f"\nTreatment group mean ARPU: ${mean_treatment:.2f}",
    f"95% CI: [${ci_treatment[0]:.2f}, ${ci_treatment[1]:.2f}]",
    f"\nt-statistic: {t_stat:.4f}",
    f"P-value: {p_value:.4f}",
):
    print(report_line)

# Relative lift of treatment over control, in percent
lift = (mean_treatment - mean_control) / mean_control * 100
print(f"\nLift: {lift:.2f}%")

# Interpretation at the 5% significance level
if not (p_value < 0.05):
    print(f"\nResult: Not statistically significant (p >= 0.05)")
    print("We fail to reject the null hypothesis.")
else:
    print(f"\nResult: Statistically significant (p < 0.05)")
    print("We reject the null hypothesis. The treatment has a significant effect on ARPU.")