In [None]:
# Install required dependencies
!pip install scipy pandas numpy -q

# Chapter 9 Code Verification Notebook

**Purpose**: Verify all code examples from `ch9_data_quality_and_health_checks.md` execute correctly and produce expected outputs.

**Code Blocks to Verify**:
1. SRM Detection Function (Section 1.2)
2. Invariant Checking Function (Section 3.3)

**Requirements**:
- scipy >= 1.7.0
- pandas >= 1.3.0
- numpy >= 1.21.0

## Setup: Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare, chi2_contingency

# Confirm the imports worked and record the library versions used for this run.
print(
    "Libraries imported successfully!",
    f"Pandas version: {pd.__version__}",
    f"NumPy version: {np.__version__}",
    sep="\n",
)

## Code Block 1: SRM Detection Function

**Source**: Section 1.2 - Implementation in the Data Pipeline  
**Lines**: ~103-147

This function checks for Sample Ratio Mismatch (SRM) using a Chi-squared goodness-of-fit test.

In [None]:
# Code Block 1: SRM Detection Function (from ch9 Section 1.2)

def check_for_srm(observed_counts: list, expected_ratios: list, alpha: float = 0.001) -> "tuple[bool, float]":
    """
    Performs a Chi-Squared Goodness of Fit test to detect Sample Ratio Mismatch.

    Args:
        observed_counts: A list of the observed user counts for each variant.
                         Example: [49400, 50600]
        expected_ratios: A list of the expected traffic ratios for each variant.
                         Example: [0.5, 0.5]
        alpha: The significance level for the test. A stricter alpha (like 0.001)
               is recommended for health checks to reduce false positives.

    Returns:
        A tuple containing:
        - A boolean indicating if SRM is detected (p-value < alpha).
        - The calculated p-value.

    Raises:
        ValueError: If the expected ratios do not sum to 1.0 (within a small
            floating-point tolerance), or if the two lists differ in length.
    """
    # Local import keeps this function self-contained when copied out of the notebook.
    import math

    # Compare with a tolerance: ratios such as [0.1] * 10 are valid but may not
    # add up to exactly 1.0 in binary floating point. fsum avoids accumulating
    # rounding error while summing.
    if not math.isclose(math.fsum(expected_ratios), 1.0, rel_tol=0.0, abs_tol=1e-9):
        raise ValueError("Expected ratios must sum to 1.0")
    if len(observed_counts) != len(expected_ratios):
        raise ValueError("Observed counts and expected ratios must have the same length.")

    total_observed = sum(observed_counts)
    expected_counts = [total_observed * ratio for ratio in expected_ratios]

    # chisquare returns the test statistic and the p-value; only the p-value is needed.
    _, p_value = chisquare(f_obs=observed_counts, f_exp=expected_counts)

    # Convert NumPy scalars to the built-in types promised by the signature.
    p_value = float(p_value)
    is_srm_detected = p_value < alpha

    return is_srm_detected, p_value

print("✓ SRM detection function defined successfully")

### Test Case 1.1: Example from Chapter (SRM Detected)

**Scenario**: The exact example from Section 1.1 of the chapter:
- 50/50 split expected
- Observed: 49,400 control, 50,600 treatment
- Expected result: SRM detected (p-value ≈ 0.00015)

In [None]:
# Test Case 1.1: Chapter example (SRM should be detected)
observed = [49400, 50600]
expected = [0.5, 0.5]

srm_detected, p_val = check_for_srm(observed, expected)

# Recompute the statistic by hand from the inputs (instead of a hard-coded
# expected count) so the printed value always matches `observed`/`expected`.
total_users = sum(observed)
expected_counts = [total_users * ratio for ratio in expected]
chi2_stat = sum(
    (obs_count - exp_count) ** 2 / exp_count
    for obs_count, exp_count in zip(observed, expected_counts)
)

print("Test Case 1.1: Chapter Example (Expected: SRM Detected)")
print(f"  Observed counts: {observed}")
print(f"  Expected ratio: {expected}")
print(f"  SRM Detected: {srm_detected}")
print(f"  P-value: {p_val:.6f}")
print(f"  Chi-squared statistic: {chi2_stat:.2f}")

# Validation
assert srm_detected, "SRM should be detected for this example"
assert p_val < 0.001, f"P-value should be < 0.001, got {p_val}"
assert abs(p_val - 0.00015) < 0.0001, f"P-value should be ~0.00015, got {p_val}"
print("✓ Test Case 1.1 PASSED\n")

### Test Case 1.2: No SRM (Balanced Split)

**Scenario**: Perfectly balanced 50/50 split
- Expected result: No SRM detected (p-value = 1.0)

In [None]:
# Test Case 1.2: an exactly even split must not raise the SRM alarm
observed, expected = [50000, 50000], [0.5, 0.5]
srm_detected, p_val = check_for_srm(observed, expected)

# Report the inputs and the outcome in one go.
print(
    "Test Case 1.2: Perfect Balance (Expected: No SRM)",
    f"  Observed counts: {observed}",
    f"  Expected ratio: {expected}",
    f"  SRM Detected: {srm_detected}",
    f"  P-value: {p_val:.6f}",
    sep="\n",
)

# Validation: zero deviation from the expected counts gives a p-value of exactly 1.0
assert not srm_detected, "SRM should NOT be detected for perfect balance"
assert p_val == 1.0, f"P-value should be 1.0 for perfect balance, got {p_val}"
print("✓ Test Case 1.2 PASSED\n")

### Test Case 1.3: Multi-Variant (3-way split)

**Scenario**: Three-way split (33.3/33.3/33.4) with a slight imbalance
- Expected counts: ≈ 33,333 / 33,333 / 33,334
- Observed counts: 33,100 / 33,500 / 33,400
- Expected result: No SRM detected (the deviations are small; p-value ≈ 0.27)

In [None]:
# Test Case 1.3: Three-way split
observed = [33100, 33500, 33400]
expected = [0.333333, 0.333333, 0.333334]

srm_detected, p_val = check_for_srm(observed, expected)

print("Test Case 1.3: Three-Way Split (Expected: Check behavior)")
print(f"  Observed counts: {observed}")
print(f"  Expected ratio: {expected}")
print(f"  SRM Detected: {srm_detected}")
print(f"  P-value: {p_val:.6f}")

# Validation: the deviations are small (chi-squared ≈ 2.6 on 2 degrees of
# freedom, p ≈ 0.27), so the check must stay quiet at the default alpha of 0.001.
assert 0.0 <= p_val <= 1.0, f"P-value must be a probability, got {p_val}"
assert not srm_detected, f"SRM should NOT be detected for this mild imbalance, got p={p_val}"
print(f"✓ Test Case 1.3 PASSED (p-value: {p_val:.6f})\n")

### Test Case 1.4: Unequal Split (70/30)

**Scenario**: An intentional 70/30 split that was executed correctly (observed 70,050 / 29,950)
- Expected result: No SRM detected

In [None]:
# Test Case 1.4: deliberately uneven allocation (70/30) that was executed correctly
expected = [0.7, 0.3]
observed = [70050, 29950]  # within 50 users of the ideal 70,000 / 30,000

srm_detected, p_val = check_for_srm(observed, expected)

# Report the inputs and the outcome in one go.
print(
    "Test Case 1.4: Unequal Split 70/30 (Expected: No SRM)",
    f"  Observed counts: {observed}",
    f"  Expected ratio: {expected}",
    f"  SRM Detected: {srm_detected}",
    f"  P-value: {p_val:.6f}",
    sep="\n",
)

# Validation
assert not srm_detected, "SRM should NOT be detected for properly executed 70/30 split"
print("✓ Test Case 1.4 PASSED\n")

### Test Case 1.5: Error Handling

**Scenario**: Test that function properly handles invalid inputs

In [None]:
# Test Case 1.5: Error handling
print("Test Case 1.5: Error Handling")

# Test 1.5a: Ratios don't sum to 1.0
try:
    check_for_srm([50000, 50000], [0.4, 0.5])  # Sum = 0.9
except ValueError as e:
    print(f"  ✓ Correctly raised ValueError: {e}")
else:
    # Fail loudly: a missing exception must not be reported as a pass.
    raise AssertionError("Should have raised ValueError for ratios not summing to 1.0")

# Test 1.5b: Mismatched lengths
try:
    check_for_srm([50000, 50000], [0.5, 0.3, 0.2])  # Different lengths
except ValueError as e:
    print(f"  ✓ Correctly raised ValueError: {e}")
else:
    # Fail loudly: a missing exception must not be reported as a pass.
    raise AssertionError("Should have raised ValueError for mismatched lengths")

print("✓ Test Case 1.5 PASSED\n")

## Code Block 2: Invariant Checking Function

**Source**: Section 3.3 - Implementation in the Data Pipeline  
**Lines**: ~275-301

This function checks whether invariant attributes are balanced across experiment groups using a Chi-squared test of independence.

In [None]:
# Code Block 2: Invariant Checking Function (from ch9 Section 3.3)

def check_invariants(df: pd.DataFrame, invariants: list, alpha: float = 0.001) -> dict:
    """
    Checks for imbalances in invariant distributions across experiment groups.

    For each invariant column a contingency table against the 'variant' column
    is built and a Chi-Squared test of independence is run on it. One status
    line per invariant is printed as a side effect.

    Args:
        df: A DataFrame with one row per user, containing columns for 'variant'
            and all invariant dimensions.
        invariants: A list of column names to check as invariants.
        alpha: The significance level for the test.

    Returns:
        A dict keyed by invariant column name. Each value is a dict with:
        - "imbalanced": True if the p-value is below alpha.
        - "p_value": The p-value of the test.
    """
    results = {}
    for invariant_col in invariants:
        # Create a contingency table (e.g., rows are device types, columns are variants)
        contingency_table = pd.crosstab(df[invariant_col], df['variant'])

        # Perform the Chi-Squared test of independence; only the p-value is used.
        _, p_value, _, _ = chi2_contingency(contingency_table)

        # Store built-in bool/float rather than NumPy scalars so that callers
        # can use identity checks (`is True`) and serialise the results.
        p_value = float(p_value)
        is_imbalanced = p_value < alpha
        results[invariant_col] = {"imbalanced": is_imbalanced, "p_value": p_value}

        if is_imbalanced:
            print(f"IMBALANCE DETECTED in '{invariant_col}'! p-value: {p_value:.6f}")
        else:
            print(f"'{invariant_col}' check passed. p-value: {p_value:.6f}")

    return results

print("✓ Invariant checking function defined successfully")

### Test Case 2.1: Chapter Example (Balanced Device Types)

**Scenario**: The exact contingency table from Section 3.2:
- Desktop: 40,100 control, 39,900 treatment
- Mobile: 9,900 control, 10,100 treatment
- Expected: No imbalance detected (very minor differences)

In [None]:
# Test Case 2.1: device mix from the chapter's contingency table (should be balanced)
print("Test Case 2.1: Chapter Example (Expected: No Imbalance)")

# One (variant, device, user count) entry per cell of the chapter's table;
# each entry is expanded into that many one-row-per-user records, in order.
cell_counts = [
    ('control', 'desktop', 40100),
    ('treatment', 'desktop', 39900),
    ('control', 'mobile', 9900),
    ('treatment', 'mobile', 10100),
]
data_2_1 = pd.DataFrame({
    'variant': [variant for variant, _device, n in cell_counts for _ in range(n)],
    'device_type': [device for _variant, device, n in cell_counts for _ in range(n)],
})

print(f"  Total users: {len(data_2_1):,}", "  Contingency table:", sep="\n")
ct = pd.crosstab(data_2_1['device_type'], data_2_1['variant'])
print(ct)

results = check_invariants(data_2_1, ['device_type'], alpha=0.001)

# Validation
assert not results['device_type']['imbalanced'], "Device type should be balanced in chapter example"
print(f"\n✓ Test Case 2.1 PASSED (p-value: {results['device_type']['p_value']:.6f})\n")

### Test Case 2.2: Severely Imbalanced Invariant

**Scenario**: Clear imbalance - treatment has disproportionately more mobile users
- Expected: Imbalance detected

In [None]:
# Test Case 2.2: device mix that differs sharply between the two groups
print("Test Case 2.2: Severe Imbalance (Expected: Imbalance Detected)")

# One (variant, device, user count) entry per contingency-table cell;
# treatment receives a far larger share of mobile users than control.
cell_counts = [
    ('control', 'desktop', 45000),
    ('treatment', 'desktop', 30000),
    ('control', 'mobile', 5000),
    ('treatment', 'mobile', 20000),
]
data_2_2 = pd.DataFrame({
    'variant': [variant for variant, _device, n in cell_counts for _ in range(n)],
    'device_type': [device for _variant, device, n in cell_counts for _ in range(n)],
})

print(f"  Total users: {len(data_2_2):,}", "  Contingency table:", sep="\n")
ct = pd.crosstab(data_2_2['device_type'], data_2_2['variant'])
print(ct)
print("  (Treatment has 40% mobile vs Control 10% mobile - clear imbalance)\n")

results = check_invariants(data_2_2, ['device_type'], alpha=0.001)

# Validation
assert results['device_type']['imbalanced'], "Device type should be imbalanced in this scenario"
assert results['device_type']['p_value'] < 0.001, "P-value should be very small"
print(f"\n✓ Test Case 2.2 PASSED (p-value: {results['device_type']['p_value']:.10f})\n")

### Test Case 2.3: Multiple Invariants

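**Scenario**: Check multiple invariants simultaneously (device, browser, country)
- Mix of balanced and imbalanced invariants: device type and browser are drawn from the same distribution in both groups, while country is deliberately skewed toward UK users in treatment
- Expected: Imbalance detected for country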

In [None]:
# Test Case 2.3: Multiple invariants
# Runs check_invariants over three columns at once. Device type and browser are
# sampled with the same probabilities in both groups; country is sampled with
# different probabilities in treatment, so it is the column expected to be flagged.
print("Test Case 2.3: Multiple Invariants")

# Seed NumPy's global RNG so the sampled data is reproducible.
# NOTE: the draws below consume the RNG stream in order, so reordering the
# np.random.choice calls would change the generated data.
np.random.seed(42)

# Create synthetic experiment data
n_control = 50000
n_treatment = 50000

# Rows are laid out as all control users followed by all treatment users;
# every column concatenates a control sample with a treatment sample.
data_2_3 = pd.DataFrame({
    'variant': ['control'] * n_control + ['treatment'] * n_treatment,
    # Same category probabilities for both groups
    'device_type': (
        np.random.choice(['desktop', 'mobile', 'tablet'], n_control, p=[0.7, 0.25, 0.05]).tolist() +
        np.random.choice(['desktop', 'mobile', 'tablet'], n_treatment, p=[0.7, 0.25, 0.05]).tolist()
    ),
    # Same category probabilities for both groups
    'browser': (
        np.random.choice(['chrome', 'firefox', 'safari'], n_control, p=[0.6, 0.25, 0.15]).tolist() +
        np.random.choice(['chrome', 'firefox', 'safari'], n_treatment, p=[0.6, 0.25, 0.15]).tolist()
    ),
    'country': (
        np.random.choice(['US', 'UK', 'CA'], n_control, p=[0.6, 0.25, 0.15]).tolist() +
        # Intentionally imbalance country - treatment has more UK users
        # (UK probability 0.35 instead of 0.25, US 0.5 instead of 0.6)
        np.random.choice(['US', 'UK', 'CA'], n_treatment, p=[0.5, 0.35, 0.15]).tolist()
    )
})

print(f"  Total users: {len(data_2_3):,}")
print("\n  Checking 3 invariants: device_type, browser, country\n")

results = check_invariants(data_2_3, ['device_type', 'browser', 'country'], alpha=0.001)

# Validation
# Print a one-line verdict per invariant for manual inspection.
print("\n  Summary:")
for inv, res in results.items():
    status = "IMBALANCED" if res['imbalanced'] else "BALANCED"
    print(f"    {inv}: {status} (p={res['p_value']:.6f})")

# Only the deliberately skewed column is asserted; the verdicts for device_type
# and browser are printed above but not checked.
assert results['country']['imbalanced'] == True, "Country should be imbalanced (intentional bias)"
print("\n✓ Test Case 2.3 PASSED\n")

### Test Case 2.4: Perfect Balance (All Categories Equal)

**Scenario**: Perfectly balanced data across all categories
- Expected: No imbalance detected (p-value = 1.0)

In [None]:
# Test Case 2.4: identical device mix in both groups
print("Test Case 2.4: Perfect Balance (Expected: No Imbalance)")

# One (variant, device, user count) entry per contingency-table cell;
# control and treatment have exactly the same count for each device type.
cell_counts = [
    ('control', 'desktop', 40000),
    ('treatment', 'desktop', 40000),
    ('control', 'mobile', 10000),
    ('treatment', 'mobile', 10000),
]
data_2_4 = pd.DataFrame({
    'variant': [variant for variant, _device, n in cell_counts for _ in range(n)],
    'device_type': [device for _variant, device, n in cell_counts for _ in range(n)],
})

print(f"  Total users: {len(data_2_4):,}", "  Contingency table:", sep="\n")
ct = pd.crosstab(data_2_4['device_type'], data_2_4['variant'])
print(ct)

results = check_invariants(data_2_4, ['device_type'], alpha=0.001)

# Validation
assert not results['device_type']['imbalanced'], "Should be perfectly balanced"
assert results['device_type']['p_value'] == 1.0, "P-value should be 1.0 for perfect balance"
print(f"\n✓ Test Case 2.4 PASSED (p-value: {results['device_type']['p_value']:.6f})\n")