# Confidence Intervals & Hypothesis Testing

**Module: Descriptive & Inferential Statistics**

## Learning Objectives
- Construct and interpret confidence intervals
- Set up and conduct hypothesis tests
- Interpret p-values and make decisions
- Calculate and interpret effect sizes

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

np.random.seed(42)

---
## Quick Refresher

### Confidence Intervals
A range of values likely to contain the true population parameter.

**For a mean (when σ unknown):**
$$\bar{x} \pm t_{\alpha/2,\,n-1} \cdot \frac{s}{\sqrt{n}}$$

- 95% CI: We're 95% confident the true mean lies in this interval — that is, about 95% of intervals built this way from repeated samples would contain the true mean
- Wider CI = more uncertainty, narrower = more precision

### Hypothesis Testing
| Term | Meaning |
|------|--------|
| **H₀ (null)** | No effect/difference (status quo) |
| **H₁ (alternative)** | There is an effect/difference |
| **p-value** | Probability of seeing results at least this extreme if H₀ is true |
| **α (alpha)** | Significance level, typically 0.05 |
| **Decision** | If p < α, reject H₀; otherwise, fail to reject H₀ |

### Common Tests
| Scenario | Test |
|----------|------|
| One sample mean vs. known value | One-sample t-test |
| Two independent group means | Two-sample t-test |
| Same group, before/after | Paired t-test |
| Proportions | z-test for proportions |

---
## Working Example: Confidence Intervals

In [None]:
# Customer satisfaction scores for the sampled respondents
satisfaction = np.array([
    4.2, 3.8, 4.5, 4.1, 3.9,
    4.3, 4.0, 3.7, 4.4, 4.2,
    3.6, 4.1, 4.3, 3.9, 4.0,
    4.2, 3.8, 4.1, 4.4, 3.9,
])

# Summary statistics of the sample
n = len(satisfaction)
mean = np.mean(satisfaction)
std = np.std(satisfaction, ddof=1)  # ddof=1 -> sample (n - 1) standard deviation
se = std / np.sqrt(n)  # standard error of the mean

print(
    f"Sample size: {n}",
    f"Sample mean: {mean:.3f}",
    f"Sample std: {std:.3f}",
    f"Standard error: {se:.3f}",
    sep="\n",
)

In [None]:
# Build the 95% confidence interval by hand from the t-distribution
confidence = 0.95
alpha = 1 - confidence
df = n - 1  # degrees of freedom for a one-sample mean

# Two-sided critical value: alpha is split evenly between the two tails
t_crit = stats.t.ppf(1 - alpha/2, df)
print(f"t critical value: {t_crit:.3f}")

# Half-width of the interval (margin of error)
margin = t_crit * se

# Lower and upper bounds, centred on the sample mean
ci_lower, ci_upper = mean - margin, mean + margin

print(f"\n95% Confidence Interval: ({ci_lower:.3f}, {ci_upper:.3f})")

In [None]:
# Shortcut: scipy.stats builds the same interval in a single call.
# The confidence level is passed positionally because its keyword was renamed
# (from `alpha` to `confidence`) in newer SciPy releases; the positional form
# works either way and matches the second call below.
ci = stats.t.interval(0.95, df=n-1, loc=mean, scale=se)
print(f"95% CI (scipy): ({ci[0]:.3f}, {ci[1]:.3f})")

# Same interval, with the standard error of the mean computed by stats.sem
sem = stats.sem(satisfaction)
ci_alt = stats.t.interval(0.95, df=n-1, loc=mean, scale=sem)
print(f"95% CI (using sem): ({ci_alt[0]:.3f}, {ci_alt[1]:.3f})")

---
## Working Example: Hypothesis Testing

In [None]:
# One-sample t-test against the satisfaction target
# H₀: μ = 4.0 (mean satisfaction equals the target)
# H₁: μ ≠ 4.0

target = 4.0
t_stat, p_value = stats.ttest_1samp(satisfaction, target)

# Decide at the 5% significance level
if p_value < 0.05:
    decision = 'Reject H₀'
else:
    decision = 'Fail to reject H₀'

print(f"H₀: Population mean = {target}")
print(f"H₁: Population mean ≠ {target}")
print(f"\nt-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.4f}")
print(f"\nConclusion at α=0.05: {decision}")

In [None]:
# Independent two-sample t-test
# Do satisfaction scores differ between two store locations?

store_a = np.array([
    4.2, 4.5, 3.9, 4.1, 4.3,
    4.0, 4.4, 3.8, 4.2, 4.1,
    4.3, 3.9, 4.2, 4.0, 4.1,
])
store_b = np.array([
    3.8, 3.5, 4.0, 3.7, 3.9,
    3.6, 3.8, 4.1, 3.7, 3.5,
    3.9, 3.6, 3.8, 3.7, 4.0,
])

# Per-store sample mean and sample size
for label, scores in (("A", store_a), ("B", store_b)):
    print(f"Store {label}: mean = {scores.mean():.3f}, n = {len(scores)}")

# Compare the means of the two unrelated samples
result = stats.ttest_ind(store_a, store_b)
t_stat, p_value = result.statistic, result.pvalue

verdict = 'Significant difference' if p_value < 0.05 else 'No significant difference'
print(f"\nt-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.4f}")
print(f"\nConclusion: {verdict}")

In [None]:
# Paired (dependent-samples) t-test
# Ten paired productivity scores, measured before and after training

before = np.array([72, 65, 80, 68, 75, 70, 78, 62, 74, 69])
after = np.array([78, 70, 82, 75, 80, 74, 81, 68, 79, 75])
gain = after - before  # element-wise change for each pair

print(f"Before: mean = {before.mean():.1f}")
print(f"After: mean = {after.mean():.1f}")
print(f"Average improvement: {gain.mean():.1f}")

# `after` is passed first, so a positive t-statistic means scores went up
paired_result = stats.ttest_rel(after, before)
t_stat, p_value = paired_result.statistic, paired_result.pvalue

if p_value < 0.05:
    outcome = 'Training had significant effect'
else:
    outcome = 'No significant effect'

print(f"\nt-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.4f}")
print(f"\nConclusion: {outcome}")

### Effect Size: Cohen's d

In [None]:
def cohens_d(group1, group2):
    """Calculate Cohen's d for two independent groups.

    The effect size is the difference between the two sample means divided
    by the pooled sample standard deviation.

    Parameters
    ----------
    group1, group2 : array-like
        One-dimensional samples. Anything ``np.asarray`` can convert to a
        numeric array is accepted (NumPy arrays, lists, tuples, ...).

    Returns
    -------
    float
        ``(mean(group1) - mean(group2)) / pooled_std``; positive when
        ``group1`` has the larger mean.
    """
    # Coerce to float arrays so plain sequences work, not only ndarrays
    group1 = np.asarray(group1, dtype=float)
    group2 = np.asarray(group2, dtype=float)

    n1, n2 = len(group1), len(group2)
    var1, var2 = group1.var(ddof=1), group2.var(ddof=1)

    # Pooled standard deviation: sample variances weighted by their
    # degrees of freedom
    pooled_std = np.sqrt(((n1 - 1) * var1 + (n2 - 1) * var2) / (n1 + n2 - 2))

    return (group1.mean() - group2.mean()) / pooled_std

d = cohens_d(store_a, store_b)
print(f"Cohen's d: {d:.3f}")

# Rule-of-thumb benchmarks for reading |d|
print("\nInterpretation:")
print("  |d| < 0.2: negligible")
print("  |d| ~ 0.2: small")
print("  |d| ~ 0.5: medium")
print("  |d| ~ 0.8+: large")

# Classify by the largest threshold that |d| reaches
magnitude = abs(d)
if magnitude >= 0.8:
    effect_label = 'large'
elif magnitude >= 0.5:
    effect_label = 'medium'
elif magnitude >= 0.2:
    effect_label = 'small'
else:
    effect_label = 'negligible'
print(f"\nThis effect is: {effect_label}")

---
## Exercises

### Exercise 1: Confidence Intervals

In [None]:
# Wait times, in minutes, recorded for customers at a service center
wait_times = np.array([
    5.2, 7.1, 4.8, 6.3, 8.2,
    5.5, 6.8, 4.2, 7.5, 6.1,
    5.8, 7.3, 6.5, 4.9, 5.7,
    8.0, 6.2, 5.4, 7.0, 6.6,
    5.3, 6.9, 7.4, 5.1, 6.4,
])

# TODO: Calculate the 95% confidence interval for mean wait time



In [None]:
# TODO: Calculate a 90% confidence interval. Is it wider or narrower than 95%?



In [None]:
# TODO: The target is to keep average wait time under 6 minutes.
# Based on the 95% CI, can you confidently say you're meeting this target?



### Exercise 2: One-Sample Tests

In [None]:
# Scenario: the manufacturer claims an average battery life of 500 hours.
# Measured lifetimes for the 30 batteries you tested:

battery_life = np.array([
    485, 512, 478, 495, 520, 488,
    502, 475, 498, 510, 492, 505,
    480, 515, 490, 508, 483, 500,
    495, 518, 487, 503, 477, 512,
    493, 506, 481, 499, 515, 489,
])

# TODO: Test the manufacturer's claim at α = 0.05
# H₀: μ = 500
# H₁: μ ≠ 500 (two-tailed)



In [None]:
# TODO: Now test if batteries last LESS than 500 hours (one-tailed)
# H₀: μ ≥ 500
# H₁: μ < 500



### Exercise 3: Two-Sample Comparison

In [None]:
# Sales-team performance scores, recorded after completing one of two
# different training programs

program_a = np.array([
    82, 78, 85, 80, 83,
    79, 86, 81, 77, 84,
    80, 82, 78, 85, 81,
    83, 79, 86, 82, 80,
])

program_b = np.array([
    88, 85, 90, 87, 92,
    86, 89, 84, 91, 88,
    87, 90, 85, 93, 89,
    86, 91, 88, 87, 90,
])

# TODO: Test if there's a significant difference between programs at α = 0.05



In [None]:
# TODO: Calculate Cohen's d effect size. How meaningful is the difference?



In [None]:
# TODO: Calculate 95% CI for the difference in means
# Hint: calculate (mean_diff) ± t_crit * SE_diff, where SE_diff is built from the
# pooled variance and t_crit uses n1 + n2 - 2 degrees of freedom (the same
# equal-variance model as stats.ttest_ind with equal_var=True)



### Exercise 4: Paired Comparison

In [None]:
# Page load times (seconds) for the same 15 pages, measured once before
# and once after the optimization

load_before = [3.2, 4.1, 2.8, 5.2, 3.8,
               4.5, 3.1, 4.8, 3.5, 4.2,
               3.9, 2.9, 4.6, 3.3, 4.0]
load_after = [2.8, 3.5, 2.5, 4.1, 3.2,
              3.8, 2.7, 4.0, 3.0, 3.5,
              3.3, 2.5, 3.9, 2.9, 3.4]

# One row per page; page ids run from 1 to 15
pages = pd.DataFrame({
    'page_id': range(1, 16),
    'before': load_before,
    'after': load_after,
})

pages.head()

In [None]:
# TODO: Calculate the improvement for each page
# What's the mean improvement?



In [None]:
# TODO: Perform a paired t-test to see if the optimization significantly reduced load times
# H₀: μ_diff = 0 (no change)
# H₁: μ_diff > 0 (load times decreased, meaning before > after)



In [None]:
# TODO: Calculate 95% CI for the mean improvement



### Exercise 5: Proportion Test

In [None]:
# Email campaign results
# Baseline: the historical click-through rate is 2.5%
# New campaign: 2000 emails sent, 58 of them clicked

historical_rate = 0.025
total = 2000
clicks = 58

# TODO: Calculate the sample proportion



In [None]:
# TODO: Test if the new campaign has a different CTR than historical
# Use a z-test for proportion
# z = (p_hat - p0) / sqrt(p0 * (1-p0) / n)



In [None]:
# TODO: Calculate 95% CI for the true click-through rate
# CI = p_hat ± z_crit * sqrt(p_hat * (1 - p_hat) / n)



---
## Solutions

In [None]:
# Exercise 1 Solutions

n = len(wait_times)
mean = wait_times.mean()
sem = stats.sem(wait_times)  # standard error of the mean

# 95% CI
ci_95 = stats.t.interval(0.95, df=n-1, loc=mean, scale=sem)
print(f"95% CI: ({ci_95[0]:.3f}, {ci_95[1]:.3f})")

# 90% CI
ci_90 = stats.t.interval(0.90, df=n-1, loc=mean, scale=sem)
print(f"90% CI: ({ci_90[0]:.3f}, {ci_90[1]:.3f})")
print("90% CI is narrower (less confidence = narrower interval)")

# Target assessment
# "Under 6 minutes" is only supported when the whole interval lies below 6,
# so the bound that drives the decision (and is reported) is the upper one.
print(f"\nSample mean: {mean:.3f}")
print(f"95% CI upper bound: {ci_95[1]:.3f}")
if ci_95[1] < 6:
    print("Yes, can confidently say average wait time is under 6 minutes")
else:
    print("No, cannot confidently say average is under 6 minutes (CI includes values ≥ 6)")

In [None]:
# Exercise 2 Solutions

# Two-tailed test of the claimed 500-hour mean
t_stat, p_value = stats.ttest_1samp(battery_life, 500)
two_tailed_verdict = 'Reject H₀' if p_value < 0.05 else 'Fail to reject H₀'

print("Two-tailed test:")
print(f"Sample mean: {battery_life.mean():.2f}")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.4f}")
print(f"Conclusion: {two_tailed_verdict}")

# One-tailed test (less than): halve the two-tailed p-value when the
# t-statistic is negative (the H₁ side), otherwise take the complement
if t_stat < 0:
    p_one_tail = p_value / 2
else:
    p_one_tail = 1 - p_value / 2

if p_one_tail < 0.05:
    one_tailed_verdict = 'Evidence batteries last less than 500 hours'
else:
    one_tailed_verdict = 'No evidence batteries last less than 500 hours'

print("\nOne-tailed test (μ < 500):")
print(f"p-value: {p_one_tail:.4f}")
print(f"Conclusion: {one_tailed_verdict}")

In [None]:
# Exercise 3 Solutions

# Independent two-sample t-test: program A vs. program B
test_result = stats.ttest_ind(program_a, program_b)
t_stat, p_value = test_result.statistic, test_result.pvalue

if p_value < 0.05:
    verdict = 'Significant difference'
else:
    verdict = 'No significant difference'

print(f"Program A mean: {program_a.mean():.2f}")
print(f"Program B mean: {program_b.mean():.2f}")
print(f"t-statistic: {t_stat:.3f}")
print(f"p-value: {p_value:.6f}")
print(f"Conclusion: {verdict}")

# Cohen's d with program B passed first, i.e. mean(B) - mean(A) in the numerator
d = cohens_d(program_b, program_a)
print(f"\nCohen's d: {d:.3f} (large effect)")

In [None]:
# CI for difference in means (equal-variance / pooled form)
mean_diff = program_b.mean() - program_a.mean()
n1, n2 = len(program_a), len(program_b)
df = n1 + n2 - 2

# The n1 + n2 - 2 degrees of freedom are only valid together with a standard
# error built from the pooled variance (each sample variance weighted by its
# own degrees of freedom), so compute the SE that way rather than unpooled.
pooled_var = ((n1 - 1) * program_a.var(ddof=1)
              + (n2 - 1) * program_b.var(ddof=1)) / df
se_diff = np.sqrt(pooled_var * (1/n1 + 1/n2))

# Two-sided 95% critical value
t_crit = stats.t.ppf(0.975, df)

ci_diff = (mean_diff - t_crit * se_diff, mean_diff + t_crit * se_diff)
print(f"95% CI for difference: ({ci_diff[0]:.2f}, {ci_diff[1]:.2f})")

In [None]:
# Exercise 4 Solutions

# Positive improvement = the page got faster (before > after)
pages['improvement'] = pages['before'] - pages['after']
print(f"Mean improvement: {pages['improvement'].mean():.3f} seconds")

# Paired t-test
t_stat, p_value = stats.ttest_rel(pages['before'], pages['after'])
# One-tailed p-value for H₁: before > after. Halving the two-tailed p-value is
# only valid when t falls on the H₁ side (t > 0); otherwise the one-tailed
# p-value is the complement.
p_one_tail = p_value / 2 if t_stat > 0 else 1 - p_value / 2

print(f"\nt-statistic: {t_stat:.3f}")
print(f"p-value (one-tailed): {p_one_tail:.6f}")
print(f"Conclusion: {'Optimization significantly reduced load times' if p_one_tail < 0.05 else 'No significant reduction'}")

# CI for improvement: t interval around the mean of the per-page differences
ci_imp = stats.t.interval(0.95, df=len(pages)-1,
                          loc=pages['improvement'].mean(),
                          scale=stats.sem(pages['improvement']))
print(f"\n95% CI for mean improvement: ({ci_imp[0]:.3f}, {ci_imp[1]:.3f}) seconds")

In [None]:
# Exercise 5 Solutions

# Sample proportion
p_hat = clicks / total
print(f"Sample proportion: {p_hat:.4f} ({p_hat*100:.2f}%)")

# Z-test for proportion: the standard error uses the null value (historical
# rate), because the test statistic is evaluated assuming H₀ is true
z_stat = (p_hat - historical_rate) / np.sqrt(historical_rate * (1 - historical_rate) / total)
# Two-tailed p-value. The survival function sf(x) equals 1 - cdf(x) but avoids
# the precision loss of subtracting from 1 when |z| is large.
p_value = 2 * stats.norm.sf(abs(z_stat))

print(f"\nZ-statistic: {z_stat:.3f}")
print(f"p-value: {p_value:.4f}")
print(f"Conclusion: {'CTR is different from historical' if p_value < 0.05 else 'No significant difference from historical'}")

# CI for proportion: here the standard error uses the observed p_hat,
# since the interval estimates the true rate rather than testing H₀
z_crit = stats.norm.ppf(0.975)
margin = z_crit * np.sqrt(p_hat * (1 - p_hat) / total)
ci_prop = (p_hat - margin, p_hat + margin)
print(f"\n95% CI for CTR: ({ci_prop[0]*100:.2f}%, {ci_prop[1]*100:.2f}%)")