In [None]:
import numpy as np
from scipy import stats

def calculate_sample_size(p1, p2, power=0.8, alpha=0.05):
    """
    Returns the per-group sample size needed to detect p1 vs p2 (two-sided test)
    p1 (proportion for the first group, e.g. accuracy of the first method)
    p2 (proportion for the second group, e.g. accuracy of the second method)
    power (desired statistical power, i.e. 1 - P(Type II error))
    alpha (two-sided significance level, i.e. P(Type I error))

    n (output: required sample size for each group, rounded up to an integer)
    """
    # sum of the two Bernoulli variance terms p*(1-p)
    variance_sum = p1 * (1 - p1) + p2 * (1 - p2)

    # normal quantiles: alpha is split over both tails, power is one-sided
    z_total = stats.norm.ppf(1 - alpha/2) + stats.norm.ppf(power)

    # normal-approximation sample size formula for two proportions
    required = (z_total**2 * variance_sum) / (p1 - p2)**2

    # round up so the requested power is not undershot
    return int(np.ceil(required))

def min_detectable_difference(n, p1, power=0.8, alpha=0.05):
    """
    Approximates the smallest difference from p1 that is detectable with sample size n
    n (sample size of each group)
    p1 (baseline proportion, e.g. accuracy of the first classification method)
    power (statistical power 1 - probability of Type II error)
    alpha (significance level / probability of Type I error)

    diff (output: minimum detectable absolute difference, so p2 = p1 ± diff)

    Solving for p2 exactly is implicit because its variance depends on p2,
    so an approximation is used that reuses p1's variance for both groups:
        diff = sqrt((z_alpha + z_beta)^2 * (2*p1*(1-p1)) / n)
    The approximation assumes p1 and p2 aren't too different.
    """
    # critical values for given alpha (two-sided) and power
    z_alpha = stats.norm.ppf(1 - alpha/2)
    z_beta = stats.norm.ppf(power)

    # SE of the difference under the null hypothesis (both groups share p1)
    se_null = np.sqrt(2 * p1 * (1 - p1) / n)

    # then min detectable difference
    diff = (z_alpha + z_beta) * se_null

    return diff

def calculate_power(p1, p2, n, alpha=0.05):
    """
    Approximates the power of a two-sided test comparing two proportions
    p1 (proportion for the first group)
    p2 (proportion for the second group)
    n (sample size used in the standard error of the difference)
    alpha (two-sided significance level)

    power (output: normal-approximation probability of detecting |p2 - p1|)
    """
    # standard error of the difference, using each group's own variance
    variance_sum = p1 * (1 - p1) + p2 * (1 - p2)
    std_err = np.sqrt(variance_sum / n)

    # two-sided critical value for the chosen significance level
    critical = stats.norm.ppf(1 - alpha/2)

    # standardized effect shifted by the critical value, mapped through the normal CDF
    return stats.norm.cdf(abs(p2 - p1) / std_err - critical)

# Scenario from the Jabbar paper: previous best accuracy (87%) vs. the
# accuracy claimed for the new method (92%)
p1, p2 = 0.87, 0.92

# how many samples per group would be needed to detect that gap
n = calculate_sample_size(p1, p2)
print(f"Required sample size for each group: {n}")

# the Hyderabad dataset used in the paper has only 96 samples
actual_n = 96

# smallest difference that sample size can detect
mdd = min_detectable_difference(actual_n, p1)
print(f"Minimum detectable difference with n={actual_n}: {mdd:.4f}")
print(f"Can only reliably detect a difference larger than {mdd*100:.1f}% in accuracy")

# power actually available for the claimed improvement
power_5pct = calculate_power(p1, p2, actual_n)
print(f"Power to detect a 5% difference with n={actual_n}: {power_5pct:.4f}")
print(f"So, only a {power_5pct*100:.1f}% chance of detecting a true 5% improvement")

Required sample size for each group: 587
Minimum detectable difference with n=96: 0.1360
Can only reliably detect a difference larger than 13.6% in accuracy
Power to detect a 5% difference with n=96: 0.2044
So, only a 20.4% chance of detecting a true 5% improvement


This is a quick notebook that I put together for my critique of the paper "Alternating decision trees for early diagnosis of heart disease" (Jabbar, 2014), which was covered in a machine learning class. It examines the sample size and statistical power behind the claims presented in the paper.